Diffstat (limited to 'third_party/webrender/swgl/src')
-rw-r--r--  third_party/webrender/swgl/src/blend.h          864
-rw-r--r--  third_party/webrender/swgl/src/composite.h     1069
-rw-r--r--  third_party/webrender/swgl/src/gl.cc           3164
-rw-r--r--  third_party/webrender/swgl/src/gl_defs.h         42
-rw-r--r--  third_party/webrender/swgl/src/glsl.h          1308
-rw-r--r--  third_party/webrender/swgl/src/lib.rs             2
-rw-r--r--  third_party/webrender/swgl/src/program.h         82
-rw-r--r--  third_party/webrender/swgl/src/rasterize.h     1670
-rw-r--r--  third_party/webrender/swgl/src/swgl_ext.h      1826
-rw-r--r--  third_party/webrender/swgl/src/swgl_fns.rs      513
-rw-r--r--  third_party/webrender/swgl/src/texture.h       1162
-rw-r--r--  third_party/webrender/swgl/src/vector_type.h     87
12 files changed, 3223 insertions(+), 8566 deletions(-)
diff --git a/third_party/webrender/swgl/src/blend.h b/third_party/webrender/swgl/src/blend.h
deleted file mode 100644
index 8bc1c93994e..00000000000
--- a/third_party/webrender/swgl/src/blend.h
+++ /dev/null
@@ -1,864 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) {
-#if USE_SSE2
- return _mm_packs_epi32(a, b);
-#elif USE_NEON
- return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
-#else
- return CONVERT(combine(a, b), HalfRGBA8);
-#endif
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v,
- float scale = 255.0f) {
- ivec4 i = round_pixel(v, scale);
- HalfRGBA8 xz = packRGBA8(i.z, i.x);
- HalfRGBA8 yw = packRGBA8(i.y, i.w);
- HalfRGBA8 xyzwl = zipLow(xz, yw);
- HalfRGBA8 xyzwh = zipHigh(xz, yw);
- HalfRGBA8 lo = zip2Low(xyzwl, xyzwh);
- HalfRGBA8 hi = zip2High(xyzwl, xyzwh);
- return combine(lo, hi);
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha,
- float scale = 255.0f) {
- I32 i = round_pixel(alpha, scale);
- HalfRGBA8 c = packRGBA8(i, i);
- c = zipLow(c, c);
- return zip(c, c);
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha,
- float scale = 255.0f) {
- I32 i = round_pixel(alpha, scale);
- return repeat2(packRGBA8(i, i));
-}
-
-UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v,
- float scale = 255.0f) {
- I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale);
- return repeat2(packRGBA8(i, i));
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() {
- return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
-}
-
-static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v,
- float scale = 255.0f) {
- ivec4 i = round_pixel(bit_cast<vec4>(v), scale);
- return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w));
-}
-
-static ALWAYS_INLINE WideR8 packR8(I32 a) {
-#if USE_SSE2
- return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
-#elif USE_NEON
- return vqmovun_s32(a);
-#else
- return CONVERT(a, WideR8);
-#endif
-}
-
-static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) {
- return packR8(round_pixel(c, scale));
-}
-
-static ALWAYS_INLINE WideR8 pack_pixels_R8() {
- return pack_pixels_R8(fragment_shader->gl_FragColor.x);
-}
-
-// Load a partial span > 0 and < 4 pixels.
-template <typename V, typename P>
-static ALWAYS_INLINE V partial_load_span(const P* src, int span) {
- return bit_cast<V>(
- (span >= 2
- ? combine(unaligned_load<V2<P>>(src),
- V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0})
- : V4<P>{unaligned_load<P>(src), 0, 0, 0}));
-}
-
-// Store a partial span > 0 and < 4 pixels.
-template <typename V, typename P>
-static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) {
- auto pixels = bit_cast<V4<P>>(src);
- if (span >= 2) {
- unaligned_store(dst, lowHalf(pixels));
- if (span > 2) {
- unaligned_store(dst + 2, pixels.z);
- }
- } else {
- unaligned_store(dst, pixels.x);
- }
-}
-
-// Dispatcher that chooses when to load a full or partial span
-template <typename V, typename P>
-static ALWAYS_INLINE V load_span(const P* src, int span) {
- if (span >= 4) {
- return unaligned_load<V, P>(src);
- } else {
- return partial_load_span<V, P>(src, span);
- }
-}
-
-// Dispatcher that chooses when to store a full or partial span
-template <typename V, typename P>
-static ALWAYS_INLINE void store_span(P* dst, V src, int span) {
- if (span >= 4) {
- unaligned_store<V, P>(dst, src);
- } else {
- partial_store_span<V, P>(dst, src, span);
- }
-}
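// For illustration, a minimal sketch of the intended calling pattern for the
// load_span/store_span dispatchers (example_row and the "modify chunk" step
// are hypothetical, not part of SWGL): walk a row in 4-pixel chunks and let
// the trailing 1-3 pixels fall through to the partial load/store paths.
//
//   template <typename V, typename P>
//   void example_row(P* buf, int len) {
//     for (; len > 0; buf += 4, len -= 4) {
//       V chunk = load_span<V, P>(buf, len);
//       // ... modify chunk here ...
//       store_span<V, P>(buf, chunk, len);
//     }
//   }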
-
-template <typename T>
-static ALWAYS_INLINE T muldiv256(T x, T y) {
- return (x * y) >> 8;
-}
-
-// (x*y + x) >> 8, cheap approximation of (x*y) / 255
-template <typename T>
-static ALWAYS_INLINE T muldiv255(T x, T y) {
- return (x * y + x) >> 8;
-}
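// Quick sanity check of the approximation above (scalar arithmetic, for
// illustration; SWGL applies these to packed 16-bit lanes):
//   muldiv255(255, 255) = (255*255 + 255) >> 8 = 65280 >> 8 = 255  (exact: 255)
//   muldiv255(128, 128) = (128*128 + 128) >> 8 = 16512 >> 8 = 64   (exact: ~64.25)
// muldiv256 is the cheaper variant for weights that are already scaled into a
// 0..256 range, such as the AA coverage values produced by DO_AA further down.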
-
-template <typename V>
-static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v,
- float scale = 255.0f) {
- return pack_pixels_RGBA8(v, scale);
-}
-
-template <typename C>
-static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) {
- return pack_pixels_R8(c, scale);
-}
-
-// Helper functions to apply a color modulus when available.
-struct NoColor {};
-
-template <typename P>
-static ALWAYS_INLINE P applyColor(P src, NoColor) {
- return src;
-}
-
-struct InvertColor {};
-
-template <typename P>
-static ALWAYS_INLINE P applyColor(P src, InvertColor) {
- return 255 - src;
-}
-
-template <typename P>
-static ALWAYS_INLINE P applyColor(P src, P color) {
- return muldiv255(color, src);
-}
-
-static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) {
- return applyColor(unpack(src), color);
-}
-
-template <typename P, typename C>
-static ALWAYS_INLINE auto packColor(P* buf, C color) {
- return pack_span(buf, color, 255.0f);
-}
-
-template <typename P>
-static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) {
- return noColor;
-}
-
-template <typename P>
-static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf,
- InvertColor invertColor) {
- return invertColor;
-}
-
-// Single argument variation that takes an explicit destination buffer type.
-template <typename P, typename C>
-static ALWAYS_INLINE auto packColor(C color) {
- // Just pass in a typed null pointer, as the pack routines never use the
- // pointer's value, just its type.
- return packColor((P*)0, color);
-}
-
-// Byte-wise addition for when x or y is a signed 8-bit value stored in the
-// low byte of a larger type T with zeroed-out high bits, where T is wider
-// than 8 bits, e.g. uint16_t. This can result when muldiv255 is used
-// upon signed operands, using up all the precision in a 16 bit integer, and
-// potentially losing the sign bit in the last >> 8 shift. Due to the
-// properties of two's complement arithmetic, even though we've discarded the
-// sign bit, we can still represent a negative number under addition (without
-// requiring any extra sign bits), just that any negative number will behave
-// like a large unsigned number under addition, generating a single carry bit
-// on overflow that we need to discard. Thus, just doing a byte-wise add will
-// overflow without the troublesome carry, giving us only the remaining 8 low
-// bits we actually need while keeping the high bits at zero.
-template <typename T>
-static ALWAYS_INLINE T addlow(T x, T y) {
- typedef VectorType<uint8_t, sizeof(T)> bytes;
- return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
-}
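// Worked example (illustrative, single uint16_t lane): let x = 200 (0x00C8)
// and let y hold -16 as its low byte with a zero high byte (0x00F0). A plain
// 16-bit add would give 0x01B8, leaking a carry into the high byte, but the
// byte-wise add wraps within each byte: 0xC8 + 0xF0 = 0xB8 = 184, which is
// exactly 200 - 16, and the high byte stays zero.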
-
-// Replace color components of each pixel with the pixel's alpha values.
-template <typename T>
-static ALWAYS_INLINE T alphas(T c) {
- return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
-}
-
-// Replace the alpha values of the first vector with alpha values from the
-// second, while leaving the color components unmodified.
-template <typename T>
-static ALWAYS_INLINE T set_alphas(T c, T a) {
- return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31);
-}
-
-// Miscellaneous helper functions for working with packed RGBA8 data.
-static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t,
- HalfRGBA8 e) {
- return bit_cast<HalfRGBA8>((c & t) | (~c & e));
-}
-
-template <typename T, typename C, int N>
-static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c,
- VectorType<T, N> t,
- VectorType<T, N> e) {
- return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)),
- if_then_else(highHalf(c), highHalf(t), highHalf(e)));
-}
-
-static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) {
-#if USE_SSE2
- return bit_cast<HalfRGBA8>(
- _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
-#elif USE_NEON
- return vminq_u16(x, y);
-#else
- return if_then_else(x < y, x, y);
-#endif
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x,
- VectorType<T, N> y) {
- return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y)));
-}
-
-static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) {
-#if USE_SSE2
- return bit_cast<HalfRGBA8>(
- _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y)));
-#elif USE_NEON
- return vmaxq_u16(x, y);
-#else
- return if_then_else(x > y, x, y);
-#endif
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x,
- VectorType<T, N> y) {
- return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y)));
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) {
- return combine(recip(lowHalf(v)), recip(highHalf(v)));
-}
-
-// Helper to get the reciprocal if the value is non-zero, or otherwise default
-// to the supplied fallback value.
-template <typename V>
-static ALWAYS_INLINE V recip_or(V v, float f) {
- return if_then_else(v != V(0.0f), recip(v), V(f));
-}
-
-template <typename T, int N>
-static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) {
- return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v)));
-}
-
-// Extract the alpha components so that we can cheaply calculate the reciprocal
-// on a single SIMD register. Then multiply the duplicated alpha reciprocal with
-// the pixel data. 0 alpha is treated as transparent black.
-static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) {
- Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f);
- return v * a.xxxxyyyyzzzzwwww;
-}
-
-// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to
-// RGBA to unpack.
-static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) {
- return bit_cast<vec4>(
- SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15));
-}
-
-// The following lum/sat functions mostly follow the KHR_blend_equation_advanced
-// specification but are rearranged to work on premultiplied data.
-static ALWAYS_INLINE Float lumv3(vec3 v) {
- return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f;
-}
-
-static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); }
-
-static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); }
-
-static inline vec3 clip_color(vec3 v, Float lum, Float alpha) {
- Float mincol = max(-minv3(v), lum);
- Float maxcol = max(maxv3(v), alpha - lum);
- return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f));
-}
-
-static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) {
- return clip_color(base - lumv3(base), lumv3(ref), alpha);
-}
-
-static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) {
- vec3 diff = base - minv3(base);
- Float sbase = maxv3(diff);
- Float ssat = maxv3(sref) - minv3(sref);
- // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale
- // to black, as per specification.
- return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha);
-}
-
-// Flags that reflect the current blend-stage clipping to be applied.
-enum SWGLClipFlag {
- SWGL_CLIP_FLAG_MASK = 1 << 0,
- SWGL_CLIP_FLAG_AA = 1 << 1,
- SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2,
-};
-static int swgl_ClipFlags = 0;
-static BlendKey swgl_BlendOverride = BLEND_KEY_NONE;
-static WideRGBA8 swgl_BlendColorRGBA8 = {0};
-static WideRGBA8 swgl_BlendAlphaRGBA8 = {0};
-
-// A pointer into the color buffer for the start of the span.
-static void* swgl_SpanBuf = nullptr;
-// A pointer into the clip mask for the start of the span.
-static uint8_t* swgl_ClipMaskBuf = nullptr;
-
-static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) {
- return mask;
-}
-static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) {
- WideRG8 maskRG = zip(mask, mask);
- return zip(maskRG, maskRG);
-}
-
-// Loads a chunk of clip masks. The current pointer into the color buffer is
-// used to reconstruct the relative position within the span. From there, the
-// pointer into the clip mask can be generated from the start of the clip mask
-// span.
-template <typename P>
-static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) {
- return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf];
-}
-
-template <typename P>
-static ALWAYS_INLINE auto load_clip_mask(P* buf, int span)
- -> decltype(expand_mask(buf, 0)) {
- return expand_mask(buf,
- unpack(load_span<PackedR8>(get_clip_mask(buf), span)));
-}
-
-// Temporarily removes masking from the blend stage, assuming the caller will
-// handle it.
-static ALWAYS_INLINE void override_clip_mask() {
- blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE);
-}
-
-// Restores masking to the blend stage, assuming it was previously overridden.
-static ALWAYS_INLINE void restore_clip_mask() {
- blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key);
-}
-
-// A pointer to the start of the opaque destination region of the span for AA.
-static const uint8_t* swgl_OpaqueStart = nullptr;
-// The size, in bytes, of the opaque region.
-static uint32_t swgl_OpaqueSize = 0;
-// AA coverage distance offsets for the left and right edges.
-static Float swgl_LeftAADist = 0.0f;
-static Float swgl_RightAADist = 0.0f;
-// AA coverage slope values used for accumulating coverage for each step.
-static Float swgl_AASlope = 0.0f;
-
-// Get the number of pixels we need to process before the start of the opaque
-// region.
-template <typename P>
-static ALWAYS_INLINE int get_aa_opaque_start(P* buf) {
- return max(int((P*)swgl_OpaqueStart - buf), 0);
-}
-
-// Assuming we are already in the opaque part of the span, return the remaining
-// size of the opaque part.
-template <typename P>
-static ALWAYS_INLINE int get_aa_opaque_size(P* buf) {
- return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0);
-}
-
-// Temporarily removes anti-aliasing from the blend stage, assuming the caller
-// will handle it.
-static ALWAYS_INLINE void override_aa() {
- blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE);
-}
-
-// Restores anti-aliasing to the blend stage, assuming it was previously
-// overridden.
-static ALWAYS_INLINE void restore_aa() {
- blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key);
-}
-
-static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst,
- WideRGBA8 src, int span = 4) {
- WideRGBA8 dst = unpack(pdst);
- const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF,
- 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0,
- 0xFFFF, 0xFFFF, 0xFFFF, 0};
- const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
- 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
- const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
- 0, 0, 0, 255, 0, 0, 0, 255};
-
-// clang-format off
- // Computes AA for the given pixel based on the offset of the pixel within
-  // the destination row. Given the initial coverage offsets for the left and
-  // right
- // edges, the offset is scaled by the slope and accumulated to find the
- // minimum coverage value for the pixel. A final weight is generated that
- // can be used to scale the source pixel.
-#define DO_AA(format, body) \
- do { \
- int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \
- if (uint32_t(offset) >= swgl_OpaqueSize) { \
- Float delta = swgl_AASlope * float(offset); \
- Float dist = clamp(min(swgl_LeftAADist + delta.x, \
- swgl_RightAADist + delta.y), \
- 0.0f, 256.0f); \
- auto aa = pack_pixels_##format(dist, 1.0f); \
- body; \
- } \
- } while (0)
-
- // Each blend case is preceded by the MASK_ variant. The MASK_ case first
- // loads the mask values and multiplies the source value by them. After, it
- // falls through to the normal blending case using the masked source. The
- // AA_ variations may further precede the blend cases, in which case the
- // source value is further modified before use.
-#define BLEND_CASE_KEY(key) \
- case AA_##key: \
- DO_AA(RGBA8, src = muldiv256(src, aa)); \
- goto key; \
- case AA_MASK_##key: \
- DO_AA(RGBA8, src = muldiv256(src, aa)); \
- FALLTHROUGH; \
- case MASK_##key: \
- src = muldiv255(src, load_clip_mask(buf, span)); \
- FALLTHROUGH; \
- case key: key
-
-#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))
-
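// Illustrative expansion (KEY stands in for one BLEND_KEY enum value, and
// AA_KEY/MASK_KEY/AA_MASK_KEY for its prefixed variants): a use such as
//   BLEND_CASE(GL_ONE, GL_ZERO): return src;
// turns into roughly
//   case AA_KEY:
//     DO_AA(RGBA8, src = muldiv256(src, aa));
//     goto KEY;                                // skip the mask path
//   case AA_MASK_KEY:
//     DO_AA(RGBA8, src = muldiv256(src, aa));
//     FALLTHROUGH;
//   case MASK_KEY:
//     src = muldiv255(src, load_clip_mask(buf, span));
//     FALLTHROUGH;
//   case KEY: KEY:                             // the trailing "key" plus the
//     return src;                              // caller's ':' forms the label
//                                              // that the AA_ case jumps to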
- switch (blend_key) {
- BLEND_CASE(GL_ONE, GL_ZERO):
- return src;
- BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE,
- GL_ONE_MINUS_SRC_ALPHA):
- // dst + src.a*(src.rgb1 - dst)
- // use addlow for signed overflow
- return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst));
- BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
- return src + dst - muldiv255(dst, alphas(src));
- BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
- return dst - muldiv255(dst, src);
- BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
- return dst - (muldiv255(dst, src) & RGB_MASK);
- BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
- return dst - muldiv255(dst, alphas(src));
- BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
- return muldiv255(src, dst);
- BLEND_CASE(GL_ONE, GL_ONE):
- return src + dst;
- BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
- return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
- BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
- // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
- return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
- BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
-      // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst)
-      // use addlow for signed overflow
- return addlow(
- dst, muldiv255(src, repeat2(ctx->blendcolor) - dst));
-
- // We must explicitly handle the masked/anti-aliased secondary blend case.
- // The secondary color as well as the source must be multiplied by the
- // weights.
- case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
- WideRGBA8 secondary =
- applyColor(dst,
- packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
- return src + dst - secondary;
- }
- case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
- WideRGBA8 secondary =
- applyColor(dst,
- packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
- WideRGBA8 mask = load_clip_mask(buf, span);
- return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
- }
- case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
- WideRGBA8 secondary =
- applyColor(dst,
- packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
- DO_AA(RGBA8, {
- src = muldiv256(src, aa);
- secondary = muldiv256(secondary, aa);
- });
- return src + dst - secondary;
- }
- case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
- WideRGBA8 secondary =
- applyColor(dst,
- packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor));
- WideRGBA8 mask = load_clip_mask(buf, span);
- DO_AA(RGBA8, mask = muldiv256(mask, aa));
- return muldiv255(src, mask) + dst - muldiv255(secondary, mask);
- }
-
- BLEND_CASE(GL_MIN):
- return min(src, dst);
- BLEND_CASE(GL_MAX):
- return max(src, dst);
-
- // The KHR_blend_equation_advanced spec describes the blend equations such
- // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to
- // the result:
-  //   Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
- // Ar = As*Ad + As*(1-Ad) + Ad*(1-As)
- // However, working with unpremultiplied values requires expensive math to
- // unpremultiply and premultiply again during blending. We can use the fact
- // that premultiplied value P = C*A and simplify the equations such that no
- // unpremultiplied colors are necessary, allowing us to stay with integer
- // math that avoids floating-point conversions in the common case. Some of
- // the blend modes require division or sqrt, in which case we do convert
- // to (possibly transposed/unpacked) floating-point to implement the mode.
- // However, most common modes can still use cheaper premultiplied integer
- // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified
- // to:
- // Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As)
- // .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As
- // Ar = As*Ad + As - As*Ad + Ad - Ad*As
- // .. Ar = As + Ad - As*Ad
- // Note that the alpha equation is the same for all blend equations, such
- // that so long as the implementation results in As + Ad - As*Ad, we can
- // avoid using separate instructions to compute the alpha result, which is
- // dependent on the math used to implement each blend mode. The exact
- // reductions used to get the final math for every blend mode are too
- // involved to show here in comments, but mostly follows from replacing
-  // Cs*As and Cd*Ad with Ps and Pd while factoring out as many common terms
- // as possible.
-
- BLEND_CASE(GL_MULTIPLY_KHR): {
- WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK),
- alphas(dst) - (dst & RGB_MASK));
- return src + dst + (diff & RGB_MASK) - alphas(diff);
- }
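// Checking the case above against the derivation in the preceding comment
// (illustrative, per channel, all products implicitly divided by 255):
//   RGB lanes of diff:  (As - Ps)*(Ad - Pd) = As*Ad - As*Pd - Ps*Ad + Ps*Pd
//   alpha lane of diff: As*Ad   (RGB_MASK zeroes the alpha components that
//                                get subtracted from alphas(src)/alphas(dst))
// so src + dst + (diff & RGB_MASK) - alphas(diff) yields
//   RGB:   Ps + Pd + Ps*Pd - As*Pd - Ps*Ad   (the multiply mode)
//   alpha: As + Ad - As*Ad                   (the common alpha result)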
- BLEND_CASE(GL_SCREEN_KHR):
- return src + dst - muldiv255(src, dst);
- BLEND_CASE(GL_OVERLAY_KHR): {
- WideRGBA8 srcA = alphas(src);
- WideRGBA8 dstA = alphas(dst);
- WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
- return src + dst +
- if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff),
- -diff);
- }
- BLEND_CASE(GL_DARKEN_KHR):
- return src + dst -
- max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
- BLEND_CASE(GL_LIGHTEN_KHR):
- return src + dst -
- min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src)));
-
- BLEND_CASE(GL_COLORDODGE_KHR): {
- // Color-dodge and color-burn require division, so we convert to FP math
- // here, but avoid transposing to a vec4.
- WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
- WideRGBA32F srcA = alphas(srcF);
- WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
- WideRGBA32F dstA = alphas(dstF);
- return pack_pixels_RGBA8(
- srcA * set_alphas(
- min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)),
- dstF) +
- srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
- 1.0f / 255.0f);
- }
- BLEND_CASE(GL_COLORBURN_KHR): {
- WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
- WideRGBA32F srcA = alphas(srcF);
- WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
- WideRGBA32F dstA = alphas(dstF);
- return pack_pixels_RGBA8(
- srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA *
- recip_or(srcF, 255.0f))),
- dstF) +
- srcF * (255.0f - dstA) + dstF * (255.0f - srcA),
- 1.0f / 255.0f);
- }
- BLEND_CASE(GL_HARDLIGHT_KHR): {
- WideRGBA8 srcA = alphas(src);
- WideRGBA8 dstA = alphas(dst);
- WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst);
- return src + dst +
- if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff),
- -diff);
- }
-
- BLEND_CASE(GL_SOFTLIGHT_KHR): {
- // Soft-light requires an unpremultiply that can't be factored out as
- // well as a sqrt, so we convert to FP math here, but avoid transposing
- // to a vec4.
- WideRGBA32F srcF = CONVERT(src, WideRGBA32F);
- WideRGBA32F srcA = alphas(srcF);
- WideRGBA32F dstF = CONVERT(dst, WideRGBA32F);
- WideRGBA32F dstA = alphas(dstF);
- WideRGBA32F dstU = unpremultiply(dstF);
- WideRGBA32F scale = srcF + srcF - srcA;
- return pack_pixels_RGBA8(
- dstF * (255.0f +
- set_alphas(
- scale *
- if_then_else(scale < 0.0f, 1.0f - dstU,
- min((16.0f * dstU - 12.0f) * dstU + 3.0f,
- inversesqrt(dstU) - 1.0f)),
- WideRGBA32F(0.0f))) +
- srcF * (255.0f - dstA),
- 1.0f / 255.0f);
- }
- BLEND_CASE(GL_DIFFERENCE_KHR): {
- WideRGBA8 diff =
- min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst)));
- return src + dst - diff - (diff & RGB_MASK);
- }
- BLEND_CASE(GL_EXCLUSION_KHR): {
- WideRGBA8 diff = muldiv255(src, dst);
- return src + dst - diff - (diff & RGB_MASK);
- }
-
- // The HSL blend modes are non-separable and require complicated use of
- // division. It is advantageous to convert to FP and transpose to vec4
- // math to more easily manipulate the individual color components.
-#define DO_HSL(rgb) \
- do { \
- vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); \
- vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); \
- Float srcA = srcV.w * (1.0f / 255.0f); \
- Float dstA = dstV.w * (1.0f / 255.0f); \
- Float srcDstA = srcV.w * dstA; \
- vec3 srcC = vec3(srcV) * dstA; \
- vec3 dstC = vec3(dstV) * srcA; \
- return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \
- srcV.w + dstV.w - srcDstA), \
- 1.0f); \
- } while (0)
-
- BLEND_CASE(GL_HSL_HUE_KHR):
- DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA));
- BLEND_CASE(GL_HSL_SATURATION_KHR):
- DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA));
- BLEND_CASE(GL_HSL_COLOR_KHR):
- DO_HSL(set_lum(srcC, dstC, srcDstA));
- BLEND_CASE(GL_HSL_LUMINOSITY_KHR):
- DO_HSL(set_lum(dstC, srcC, srcDstA));
-
- // SWGL-specific extended blend modes.
- BLEND_CASE(SWGL_BLEND_DROP_SHADOW): {
- // Premultiplied alpha over blend, but with source color set to source alpha
- // modulated with a constant color.
- WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8);
- return color + dst - muldiv255(dst, alphas(color));
- }
-
- BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT):
- // Premultiplied alpha over blend, but treats the source as a subpixel mask
- // modulated with a constant color.
- return applyColor(src, swgl_BlendColorRGBA8) + dst -
- muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8));
-
- default:
- UNREACHABLE;
- // return src;
- }
-
-#undef BLEND_CASE
-#undef BLEND_CASE_KEY
- // clang-format on
-}
-
-static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src,
- int span = 4) {
-// clang-format off
-#define BLEND_CASE_KEY(key) \
- case AA_##key: \
- DO_AA(R8, src = muldiv256(src, aa)); \
- goto key; \
- case AA_MASK_##key: \
- DO_AA(R8, src = muldiv256(src, aa)); \
- FALLTHROUGH; \
- case MASK_##key: \
- src = muldiv255(src, load_clip_mask(buf, span)); \
- FALLTHROUGH; \
- case key: key
-
-#define BLEND_CASE(...) BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__))
-
- switch (blend_key) {
- BLEND_CASE(GL_ONE, GL_ZERO):
- return src;
- BLEND_CASE(GL_ZERO, GL_SRC_COLOR):
- return muldiv255(src, dst);
- BLEND_CASE(GL_ONE, GL_ONE):
- return src + dst;
- default:
- UNREACHABLE;
- // return src;
- }
-
-#undef BLEND_CASE
-#undef BLEND_CASE_KEY
- // clang-format on
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) {
- unaligned_store(buf, pack(r));
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) {
- partial_store_span(buf, pack(r), len);
-}
-
-static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) {
- return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r);
-}
-
-static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) {
- return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len);
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) {
- unaligned_store(buf, r);
-}
-
-static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) {
- partial_store_span(buf, r, len);
-}
-
-static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) {
- return pack(blend_span(buf, unpack(r)));
-}
-
-static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r,
- int len) {
- return pack(blend_span(buf, unpack(r), len));
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) {
- unaligned_store(buf, pack(r));
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) {
- partial_store_span(buf, pack(r), len);
-}
-
-static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) {
- return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r);
-}
-
-static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) {
- return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r,
- len);
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) {
- unaligned_store(buf, r);
-}
-
-static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) {
- partial_store_span(buf, r, len);
-}
-
-static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) {
- return pack(blend_span(buf, unpack(r)));
-}
-
-static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) {
- return pack(blend_span(buf, unpack(r), len));
-}
-
-template <bool BLEND, typename P, typename R>
-static ALWAYS_INLINE void commit_blend_span(P* buf, R r) {
- if (BLEND) {
- commit_span(buf, blend_span(buf, r));
- } else {
- commit_span(buf, r);
- }
-}
-
-template <bool BLEND, typename P, typename R>
-static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) {
- if (BLEND) {
- commit_span(buf, blend_span(buf, r, len), len);
- } else {
- commit_span(buf, r, len);
- }
-}
-
-template <typename P, typename R>
-static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) {
- for (P* end = &buf[len & ~3]; buf < end; buf += 4) {
- commit_span(buf, blend_span(buf, r));
- }
- len &= 3;
- if (len > 0) {
- partial_store_span(buf, pack(blend_span(buf, r, len)), len);
- }
-}
-
-template <bool BLEND>
-static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) {
- commit_blend_solid_span(buf, r, len);
-}
-
-template <>
-ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r,
- int len) {
- fill_n(buf, len, bit_cast<U32>(pack(r)).x);
-}
-
-template <bool BLEND>
-static void commit_solid_span(uint8_t* buf, WideR8 r, int len) {
- commit_blend_solid_span(buf, r, len);
-}
-
-template <>
-ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) {
- PackedR8 p = pack(r);
- if (uintptr_t(buf) & 3) {
- int align = 4 - (uintptr_t(buf) & 3);
- align = min(align, len);
- partial_store_span(buf, p, align);
- buf += align;
- len -= align;
- }
- fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p));
- buf += len & ~3;
- len &= 3;
- if (len > 0) {
- partial_store_span(buf, p, len);
- }
-}
diff --git a/third_party/webrender/swgl/src/composite.h b/third_party/webrender/swgl/src/composite.h
deleted file mode 100644
index f88de485fdd..00000000000
--- a/third_party/webrender/swgl/src/composite.h
+++ /dev/null
@@ -1,1069 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-template <bool COMPOSITE, typename P>
-static inline void copy_row(P* dst, const P* src, int span) {
- // No scaling, so just do a fast copy.
- memcpy(dst, src, span * sizeof(P));
-}
-
-template <>
-void copy_row<true, uint32_t>(uint32_t* dst, const uint32_t* src, int span) {
- // No scaling, so just do a fast composite.
- auto* end = dst + span;
- while (dst + 4 <= end) {
- WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
- WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
- PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
- unaligned_store(dst, r);
- src += 4;
- dst += 4;
- }
- if (dst < end) {
- WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - dst));
- WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
- auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
- partial_store_span(dst, r, end - dst);
- }
-}
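// The blend above is the premultiplied source-over operator: per channel,
//   out = src + dst * (255 - src.a) / 255
// rewritten as src + dst - muldiv255(dst, src.a) so that everything stays in
// integer math (illustrative scalar form; the code works on packed 16-bit
// lanes).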
-
-template <bool COMPOSITE, typename P>
-static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
- int span, int frac) {
- // Do scaling with different source and dest widths.
- for (P* end = dst + span; dst < end; dst++) {
- *dst = *src;
- // Step source according to width ratio.
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- }
-}
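// Illustrative trace of the stepping above: upscaling a 3-pixel source row to
// 5 destination pixels (frac starting at 0) reads source indices 0, 0, 1, 1, 2.
// frac accumulates srcWidth (3) per output pixel, and the source pointer
// advances each time frac reaches dstWidth (5).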
-
-template <>
-void scale_row<true, uint32_t>(uint32_t* dst, int dstWidth, const uint32_t* src,
- int srcWidth, int span, int frac) {
- // Do scaling with different source and dest widths.
- // Gather source pixels four at a time for better packing.
- auto* end = dst + span;
- for (; dst + 4 <= end; dst += 4) {
- U32 srcn;
- srcn.x = *src;
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- srcn.y = *src;
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- srcn.z = *src;
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- srcn.w = *src;
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn));
- WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
- PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
- unaligned_store(dst, r);
- }
- if (dst < end) {
- // Process any remaining pixels. Try to gather as many pixels as possible
- // into a single source chunk for compositing.
- U32 srcn = {*src, 0, 0, 0};
- if (end - dst > 1) {
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- srcn.y = *src;
- if (end - dst > 2) {
- for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
- src++;
- }
- srcn.z = *src;
- }
- }
- WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn));
- WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
- auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
- partial_store_span(dst, r, end - dst);
- }
-}
-
-template <bool COMPOSITE = false>
-static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq,
- Texture& dsttex, const IntRect& dstReq,
- bool invertY, const IntRect& clipRect) {
- assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 &&
- dsttex.internal_format == GL_RGBA8));
- // Cache scaling ratios
- int srcWidth = srcReq.width();
- int srcHeight = srcReq.height();
- int dstWidth = dstReq.width();
- int dstHeight = dstReq.height();
- // Compute valid dest bounds
- IntRect dstBounds = dsttex.sample_bounds(dstReq).intersect(clipRect);
- // Compute valid source bounds
- IntRect srcBounds = srctex.sample_bounds(srcReq, invertY);
- // If srcReq is outside the source texture, we need to clip the sampling
- // bounds so that we never sample outside valid source bounds. Get texture
- // bounds relative to srcReq and scale to dest-space rounding inward, using
- // this rect to limit the dest bounds further.
- IntRect srcClip = srctex.bounds() - srcReq.origin();
- if (invertY) {
- srcClip.invert_y(srcReq.height());
- }
- srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true);
- dstBounds.intersect(srcClip);
- // Check if clipped sampling bounds are empty
- if (dstBounds.is_empty()) {
- return;
- }
-
- // Calculate source and dest pointers from clamped offsets
- int bpp = srctex.bpp();
- int srcStride = srctex.stride();
- int destStride = dsttex.stride();
- char* dest = dsttex.sample_ptr(dstReq, dstBounds);
- // Clip the source bounds by the destination offset.
- int fracX = srcWidth * dstBounds.x0;
- int fracY = srcHeight * dstBounds.y0;
- srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0);
- srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0);
- fracX %= dstWidth;
- fracY %= dstHeight;
- char* src = srctex.sample_ptr(srcReq, srcBounds, invertY);
- // Inverted Y must step downward along source rows
- if (invertY) {
- srcStride = -srcStride;
- }
- int span = dstBounds.width();
- for (int rows = dstBounds.height(); rows > 0; rows--) {
- switch (bpp) {
- case 1:
- if (srcWidth == dstWidth)
- copy_row<COMPOSITE>((uint8_t*)dest, (uint8_t*)src, span);
- else
- scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint8_t*)src,
- srcWidth, span, fracX);
- break;
- case 2:
- if (srcWidth == dstWidth)
- copy_row<COMPOSITE>((uint16_t*)dest, (uint16_t*)src, span);
- else
- scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint16_t*)src,
- srcWidth, span, fracX);
- break;
- case 4:
- if (srcWidth == dstWidth)
- copy_row<COMPOSITE>((uint32_t*)dest, (uint32_t*)src, span);
- else
- scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint32_t*)src,
- srcWidth, span, fracX);
- break;
- default:
- assert(false);
- break;
- }
- dest += destStride;
- // Step source according to height ratio.
- for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) {
- src += srcStride;
- }
- }
-}
-
-template <bool COMPOSITE>
-static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
- float srcDU, sampler2D sampler) {
- vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
- for (; span >= 4; span -= 4) {
- auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv));
- unaligned_store(dest, srcpx);
- dest += 4;
- uv.x += 4 * srcDU;
- }
- if (span > 0) {
- auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv));
- partial_store_span(dest, srcpx, span);
- }
-}
-
-template <>
-void linear_row_blit<true>(uint32_t* dest, int span, const vec2_scalar& srcUV,
- float srcDU, sampler2D sampler) {
- vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
- for (; span >= 4; span -= 4) {
- WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv));
- WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
- PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
- unaligned_store(dest, r);
-
- dest += 4;
- uv.x += 4 * srcDU;
- }
- if (span > 0) {
- WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv));
- WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
- PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
- partial_store_span(dest, r, span);
- }
-}
-
-template <bool COMPOSITE>
-static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
- float srcDU, sampler2D sampler) {
- vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
- for (; span >= 4; span -= 4) {
- auto srcpx = textureLinearPackedR8(sampler, ivec2(uv));
- unaligned_store(dest, srcpx);
- dest += 4;
- uv.x += 4 * srcDU;
- }
- if (span > 0) {
- auto srcpx = textureLinearPackedR8(sampler, ivec2(uv));
- partial_store_span(dest, srcpx, span);
- }
-}
-
-template <bool COMPOSITE>
-static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
- float srcDU, sampler2D sampler) {
- vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
- for (; span >= 4; span -= 4) {
- auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv));
- unaligned_store(dest, srcpx);
- dest += 4;
- uv.x += 4 * srcDU;
- }
- if (span > 0) {
- auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv));
- partial_store_span(dest, srcpx, span);
- }
-}
-
-template <bool COMPOSITE = false>
-static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq,
- Texture& dsttex, const IntRect& dstReq,
- bool invertY, const IntRect& clipRect) {
- assert(srctex.internal_format == GL_RGBA8 ||
- srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8);
- assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 &&
- dsttex.internal_format == GL_RGBA8));
- // Compute valid dest bounds
- IntRect dstBounds = dsttex.sample_bounds(dstReq);
- dstBounds.intersect(clipRect);
- // Check if sampling bounds are empty
- if (dstBounds.is_empty()) {
- return;
- }
- // Initialize sampler for source texture
- sampler2D_impl sampler;
- init_sampler(&sampler, srctex);
- sampler.filter = TextureFilter::LINEAR;
- // Compute source UVs
- vec2_scalar srcUV(srcReq.x0, srcReq.y0);
- vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
- float(srcReq.height()) / dstReq.height());
- // Inverted Y must step downward along source rows
- if (invertY) {
- srcUV.y += srcReq.height();
- srcDUV.y = -srcDUV.y;
- }
- // Skip to clamped source start
- srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
- // Scale UVs by lerp precision
- srcUV = linearQuantize(srcUV, 128);
- srcDUV *= 128.0f;
- // Calculate dest pointer from clamped offsets
- int bpp = dsttex.bpp();
- int destStride = dsttex.stride();
- char* dest = dsttex.sample_ptr(dstReq, dstBounds);
- int span = dstBounds.width();
- for (int rows = dstBounds.height(); rows > 0; rows--) {
- switch (bpp) {
- case 1:
- linear_row_blit<COMPOSITE>((uint8_t*)dest, span, srcUV, srcDUV.x,
- &sampler);
- break;
- case 2:
- linear_row_blit<COMPOSITE>((uint16_t*)dest, span, srcUV, srcDUV.x,
- &sampler);
- break;
- case 4:
- linear_row_blit<COMPOSITE>((uint32_t*)dest, span, srcUV, srcDUV.x,
- &sampler);
- break;
- default:
- assert(false);
- break;
- }
- dest += destStride;
- srcUV.y += srcDUV.y;
- }
-}
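// The factor of 128 used above when quantizing srcUV is 2^7, matching the
// 7 bits of fractional precision the linear samplers expect (see the
// "& 0x7F" and ">> 7" arithmetic in textureLinearRowR8 later in this file).
// Illustrative note; linearQuantize itself is defined elsewhere.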
-
-extern "C" {
-
-void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
- GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
- GLbitfield mask, GLenum filter) {
- assert(mask == GL_COLOR_BUFFER_BIT);
- Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
- if (!srcfb) return;
- Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
- if (!dstfb) return;
- Texture& srctex = ctx->textures[srcfb->color_attachment];
- if (!srctex.buf) return;
- Texture& dsttex = ctx->textures[dstfb->color_attachment];
- if (!dsttex.buf) return;
- assert(!dsttex.locked);
- if (srctex.internal_format != dsttex.internal_format) {
- assert(false);
- return;
- }
- // Force flipped Y onto dest coordinates
- if (srcY1 < srcY0) {
- swap(srcY0, srcY1);
- swap(dstY0, dstY1);
- }
- bool invertY = dstY1 < dstY0;
- if (invertY) {
- swap(dstY0, dstY1);
- }
- IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset;
- IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset;
- if (srcReq.is_empty() || dstReq.is_empty()) {
- return;
- }
- IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()};
- prepare_texture(srctex);
- prepare_texture(dsttex, &dstReq);
- if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR &&
- (srctex.internal_format == GL_RGBA8 || srctex.internal_format == GL_R8 ||
- srctex.internal_format == GL_RG8)) {
- linear_blit(srctex, srcReq, dsttex, dstReq, invertY, dstReq);
- } else {
- scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect);
- }
-}
-
-typedef Texture LockedTexture;
-
-// Lock the given texture to prevent modification.
-LockedTexture* LockTexture(GLuint texId) {
- Texture& tex = ctx->textures[texId];
- if (!tex.buf) {
- assert(tex.buf != nullptr);
- return nullptr;
- }
- if (__sync_fetch_and_add(&tex.locked, 1) == 0) {
- // If this is the first time locking the texture, flush any delayed clears.
- prepare_texture(tex);
- }
- return (LockedTexture*)&tex;
-}
-
-// Lock the given framebuffer's color attachment to prevent modification.
-LockedTexture* LockFramebuffer(GLuint fboId) {
- Framebuffer& fb = ctx->framebuffers[fboId];
- // Only allow locking a framebuffer if it has a valid color attachment.
- if (!fb.color_attachment) {
- assert(fb.color_attachment != 0);
- return nullptr;
- }
- return LockTexture(fb.color_attachment);
-}
-
-// Reference an already locked resource
-void LockResource(LockedTexture* resource) {
- if (!resource) {
- return;
- }
- __sync_fetch_and_add(&resource->locked, 1);
-}
-
-// Remove a lock on a texture that has been previously locked
-void UnlockResource(LockedTexture* resource) {
- if (!resource) {
- return;
- }
- if (__sync_fetch_and_add(&resource->locked, -1) <= 0) {
- // The lock should always be non-zero before unlocking.
- assert(0);
- }
-}
-
-// Get the underlying buffer for a locked resource
-void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
- int32_t* height, int32_t* stride) {
- *width = resource->width;
- *height = resource->height;
- *stride = resource->stride();
- return resource->buf;
-}
-
-// Extension for optimized compositing of textures or framebuffers that may be
-// safely used across threads. The source and destination must be locked to
-// ensure that they can be safely accessed while the SWGL context might be used
-// by another thread. Band extents along the Y axis may be used to clip the
-// destination rectangle without affecting the integer scaling ratios.
-void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX,
- GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
- GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
- GLboolean opaque, GLboolean flip, GLenum filter, GLint clipX,
- GLint clipY, GLsizei clipWidth, GLsizei clipHeight) {
- if (!lockedDst || !lockedSrc) {
- return;
- }
- Texture& srctex = *lockedSrc;
- Texture& dsttex = *lockedDst;
- assert(srctex.bpp() == 4);
- assert(dsttex.bpp() == 4);
-
- IntRect srcReq =
- IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset;
- IntRect dstReq =
- IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
- // Compute clip rect as relative to the dstReq, as that's the same coords
- // as used for the sampling bounds.
- IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
- clipY - dstY + clipHeight};
-
- if (opaque) {
- // Ensure we have rows of at least 2 pixels when using the linear filter
- // to avoid overreading the row.
- if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) {
- linear_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
- } else {
- scale_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
- }
- } else {
- if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) {
- linear_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
- } else {
- scale_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
- }
- }
-}
-
-} // extern "C"
-
-// Saturated add helper for YUV conversion. Supported platforms have intrinsics
-// to do this natively, but a slower generic fallback is supported just in case.
-static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) {
-#if USE_SSE2
- return _mm_adds_epi16(x, y);
-#elif USE_NEON
- return vqaddq_s16(x, y);
-#else
- auto r = x + y;
- // An overflow occurred if the signs of both inputs x and y did not differ
- // but yet the sign of the result did differ.
- auto overflow = (~(x ^ y) & (r ^ x)) >> 15;
- // If there was an overflow, we need to choose the appropriate limit to clamp
- // to depending on whether or not the inputs are negative.
- auto limit = (x >> 15) ^ 0x7FFF;
- // If we didn't overflow, just use the result, and otherwise, use the limit.
- return (~overflow & r) | (overflow & limit);
-#endif
-}
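// Worked example of the fallback path (illustrative, one int16_t lane):
// x = 30000, y = 10000 wraps to r = -25536. x ^ y has a clear sign bit (the
// inputs agree in sign) while r ^ x has it set (the result's sign differs),
// so the arithmetic shift by 15 smears that into an all-ones overflow mask.
// limit is (x >> 15) ^ 0x7FFF = 0x7FFF for positive x (0x8000 for negative x),
// so the result saturates to 32767 instead of wrapping.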
-
-// Interleave and packing helper for YUV conversion. During transform by the
-// color matrix, the color components are de-interleaved as this format is
-// usually what comes out of the planar YUV textures. The components thus need
-// to be interleaved before finally getting packed to BGRA format. Alpha is
-// forced to be opaque.
-static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) {
- return pack(bit_cast<WideRGBA8>(zip(br, gg))) |
- PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
-}
-
-// clang-format off
-// Supports YUV color matrices of the form:
-// [R] [1.1643835616438356, 0.0, rv ] [Y - 16]
-// [G] = [1.1643835616438358, -gu, -gv ] x [U - 128]
-// [B] [1.1643835616438356, bu, 0.0 ] [V - 128]
-// We must be able to multiply a YUV input by a matrix coefficient ranging as
-// high as ~2.2 in the U/V cases, where U/V can be signed values between -128
-// and 127. The largest fixed-point representation we can thus support without
-// overflowing 16 bit integers leaves us 6 bits of fractional precision while
-// also supporting a sign bit. The closest representation of the Y coefficient
-// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces
-// we support. Conversions can still sometimes overflow the precision and
-// require clamping back into range, so we use saturated additions to do this
-// efficiently at no extra cost.
-// clang-format on
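// Worked example (Rec601, illustrative): rv = 1.5960... is stored as
// int16_t(1.596 * 64 + 0.5) = 102, i.e. 102/64 = 1.59375, and bu = 2.0172...
// becomes 129 = 2.015625 * 64. The largest coefficient needed (Rec2020's
// bu = 2.1418 -> 137) times the largest biased chroma magnitude (|U - 128|
// <= 128) is 137 * 128 = 17536, which still fits in int16_t, hence the choice
// of 6 fractional bits. The Y coefficient 1.1644 * 64 = 74.52 is held as 74.5
// via yScale = 2*74 + 1 combined with the >> 1 in convert() below.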
-struct YUVMatrix {
- // These constants are loaded off the "this" pointer via relative addressing
- // modes and should be about as quick to load as directly addressed SIMD
- // constant memory.
- V8<int16_t> rbCoeffs;
- V8<int16_t> gCoeffs;
- V8<uint16_t> yScale;
- V8<int16_t> yBias;
- V8<int16_t> uvBias;
- V8<int16_t> brMask;
-
- // Set the coefficients to cancel out and pass through YUV as GBR. All biases
- // are set to zero and the BR-mask is set to remove the contribution of Y to
- // the BR channels. Scales are set such that the shift by 6 in convert is
- // balanced.
- YUVMatrix()
- : rbCoeffs(1 << 6),
- gCoeffs(0),
- yScale(1 << (6 + 1)),
- yBias(0),
- uvBias(0),
- brMask(0) {}
-
- // Convert matrix coefficients to fixed-point representation.
- YUVMatrix(double rv, double gu, double gv, double bu)
- : rbCoeffs(
- zip(I16(int16_t(bu * 64.0 + 0.5)), I16(int16_t(rv * 64.0 + 0.5)))),
- gCoeffs(zip(I16(-int16_t(gu * -64.0 + 0.5)),
- I16(-int16_t(gv * -64.0 + 0.5)))),
- yScale(2 * 74 + 1),
- yBias(int16_t(-16 * 74.5) + (1 << 5)),
- uvBias(-128),
- brMask(-1) {}
-
- ALWAYS_INLINE PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) const {
- // Bias Y values by -16 and multiply by 74.5. Add 2^5 offset to round to
- // nearest 2^6. Note that we have to use an unsigned multiply with a 2x
- // scale to represent a fractional scale and to avoid shifting with the sign
- // bit.
- yy = bit_cast<V8<int16_t>>((bit_cast<V8<uint16_t>>(yy) * yScale) >> 1) +
- yBias;
-
- // Bias U/V values by -128.
- uv += uvBias;
-
- // Compute (R, B) = (74.5*Y + rv*V, 74.5*Y + bu*U)
- auto br = rbCoeffs * uv;
- br = addsat(yy & brMask, br);
- br >>= 6;
-
- // Compute G = 74.5*Y + -gu*U + -gv*V
- auto gg = gCoeffs * uv;
- gg = addsat(
- yy,
- addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16)));
- gg >>= 6;
-
- // Interleave B/R and G values. Force alpha to opaque.
- return packYUV(gg, br);
- }
-};
-
-enum YUVColorSpace { REC_601 = 0, REC_709, REC_2020, IDENTITY };
-
-static const YUVMatrix yuvMatrix[IDENTITY + 1] = {
- // clang-format off
-// From Rec601:
-// [R] [1.1643835616438356, 0.0, 1.5960267857142858 ] [Y - 16]
-// [G] = [1.1643835616438358, -0.3917622900949137, -0.8129676472377708 ] x [U - 128]
-// [B] [1.1643835616438356, 2.017232142857143, 8.862867620416422e-17] [V - 128]
- {1.5960267857142858, -0.3917622900949137, -0.8129676472377708, 2.017232142857143},
-
-// From Rec709:
-// [R] [1.1643835616438356, 0.0, 1.7927410714285714] [Y - 16]
-// [G] = [1.1643835616438358, -0.21324861427372963, -0.532909328559444 ] x [U - 128]
-// [B] [1.1643835616438356, 2.1124017857142854, 0.0 ] [V - 128]
- {1.7927410714285714, -0.21324861427372963, -0.532909328559444, 2.1124017857142854},
-
-// From Rec2020:
-// [R] [1.16438356164384, 0.0, 1.678674107142860 ] [Y - 16]
-// [G] = [1.16438356164384, -0.187326104219343, -0.650424318505057 ] x [U - 128]
-// [B] [1.16438356164384, 2.14177232142857, 0.0 ] [V - 128]
- {1.678674107142860, -0.187326104219343, -0.650424318505057, 2.14177232142857},
-
-// Identity
-// [R] [V]
-// [G] = [Y]
-// [B] [U]
- {},
- // clang-format on
-};
-
-// Helper function for textureLinearRowR8 that samples horizontal taps and
-// combines them with the next row based on the Y fraction.
-template <typename S>
-static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
- int32_t offsety,
- int32_t stridey,
- int16_t fracy) {
- uint8_t* buf = (uint8_t*)sampler->buf + offsety;
- auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
- auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
- auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
- auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
- auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);
- buf += stridey;
- auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
- auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
- auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
- auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
- auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);
- abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
- return abcd0;
-}
-
-// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes
-// constant Y and returns a duplicate of the result interleaved with itself
-// to aid in later YUV transformation.
-template <typename S>
-static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety,
- int32_t stridey, int16_t fracy) {
- assert(sampler->format == TextureFormat::R8);
-
- // Calculate X fraction and clamp X offset into range.
- I32 fracx = ix;
- ix >>= 7;
- fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
- ix = clampCoord(ix, sampler->width - 1);
-
- // Load the sample taps and combine rows.
- auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);
-
-  // Unzip the result and do final horizontal multiply-add based on X fraction.
- auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6);
- auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7);
- abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7;
-
- // The final result is the packed values interleaved with a duplicate of
- // themselves.
- return abcdl;
-}
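// The incoming ix coordinates carry a 7-bit fraction: ix >> 7 is the integer
// texel index and ix & 0x7F the fraction, so each lerp above has the form
// a + (((b - a) * frac) >> 7) with frac in 0..127, keeping every intermediate
// value within 16 bits (illustrative summary of the arithmetic above).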
-
-// Optimized version of textureLinearPackedR8 for paired U/V R8 textures.
-// Since the two textures have the same dimensions and stride, the addressing
-// math can be shared between both samplers. This also allows a coalesced
-// multiply in the final stage by packing both U/V results into a single
-// operation.
-template <typename S>
-static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
- I32 ix, int32_t offsety,
- int32_t stridey,
- int16_t fracy) {
- assert(sampler->format == TextureFormat::R8 &&
- sampler2->format == TextureFormat::R8);
- assert(sampler->width == sampler2->width &&
- sampler->height == sampler2->height);
- assert(sampler->stride == sampler2->stride);
-
- // Calculate X fraction and clamp X offset into range.
- I32 fracx = ix;
- ix >>= 7;
- fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
- ix = clampCoord(ix, sampler->width - 1);
-
- // Load the sample taps for the first sampler and combine rows.
- auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);
-
- // Load the sample taps for the second sampler and combine rows.
- auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy);
-
- // We are left with a result vector for each sampler with values for adjacent
- // pixels interleaved together in each. We need to unzip these values so that
- // we can do the final horizontal multiply-add based on the X fraction.
- auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14);
- auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15);
- abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7;
-
- // The final result is the packed values for the first sampler interleaved
- // with the packed values for the second sampler.
- return abcdxyzwl;
-}
-
-// Casting to int loses some precision while stepping, which can offset the
-// image, so shift the values by some extra bits of precision to minimize
-// this. We support up to 16 bits of image size, 7 bits of quantization,
-// and 1 bit for sign, which leaves 8 bits for extra precision.
-const int STEP_BITS = 8;
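// Bit budget (illustrative): 16 bits of image size + 7 bits of quantization
// + 1 sign bit + STEP_BITS (8) of extra precision = 32 bits, exactly filling
// the int32 step values used below.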
-
-// Optimized version of textureLinearPackedR8 for Y R8 texture with
-// half-resolution paired U/V R8 textures. This allows us to more efficiently
-// pack YUV samples into vectors to substantially reduce math operations even
-// further.
-template <bool BLEND>
-static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow,
- I32 yU, int32_t yDU, int32_t yStrideV,
- int16_t yFracV, uint8_t* cRow1,
- uint8_t* cRow2, I32 cU, int32_t cDU,
- int32_t cStrideV, int16_t cFracV,
- const YUVMatrix& colorSpace) {
- // As much as possible try to utilize the fact that we're only using half
- // the UV samples to combine Y and UV samples into single vectors. Here we
- // need to initialize several useful vector quantities for stepping fractional
- // offsets. For the UV samples, we take the average of the first+second and
- // third+fourth samples in a chunk which conceptually correspond to offsets
- // 0.5 and 1.5 (in 0..2 range). This allows us to reconstruct intermediate
- // samples 0.25, 0.75, 1.25, and 1.75 later. X fraction is shifted over into
- // the top 7 bits of an unsigned short so that we can mask off the exact
- // fractional bits we need to blend merely by right shifting them into
- // position.
- cU = (cU.xzxz + cU.ywyw) >> 1;
- auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>)
- << (16 - (STEP_BITS + 7));
- auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7));
- auto ycFracV = combine(I16(yFracV), I16(cFracV));
- I32 yI = yU >> (STEP_BITS + 7);
- I32 cI = cU >> (STEP_BITS + 7);
- // Load initial combined YUV samples for each row and blend them.
- auto ycSrc0 =
- CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]),
- combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]),
- unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))),
- V8<int16_t>);
- auto ycSrc1 = CONVERT(
- combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]),
- combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]),
- unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))),
- V8<int16_t>);
- auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7);
-
- // Here we shift in results from the next sample while caching results from
- // the previous sample. This allows us to reduce the multiplications in the
- // inner loop down to only two since we just need to blend the new samples
- // horizontally and then vertically once each.
- for (uint32_t* end = dest + span; dest < end; dest += 4) {
- yU += yDU;
- I32 yIn = yU >> (STEP_BITS + 7);
- cU += cDU;
- I32 cIn = cU >> (STEP_BITS + 7);
- // Load combined YUV samples for the next chunk on each row and blend them.
- auto ycSrc0n =
- CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]),
- combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]),
- unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))),
- V8<int16_t>);
- auto ycSrc1n = CONVERT(
- combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]),
- combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]),
- unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))),
- V8<int16_t>);
- auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7);
-
- // The source samples for the chunk may not match the actual tap offsets.
- // Since we're upscaling, we know the tap offsets fall within all the
- // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar, we
- // instead do laborious shuffling here for the Y samples and then the UV
- // samples.
- auto yshuf = lowHalf(ycSrc);
- auto yshufn =
- SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn),
- 1, 2, 3, 4);
- if (yI.y == yI.x) {
- yshuf = yshuf.xxyz;
- yshufn = yshufn.xxyz;
- }
- if (yI.z == yI.y) {
- yshuf = yshuf.xyyz;
- yshufn = yshufn.xyyz;
- }
- if (yI.w == yI.z) {
- yshuf = yshuf.xyzz;
- yshufn = yshufn.xyzz;
- }
-
- auto cshuf = highHalf(ycSrc);
- auto cshufn =
- SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn),
- 1, 4, 3, 6);
- if (cI.y == cI.x) {
- cshuf = cshuf.xxzz;
- cshufn = cshufn.xxzz;
- }
-
- // After shuffling, combine the Y and UV samples back into a single vector
- // for blending. Shift X fraction into position as unsigned to mask off top
- // bits and get rid of low bits to avoid multiplication overflow.
- auto yuvPx = combine(yshuf, cshuf);
- yuvPx += ((combine(yshufn, cshufn) - yuvPx) *
- bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >>
- 7;
-
- // Cache the new samples as the current samples on the next iteration.
- ycSrc = ycSrcn;
- ycFracX += ycFracDX;
- yI = yIn;
- cI = cIn;
-
- // De-interleave the Y and UV results. We need to average the UV results
- // to produce values for intermediate samples. Taps for UV were collected at
- // offsets 0.5 and 1.5, such that if we take a quarter of the difference
- // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples,
- // we can estimate samples 0.25, 0.75, 1.25, and 1.75.
- auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3);
- auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) +
- ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) -
- SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >>
- 2);
-
- commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
- }
-}
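The chroma reconstruction at the end of the loop is easier to follow in scalar form: taps c0 and c1 were gathered at offsets 0.5 and 1.5, and each intermediate sample is the nearer tap nudged by a quarter of the tap difference. A scalar illustration of the packed arithmetic above (not part of the source):

    // c0 sampled at offset 0.5, c1 at offset 1.5 (in the 0..2 chunk range).
    static void estimateChromaQuarters(int16_t c0, int16_t c1, int16_t out[4]) {
      out[0] = c0 + ((c0 - c1) >> 2);  // ~offset 0.25
      out[1] = c0 + ((c1 - c0) >> 2);  // ~offset 0.75
      out[2] = c1 + ((c0 - c1) >> 2);  // ~offset 1.25
      out[3] = c1 + ((c1 - c0) >> 2);  // ~offset 1.75
    }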
-
-// This is the inner loop driver of CompositeYUV that processes an axis-aligned
-// YUV span, dispatching based on appropriate format and scaling. This is also
-// reused by blendYUV to accelerate some cases of texture sampling in the
-// shader.
-template <bool BLEND = false>
-static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY,
- const vec2_scalar& srcUV, float srcDU,
- sampler2DRect samplerU, sampler2DRect samplerV,
- const vec2_scalar& chromaUV, float chromaDU,
- int colorDepth, const YUVMatrix& colorSpace) {
- // Calculate varying and constant interp data for Y plane.
- I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
- int32_t yV = int32_t(srcUV.y);
-
- // Calculate varying and constant interp data for chroma planes.
- I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS));
- int32_t cV = int32_t(chromaUV.y);
-
- // We need to skip 4 pixels per chunk.
- int32_t yDU = int32_t((4 << STEP_BITS) * srcDU);
- int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU);
-
- if (samplerY->width < 2 || samplerU->width < 2) {
- // If the source row has fewer than 2 pixels, it's not safe to use a linear
- // filter because it may overread the row. Just convert the single pixel
- // with nearest filtering and fill the row with it.
- I16 yuv = CONVERT(
- round_pixel((Float){texelFetch(samplerY, ivec2(srcUV)).x.x,
- texelFetch(samplerU, ivec2(chromaUV)).x.x,
- texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f}),
- I16);
- commit_solid_span<BLEND>(
- dest,
- unpack(colorSpace.convert(V8<int16_t>(yuv.x),
- zip(I16(yuv.y), I16(yuv.z)))),
- span);
- } else if (samplerY->format == TextureFormat::R16) {
- // Sample each YUV plane, rescale it to fit in low 8 bits of word, and
- // then transform them by the appropriate color space.
- assert(colorDepth > 8);
- // We need to right-shift each sample by the number of bits over 8 that it
- // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
- // of precision at the low end already, hence 1 is subtracted from the
- // color depth.
- int rescaleBits = (colorDepth - 1) - 8;
- for (; span >= 4; span -= 4) {
- auto yPx =
- textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
- rescaleBits;
- auto uPx =
- textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
- rescaleBits;
- auto vPx =
- textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
- rescaleBits;
- commit_blend_span<BLEND>(
- dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)));
- dest += 4;
- yU += yDU;
- cU += cDU;
- }
- if (span > 0) {
- // Handle any remaining pixels...
- auto yPx =
- textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
- rescaleBits;
- auto uPx =
- textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
- rescaleBits;
- auto vPx =
- textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
- rescaleBits;
- commit_blend_span<BLEND>(
- dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span);
- }
- } else {
- assert(samplerY->format == TextureFormat::R8);
- assert(colorDepth == 8);
-
- // Calculate varying and constant interp data for Y plane.
- int16_t yFracV = yV & 0x7F;
- yV >>= 7;
- int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride;
- int32_t yStrideV =
- yV >= 0 && yV < int32_t(samplerY->height) - 1 ? samplerY->stride : 0;
-
- // Calculate varying and constant interp data for chroma planes.
- int16_t cFracV = cV & 0x7F;
- cV >>= 7;
- int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride;
- int32_t cStrideV =
- cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0;
-
- // If we're sampling the UV planes at half the resolution of the Y plane,
- // then try to use the half-resolution fast-path.
- if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) &&
- cDU <= (2 << (STEP_BITS + 7))) {
- // Ensure that samples don't fall outside of the valid bounds of each
- // planar texture. Step until the initial X coordinates are positive.
- for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) {
- auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
- yStrideV, yFracV);
- auto uvPx = textureLinearRowPairedR8(
- samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV);
- commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
- dest += 4;
- yU += yDU;
- cU += cDU;
- }
- // Calculate the number of aligned chunks that we can step inside the
- // bounds of each planar texture without overreading.
- int inside = min(
- min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU,
- (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) *
- 4,
- span & ~3);
- if (inside > 0) {
- uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV;
- uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV;
- uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV;
- upscaleYUV42R8<BLEND>(dest, inside, yRow, yU, yDU, yStrideV, yFracV,
- cRow1, cRow2, cU, cDU, cStrideV, cFracV,
- colorSpace);
- span -= inside;
- dest += inside;
- yU += (inside / 4) * yDU;
- cU += (inside / 4) * cDU;
- }
- // If there are any remaining chunks that weren't inside, handle them
- // below.
- }
- for (; span >= 4; span -= 4) {
- // Sample each YUV plane and then transform them by the appropriate
- // color space.
- auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
- yStrideV, yFracV);
- auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
- cOffsetV, cStrideV, cFracV);
- commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
- dest += 4;
- yU += yDU;
- cU += cDU;
- }
- if (span > 0) {
- // Handle any remaining pixels...
- auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
- yStrideV, yFracV);
- auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
- cOffsetV, cStrideV, cFracV);
- commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx), span);
- }
- }
-}
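As a worked instance of the R16 rescale above (illustrative): for 10-bit content, rescaleBits = (10 - 1) - 8 = 1, since textureLinearUnpackedR16 has already dropped one low bit, leaving 9 significant bits that a single right shift brings into the 8-bit range the YUV matrix expects.

    // Sanity check of the rescale arithmetic for 10-bit video.
    static_assert((10 - 1) - 8 == 1, "10-bit YUV rescales by a single bit");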
-
-static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex,
- YUVColorSpace colorSpace, int colorDepth,
- const IntRect& srcReq, Texture& dsttex,
- const IntRect& dstReq, bool invertY,
- const IntRect& clipRect) {
- // Compute valid dest bounds
- IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
- dstBounds.intersect(clipRect);
- // Check if sampling bounds are empty
- if (dstBounds.is_empty()) {
- return;
- }
- // Initialize samplers for source textures
- sampler2DRect_impl sampler[3];
- init_sampler(&sampler[0], ytex);
- init_sampler(&sampler[1], utex);
- init_sampler(&sampler[2], vtex);
-
- // Compute source UVs
- vec2_scalar srcUV(srcReq.x0, srcReq.y0);
- vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
- float(srcReq.height()) / dstReq.height());
- // Inverted Y must step downward along source rows
- if (invertY) {
- srcUV.y += srcReq.height();
- srcDUV.y = -srcDUV.y;
- }
- // Skip to clamped source start
- srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
- // Calculate separate chroma UVs for chroma planes with different scale
- vec2_scalar chromaScale(float(utex.width) / ytex.width,
- float(utex.height) / ytex.height);
- vec2_scalar chromaUV = srcUV * chromaScale;
- vec2_scalar chromaDUV = srcDUV * chromaScale;
- // Scale UVs by lerp precision. If the row has only 1 pixel, then don't
- // quantize so that we can use nearest filtering instead to avoid overreads.
- if (ytex.width >= 2 && utex.width >= 2) {
- srcUV = linearQuantize(srcUV, 128);
- srcDUV *= 128.0f;
- chromaUV = linearQuantize(chromaUV, 128);
- chromaDUV *= 128.0f;
- }
- // Calculate dest pointer from clamped offsets
- int destStride = dsttex.stride();
- char* dest = dsttex.sample_ptr(dstReq, dstBounds);
- int span = dstBounds.width();
- for (int rows = dstBounds.height(); rows > 0; rows--) {
- linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x,
- &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth,
- yuvMatrix[colorSpace]);
- dest += destStride;
- srcUV.y += srcDUV.y;
- chromaUV.y += chromaDUV.y;
- }
-}
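A quick worked instance of the chroma scaling here (illustrative values only):

    // ytex = 1920x1080, utex = vtex = 960x540 (4:2:0)
    //   chromaScale = (960/1920, 540/1080) = (0.5, 0.5)
    //   chromaUV    = srcUV * 0.5,  chromaDUV = srcDUV * 0.5
    // so cDU ends up at half of yDU, which satisfies the half-resolution
    // fast-path checks in linear_row_yuv.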
-
-extern "C" {
-
-// Extension for compositing a YUV surface represented by separate YUV planes
-// to a BGRA destination. The supplied color space is used to determine the
-// transform from YUV to BGRA after sampling.
-void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY,
- LockedTexture* lockedU, LockedTexture* lockedV,
- YUVColorSpace colorSpace, GLuint colorDepth, GLint srcX,
- GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
- GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
- GLboolean flip, GLint clipX, GLint clipY, GLsizei clipWidth,
- GLsizei clipHeight) {
- if (!lockedDst || !lockedY || !lockedU || !lockedV) {
- return;
- }
- if (colorSpace > IDENTITY) {
- assert(false);
- return;
- }
- Texture& ytex = *lockedY;
- Texture& utex = *lockedU;
- Texture& vtex = *lockedV;
- Texture& dsttex = *lockedDst;
- // All YUV planes must currently be represented by R8 or R16 textures.
- // The chroma (U/V) planes must have matching dimensions.
- assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp());
- assert((ytex.bpp() == 1 && colorDepth == 8) ||
- (ytex.bpp() == 2 && colorDepth > 8));
- // assert(ytex.width == utex.width && ytex.height == utex.height);
- assert(utex.width == vtex.width && utex.height == vtex.height);
- assert(ytex.offset == utex.offset && ytex.offset == vtex.offset);
- assert(dsttex.bpp() == 4);
-
- IntRect srcReq =
- IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset;
- IntRect dstReq =
- IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
- // Compute clip rect as relative to the dstReq, as that's the same coords
- // as used for the sampling bounds.
- IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
- clipY - dstY + clipHeight};
- // For now, always use a linear filter path that would be required for
- // scaling. Further fast-paths for non-scaled video might be desirable in the
- // future.
- linear_convert_yuv(ytex, utex, vtex, colorSpace, colorDepth, srcReq, dsttex,
- dstReq, flip, clipRect);
-}
-
-} // extern "C"
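For orientation, a hedged sketch of how an embedder might drive this entry point for an 8-bit 4:2:0 frame; the LockedTexture handles and the REC_601 enumerator are assumed to come from SWGL's public API elsewhere, not from this file:

    // Composite a 640x360 frame 1:1 at (0, 0), clipped to the full size, no flip.
    void composite_frame(LockedTexture* dst, LockedTexture* y, LockedTexture* u,
                         LockedTexture* v) {
      CompositeYUV(dst, y, u, v, REC_601, /*colorDepth*/ 8,
                   /*src x,y,w,h*/ 0, 0, 640, 360,
                   /*dst x,y,w,h*/ 0, 0, 640, 360,
                   /*flip*/ GL_FALSE,
                   /*clip x,y,w,h*/ 0, 0, 640, 360);
    }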
diff --git a/third_party/webrender/swgl/src/gl.cc b/third_party/webrender/swgl/src/gl.cc
index 6e214547421..f4a69752dde 100644
--- a/third_party/webrender/swgl/src/gl.cc
+++ b/third_party/webrender/swgl/src/gl.cc
@@ -22,65 +22,15 @@
# define debugf(...) printf(__VA_ARGS__)
#endif
-// #define PRINT_TIMINGS
-
#ifdef _WIN32
# define ALWAYS_INLINE __forceinline
-# define NO_INLINE __declspec(noinline)
-
-// Including Windows.h brings a huge amount of namespace pollution, so just
-// define a couple of things manually
-typedef int BOOL;
-# define WINAPI __stdcall
-# define DECLSPEC_IMPORT __declspec(dllimport)
-# define WINBASEAPI DECLSPEC_IMPORT
-typedef unsigned long DWORD;
-typedef long LONG;
-typedef __int64 LONGLONG;
-# define DUMMYSTRUCTNAME
-
-typedef union _LARGE_INTEGER {
- struct {
- DWORD LowPart;
- LONG HighPart;
- } DUMMYSTRUCTNAME;
- struct {
- DWORD LowPart;
- LONG HighPart;
- } u;
- LONGLONG QuadPart;
-} LARGE_INTEGER;
-extern "C" {
-WINBASEAPI BOOL WINAPI
-QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount);
-
-WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency);
-}
-
#else
-// GCC is slower when dealing with always_inline, especially in debug builds.
-// When using Clang, use always_inline more aggressively.
-# if defined(__clang__) || defined(NDEBUG)
-# define ALWAYS_INLINE __attribute__((always_inline)) inline
-# else
-# define ALWAYS_INLINE inline
-# endif
-# define NO_INLINE __attribute__((noinline))
-#endif
-
-// Some functions may cause excessive binary bloat if inlined in debug or with
-// GCC builds, so use PREFER_INLINE on these instead of ALWAYS_INLINE.
-#if defined(__clang__) && defined(NDEBUG)
-# define PREFER_INLINE ALWAYS_INLINE
-#else
-# define PREFER_INLINE inline
+# define ALWAYS_INLINE __attribute__((always_inline)) inline
#endif
#define UNREACHABLE __builtin_unreachable()
-#define UNUSED [[maybe_unused]]
-
-#define FALLTHROUGH [[fallthrough]]
+#define UNUSED __attribute__((unused))
#ifdef MOZILLA_CLIENT
# define IMPLICIT __attribute__((annotate("moz_implicit")))
@@ -91,32 +41,19 @@ WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency);
#include "gl_defs.h"
#include "glsl.h"
#include "program.h"
-#include "texture.h"
using namespace glsl;
-typedef ivec2_scalar IntPoint;
-
struct IntRect {
int x0;
int y0;
int x1;
int y1;
- IntRect() : x0(0), y0(0), x1(0), y1(0) {}
- IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {}
- IntRect(IntPoint origin, IntPoint size)
- : x0(origin.x),
- y0(origin.y),
- x1(origin.x + size.x),
- y1(origin.y + size.y) {}
-
int width() const { return x1 - x0; }
int height() const { return y1 - y0; }
bool is_empty() const { return width() <= 0 || height() <= 0; }
- IntPoint origin() const { return IntPoint(x0, y0); }
-
bool same_size(const IntRect& o) const {
return width() == o.width() && height() == o.height();
}
@@ -133,12 +70,6 @@ struct IntRect {
return *this;
}
- IntRect intersection(const IntRect& o) {
- IntRect result = *this;
- result.intersect(o);
- return result;
- }
-
// Scale from source-space to dest-space, optionally rounding inward
IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight,
bool roundIn = false) {
@@ -156,60 +87,15 @@ struct IntRect {
swap(y0, y1);
}
- IntRect& offset(const IntPoint& o) {
- x0 += o.x;
- y0 += o.y;
- x1 += o.x;
- y1 += o.y;
+ IntRect& offset(int dx, int dy) {
+ x0 += dx;
+ y0 += dy;
+ x1 += dx;
+ y1 += dy;
return *this;
}
-
- IntRect operator+(const IntPoint& o) const {
- return IntRect(*this).offset(o);
- }
- IntRect operator-(const IntPoint& o) const {
- return IntRect(*this).offset(-o);
- }
};
-typedef vec2_scalar Point2D;
-typedef vec4_scalar Point3D;
-
-struct IntRange {
- int start;
- int end;
-
- int len() const { return end - start; }
-
- IntRange intersect(IntRange r) const {
- return {max(start, r.start), min(end, r.end)};
- }
-};
-
-struct FloatRange {
- float start;
- float end;
-
- float clip(float x) const { return clamp(x, start, end); }
-
- FloatRange clip(FloatRange r) const { return {clip(r.start), clip(r.end)}; }
-
- FloatRange merge(FloatRange r) const {
- return {min(start, r.start), max(end, r.end)};
- }
-
- IntRange round() const {
- return {int(floor(start + 0.5f)), int(floor(end + 0.5f))};
- }
-
- IntRange round_out() const { return {int(floor(start)), int(ceil(end))}; }
-};
-
-template <typename P>
-static inline FloatRange x_range(P p0, P p1) {
- return {min(p0.x, p1.x), max(p0.x, p1.x)};
-}
-
struct VertexAttrib {
size_t size = 0; // in bytes
GLenum type = 0;
@@ -237,18 +123,12 @@ static int bytes_for_internal_format(GLenum internal_format) {
case GL_R8:
case GL_RED:
return 1;
- case GL_RG8:
- case GL_RG:
- return 2;
case GL_DEPTH_COMPONENT:
case GL_DEPTH_COMPONENT16:
+ return 2;
case GL_DEPTH_COMPONENT24:
case GL_DEPTH_COMPONENT32:
return 4;
- case GL_RGB_RAW_422_APPLE:
- return 2;
- case GL_R16:
- return 2;
default:
debugf("internal format: %x\n", internal_format);
assert(0);
@@ -268,12 +148,6 @@ static TextureFormat gl_format_to_texture_format(int type) {
return TextureFormat::RGBA8;
case GL_R8:
return TextureFormat::R8;
- case GL_RG8:
- return TextureFormat::RG8;
- case GL_R16:
- return TextureFormat::R16;
- case GL_RGB_RAW_422_APPLE:
- return TextureFormat::YUV422;
default:
assert(0);
return TextureFormat::RGBA8;
@@ -287,34 +161,19 @@ struct Query {
struct Buffer {
char* buf = nullptr;
size_t size = 0;
- size_t capacity = 0;
bool allocate(size_t new_size) {
- // If the size remains unchanged, don't allocate anything.
- if (new_size == size) {
- return false;
- }
- // If the new size is within the existing capacity of the buffer, just
- // reuse the existing buffer.
- if (new_size <= capacity) {
- size = new_size;
- return true;
- }
- // Otherwise we need to reallocate the buffer to hold up to the requested
- // larger size.
- char* new_buf = (char*)realloc(buf, new_size);
- assert(new_buf);
- if (!new_buf) {
- // If we fail, null out the buffer rather than leave around the old
- // allocation state.
+ if (new_size != size) {
+ char* new_buf = (char*)realloc(buf, new_size);
+ assert(new_buf);
+ if (new_buf) {
+ buf = new_buf;
+ size = new_size;
+ return true;
+ }
cleanup();
- return false;
}
- // The reallocation succeeded, so install the buffer.
- buf = new_buf;
- size = new_size;
- capacity = new_size;
- return true;
+ return false;
}
void cleanup() {
@@ -322,7 +181,6 @@ struct Buffer {
free(buf);
buf = nullptr;
size = 0;
- capacity = 0;
}
}
@@ -331,6 +189,7 @@ struct Buffer {
struct Framebuffer {
GLuint color_attachment = 0;
+ GLint layer = 0;
GLuint depth_attachment = 0;
};
@@ -364,32 +223,17 @@ struct Texture {
GLenum internal_format = 0;
int width = 0;
int height = 0;
+ int depth = 0;
char* buf = nullptr;
size_t buf_size = 0;
- uint32_t buf_stride = 0;
- uint8_t buf_bpp = 0;
GLenum min_filter = GL_NEAREST;
GLenum mag_filter = GL_LINEAR;
- // The number of active locks on this texture. If this texture has any active
- // locks, we need to disallow modifying or destroying the texture as it may
- // be accessed by other threads where modifications could lead to races.
- int32_t locked = 0;
- // When used as an attachment of a framebuffer, rendering to the texture
- // behaves as if it is located at the given offset such that the offset is
- // subtracted from all transformed vertexes after the viewport is applied.
- IntPoint offset;
enum FLAGS {
- // If the buffer is internally-allocated by SWGL
SHOULD_FREE = 1 << 1,
- // If the buffer has been cleared to initialize it. Currently this is only
- // utilized by depth buffers which need to know when depth runs have reset
- // to a valid row state. When unset, the depth runs may contain garbage.
- CLEARED = 1 << 2,
};
int flags = SHOULD_FREE;
bool should_free() const { return bool(flags & SHOULD_FREE); }
- bool cleared() const { return bool(flags & CLEARED); }
void set_flag(int flag, bool val) {
if (val) {
@@ -398,14 +242,7 @@ struct Texture {
flags &= ~flag;
}
}
- void set_should_free(bool val) {
- // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we
- // might mistakenly realloc an externally allocated buffer as if it were an
- // internally allocated one.
- assert(!buf);
- set_flag(SHOULD_FREE, val);
- }
- void set_cleared(bool val) { set_flag(CLEARED, val); }
+ void set_should_free(bool val) { set_flag(SHOULD_FREE, val); }
// Delayed-clearing state. When a clear of an FB is requested, we don't
// immediately clear each row, as the rows may be subsequently overwritten
@@ -418,9 +255,6 @@ struct Texture {
uint32_t clear_val = 0;
uint32_t* cleared_rows = nullptr;
- void init_depth_runs(uint32_t z);
- void fill_depth_runs(uint32_t z, const IntRect& scissor);
-
void enable_delayed_clear(uint32_t val) {
delay_clear = height;
clear_val = val;
@@ -441,88 +275,40 @@ struct Texture {
}
}
- int bpp() const { return buf_bpp; }
- void set_bpp() { buf_bpp = bytes_for_internal_format(internal_format); }
+ int bpp() const { return bytes_for_internal_format(internal_format); }
- size_t stride() const { return buf_stride; }
- void set_stride() { buf_stride = aligned_stride(buf_bpp * width); }
-
- // Set an external backing buffer of this texture.
- void set_buffer(void* new_buf, size_t new_stride) {
- assert(!should_free());
- // Ensure that the supplied stride is at least as big as the row data and
- // is aligned to the smaller of either the BPP or word-size. We need to at
- // least be able to sample data from within a row and sample whole pixels
- // of smaller formats without risking unaligned access.
- set_bpp();
- set_stride();
- assert(new_stride >= size_t(bpp() * width) &&
- new_stride % min(bpp(), sizeof(uint32_t)) == 0);
+ size_t stride(int b = 0, int min_width = 0) const {
+ return aligned_stride((b ? b : bpp()) * max(width, min_width));
+ }
- buf = (char*)new_buf;
- buf_size = 0;
- buf_stride = new_stride;
+ size_t layer_stride(int b = 0, int min_width = 0, int min_height = 0) const {
+ return stride(b ? b : bpp(), min_width) * max(height, min_height);
}
bool allocate(bool force = false, int min_width = 0, int min_height = 0) {
- assert(!locked); // Locked textures shouldn't be reallocated
- // If we get here, some GL API call that invalidates the texture was used.
- // Mark the buffer as not-cleared to signal this.
- set_cleared(false);
- // Check if there is either no buffer currently or if we forced validation
- // of the buffer size because some dimension might have changed.
if ((!buf || force) && should_free()) {
- // Initialize the buffer's BPP and stride, since they may have changed.
- set_bpp();
- set_stride();
- // Compute new size based on the maximum potential stride, rather than
- // the current stride, to hopefully avoid reallocations when size would
- // otherwise change too much...
- size_t max_stride = max(buf_stride, aligned_stride(buf_bpp * min_width));
- size_t size = max_stride * max(height, min_height);
- if ((!buf && size > 0) || size > buf_size) {
+ size_t size = layer_stride(bpp(), min_width, min_height) * max(depth, 1);
+ if (!buf || size > buf_size) {
// Allocate with a SIMD register-sized tail of padding at the end so we
// can safely read or write past the end of the texture with SIMD ops.
- // Currently only the flat Z-buffer texture needs this padding due to
- // full-register loads and stores in check_depth and discard_depth. In
- // case some code in the future accidentally uses a linear filter on a
- // texture with less than 2 pixels per row, we also add this padding
- // just to be safe. All other texture types and use-cases should be
- // safe to omit padding.
- size_t padding =
- internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2
- ? sizeof(Float)
- : 0;
- char* new_buf = (char*)realloc(buf, size + padding);
+ char* new_buf = (char*)realloc(buf, size + sizeof(Float));
assert(new_buf);
if (new_buf) {
- // Successfully reallocated the buffer, so go ahead and set it.
buf = new_buf;
buf_size = size;
return true;
}
- // Allocation failed, so ensure we don't leave stale buffer state.
cleanup();
}
}
- // Nothing changed...
return false;
}
void cleanup() {
- assert(!locked); // Locked textures shouldn't be destroyed
- if (buf) {
- // If we need to toggle SHOULD_FREE state, ensure that buf is nulled out,
- // regardless of whether we internally allocated it. This will prevent us
- // from wrongly treating buf as internally allocated when we go to realloc,
- // if it was actually externally allocated.
- if (should_free()) {
- free(buf);
- }
+ if (buf && should_free()) {
+ free(buf);
buf = nullptr;
buf_size = 0;
- buf_bpp = 0;
- buf_stride = 0;
}
disable_delayed_clear();
}
@@ -530,41 +316,44 @@ struct Texture {
~Texture() { cleanup(); }
IntRect bounds() const { return IntRect{0, 0, width, height}; }
- IntRect offset_bounds() const { return bounds() + offset; }
// Find the valid sampling bounds relative to the requested region
IntRect sample_bounds(const IntRect& req, bool invertY = false) const {
- IntRect bb = bounds().intersect(req) - req.origin();
+ IntRect bb = bounds().intersect(req).offset(-req.x0, -req.y0);
if (invertY) bb.invert_y(req.height());
return bb;
}
// Get a pointer for sampling at the given offset
- char* sample_ptr(int x, int y) const {
- return buf + y * stride() + x * bpp();
+ char* sample_ptr(int x, int y, int z, int bpp, size_t stride) const {
+ return buf + (height * z + y) * stride + x * bpp;
+ }
+
+ char* sample_ptr(int x, int y, int z, int bpp) const {
+ return sample_ptr(x, y, z, bpp, stride(bpp));
+ }
+
+ char* sample_ptr(int x, int y, int z) const {
+ return sample_ptr(x, y, z, bpp());
}
// Get a pointer for sampling the requested region and limit to the provided
// sampling bounds
- char* sample_ptr(const IntRect& req, const IntRect& bounds,
+ char* sample_ptr(const IntRect& req, const IntRect& bounds, int z,
bool invertY = false) const {
// Offset the sample pointer by the clamped bounds
int x = req.x0 + bounds.x0;
// Invert the Y offset if necessary
int y = invertY ? req.y1 - 1 - bounds.y0 : req.y0 + bounds.y0;
- return sample_ptr(x, y);
+ return sample_ptr(x, y, z);
}
};
-// The last vertex attribute is reserved as a null attribute in case a vertex
-// attribute is used without being set.
-#define MAX_ATTRIBS 17
-#define NULL_ATTRIB 16
+#define MAX_ATTRIBS 16
+#define NULL_ATTRIB 15
struct VertexArray {
VertexAttrib attribs[MAX_ATTRIBS];
int max_attrib = -1;
- // The GL spec defines element array buffer binding to be part of VAO state.
- GLuint element_array_buffer_binding = 0;
void validate();
};
@@ -580,67 +369,33 @@ struct Program {
FragmentShaderImpl* frag_impl = nullptr;
bool deleted = false;
- ~Program() { delete impl; }
+ ~Program() {
+ delete impl;
+ }
};
-// clang-format off
-// Fully-expand GL defines while ignoring more than 4 suffixes
+// for GL defines to fully expand
#define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w
-// Generate a blend key enum symbol
-#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0)
-#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0)
-#define AA_BLEND_KEY(...) CONCAT_KEY(AA_BLEND_, __VA_ARGS__, 0, 0, 0)
-#define AA_MASK_BLEND_KEY(...) CONCAT_KEY(AA_MASK_BLEND_, __VA_ARGS__, 0, 0, 0)
-
-// Utility macro to easily generate similar code for all implemented blend modes
+#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0)
#define FOR_EACH_BLEND_KEY(macro) \
- macro(GL_ONE, GL_ZERO, 0, 0) \
- macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \
- macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \
- macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \
- macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \
- macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \
- macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \
- macro(GL_ONE, GL_ONE, 0, 0) \
- macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \
- macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \
- macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \
- macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \
- macro(GL_MIN, 0, 0, 0) \
- macro(GL_MAX, 0, 0, 0) \
- macro(GL_MULTIPLY_KHR, 0, 0, 0) \
- macro(GL_SCREEN_KHR, 0, 0, 0) \
- macro(GL_OVERLAY_KHR, 0, 0, 0) \
- macro(GL_DARKEN_KHR, 0, 0, 0) \
- macro(GL_LIGHTEN_KHR, 0, 0, 0) \
- macro(GL_COLORDODGE_KHR, 0, 0, 0) \
- macro(GL_COLORBURN_KHR, 0, 0, 0) \
- macro(GL_HARDLIGHT_KHR, 0, 0, 0) \
- macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \
- macro(GL_DIFFERENCE_KHR, 0, 0, 0) \
- macro(GL_EXCLUSION_KHR, 0, 0, 0) \
- macro(GL_HSL_HUE_KHR, 0, 0, 0) \
- macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \
- macro(GL_HSL_COLOR_KHR, 0, 0, 0) \
- macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) \
- macro(SWGL_BLEND_DROP_SHADOW, 0, 0, 0) \
- macro(SWGL_BLEND_SUBPIXEL_TEXT, 0, 0, 0)
+ macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE) \
+ macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \
+ macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \
+ macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \
+ macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) macro( \
+ GL_ZERO, GL_SRC_COLOR, 0, 0) macro(GL_ONE, GL_ONE, 0, 0) \
+ macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \
+ macro(GL_ONE, GL_ZERO, 0, 0) macro( \
+ GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \
+ macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, \
+ 0, 0) \
+ macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0)
#define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__),
-#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__),
-#define DEFINE_AA_BLEND_KEY(...) AA_BLEND_KEY(__VA_ARGS__),
-#define DEFINE_AA_MASK_BLEND_KEY(...) AA_MASK_BLEND_KEY(__VA_ARGS__),
enum BlendKey : uint8_t {
+ BLEND_KEY_NONE = 0,
FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY)
- FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY)
- FOR_EACH_BLEND_KEY(DEFINE_AA_BLEND_KEY)
- FOR_EACH_BLEND_KEY(DEFINE_AA_MASK_BLEND_KEY)
- BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO),
- MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO),
- AA_BLEND_KEY_NONE = AA_BLEND_KEY(GL_ONE, GL_ZERO),
- AA_MASK_BLEND_KEY_NONE = AA_MASK_BLEND_KEY(GL_ONE, GL_ZERO),
};
-// clang-format on
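To make the token pasting concrete (a worked expansion, not new code): CONCAT_KEY pastes exactly four suffixes onto the prefix and discards the rest, so a two-argument key pads out with zeros:

    // BLEND_KEY(GL_ONE, GL_ZERO)
    //   -> CONCAT_KEY(BLEND_, GL_ONE, GL_ZERO, 0, 0, ...)
    //   -> BLEND_GL_ONEGL_ZERO00
    // which is exactly the enumerator name that
    // FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY) generates for that pair.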
const size_t MAX_TEXTURE_UNITS = 16;
@@ -704,10 +459,8 @@ struct ObjectStore {
O* find(size_t i) const { return i < size ? objects[i] : nullptr; }
- template <typename T>
- void on_erase(T*, ...) {}
- template <typename T>
- void on_erase(T* o, decltype(&T::on_erase)) {
+ template <typename T> void on_erase(T*, ...) {}
+ template <typename T> void on_erase(T* o, decltype(&T::on_erase)) {
o->on_erase();
}
@@ -727,8 +480,6 @@ struct ObjectStore {
};
struct Context {
- int32_t references = 1;
-
ObjectStore<Query> queries;
ObjectStore<Buffer> buffers;
ObjectStore<Texture> textures;
@@ -756,7 +507,7 @@ struct Context {
bool scissortest = false;
IntRect scissor = {0, 0, 0, 0};
- GLfloat clearcolor[4] = {0, 0, 0, 0};
+ uint32_t clearcolor = 0;
GLdouble cleardepth = 1;
int unpack_row_length = 0;
@@ -766,10 +517,14 @@ struct Context {
struct TextureUnit {
GLuint texture_2d_binding = 0;
+ GLuint texture_3d_binding = 0;
+ GLuint texture_2d_array_binding = 0;
GLuint texture_rectangle_binding = 0;
void unlink(GLuint n) {
::unlink(texture_2d_binding, n);
+ ::unlink(texture_3d_binding, n);
+ ::unlink(texture_2d_array_binding, n);
::unlink(texture_rectangle_binding, n);
}
};
@@ -784,6 +539,7 @@ struct Context {
GLuint pixel_pack_buffer_binding = 0;
GLuint pixel_unpack_buffer_binding = 0;
GLuint array_buffer_binding = 0;
+ GLuint element_array_buffer_binding = 0;
GLuint time_elapsed_query = 0;
GLuint samples_passed_query = 0;
GLuint renderbuffer_binding = 0;
@@ -800,9 +556,13 @@ struct Context {
case GL_ARRAY_BUFFER:
return array_buffer_binding;
case GL_ELEMENT_ARRAY_BUFFER:
- return vertex_arrays[current_vertex_array].element_array_buffer_binding;
+ return element_array_buffer_binding;
case GL_TEXTURE_2D:
return texture_units[active_texture_unit].texture_2d_binding;
+ case GL_TEXTURE_2D_ARRAY:
+ return texture_units[active_texture_unit].texture_2d_array_binding;
+ case GL_TEXTURE_3D:
+ return texture_units[active_texture_unit].texture_3d_binding;
case GL_TEXTURE_RECTANGLE:
return texture_units[active_texture_unit].texture_rectangle_binding;
case GL_TIME_ELAPSED:
@@ -830,17 +590,16 @@ struct Context {
return textures[texture_units[unit].texture_2d_binding];
}
- Texture& get_texture(sampler2DRect, int unit) {
- return textures[texture_units[unit].texture_rectangle_binding];
+ Texture& get_texture(sampler2DArray, int unit) {
+ return textures[texture_units[unit].texture_2d_array_binding];
}
- IntRect apply_scissor(IntRect bb,
- const IntPoint& origin = IntPoint(0, 0)) const {
- return scissortest ? bb.intersect(scissor - origin) : bb;
+ Texture& get_texture(sampler2DRect, int unit) {
+ return textures[texture_units[unit].texture_rectangle_binding];
}
- IntRect apply_scissor(const Texture& t) const {
- return apply_scissor(t.bounds(), t.offset);
+ IntRect apply_scissor(IntRect bb) const {
+ return scissortest ? bb.intersect(scissor) : bb;
}
};
static Context* ctx = nullptr;
@@ -851,12 +610,14 @@ static BlendKey blend_key = BLEND_KEY_NONE;
static void prepare_texture(Texture& t, const IntRect* skip = nullptr);
template <typename S>
+static inline void init_depth(S* s, Texture& t) {
+ s->depth = max(t.depth, 1);
+ s->height_stride = s->stride * t.height;
+}
+
+template <typename S>
static inline void init_filter(S* s, Texture& t) {
- // If the width is not at least 2 pixels, then we can't safely sample the end
- // of the row with a linear filter. In that case, just punt to using nearest
- // filtering instead.
- s->filter = t.width >= 2 ? gl_filter_to_texture_filter(t.mag_filter)
- : TextureFilter::NEAREST;
+ s->filter = gl_filter_to_texture_filter(t.mag_filter);
}
template <typename S>
@@ -864,44 +625,20 @@ static inline void init_sampler(S* s, Texture& t) {
prepare_texture(t);
s->width = t.width;
s->height = t.height;
- s->stride = t.stride();
int bpp = t.bpp();
- if (bpp >= 4)
- s->stride /= 4;
- else if (bpp == 2)
- s->stride /= 2;
- else
- assert(bpp == 1);
- // Use uint32_t* for easier sampling, but need to cast to uint8_t* or
- // uint16_t* for formats with bpp < 4.
+ s->stride = t.stride(bpp);
+ if (bpp >= 4) s->stride /= 4;
+ // Use uint32_t* for easier sampling, but need to cast to uint8_t* for formats
+ // with bpp < 4.
s->buf = (uint32_t*)t.buf;
s->format = gl_format_to_texture_format(t.internal_format);
}
template <typename S>
-static inline void null_sampler(S* s) {
- // For null texture data, just make the sampler provide a 1x1 buffer that is
- // transparent black. Ensure buffer holds at least a SIMD vector of zero data
- // for SIMD padding of unaligned loads.
- static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0};
- s->width = 1;
- s->height = 1;
- s->stride = s->width;
- s->buf = (uint32_t*)zeroBuf;
- s->format = TextureFormat::RGBA8;
-}
-
-template <typename S>
-static inline void null_filter(S* s) {
- s->filter = TextureFilter::NEAREST;
-}
-
-template <typename S>
S* lookup_sampler(S* s, int texture) {
Texture& t = ctx->get_texture(s, texture);
if (!t.buf) {
- null_sampler(s);
- null_filter(s);
+ *s = S();
} else {
init_sampler(s, t);
init_filter(s, t);
@@ -913,13 +650,26 @@ template <typename S>
S* lookup_isampler(S* s, int texture) {
Texture& t = ctx->get_texture(s, texture);
if (!t.buf) {
- null_sampler(s);
+ *s = S();
} else {
init_sampler(s, t);
}
return s;
}
+template <typename S>
+S* lookup_sampler_array(S* s, int texture) {
+ Texture& t = ctx->get_texture(s, texture);
+ if (!t.buf) {
+ *s = S();
+ } else {
+ init_sampler(s, t);
+ init_depth(s, t);
+ init_filter(s, t);
+ }
+ return s;
+}
+
int bytes_per_type(GLenum type) {
switch (type) {
case GL_INT:
@@ -983,40 +733,21 @@ void load_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance,
attrib = T(load_attrib_scalar<scalar_type>(va, src));
} else {
// Specialized for WR's primitive vertex order/winding.
+ // Triangles must be indexed at offsets 0, 1, 2.
+ // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
+ // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2.
+ // Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so that the
+ // points form a convex path that can be traversed by the rasterizer.
if (!count) return;
- assert(count >= 2 && count <= 4);
+ assert(count == 3 || count == 4);
char* src = (char*)va.buf + va.stride * start + va.offset;
- switch (count) {
- case 2: {
- // Lines must be indexed at offsets 0, 1.
- // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0.
- scalar_type lanes[2] = {
- load_attrib_scalar<scalar_type>(va, src),
- load_attrib_scalar<scalar_type>(va, src + va.stride)};
- attrib = (T){lanes[0], lanes[1], lanes[1], lanes[0]};
- break;
- }
- case 3: {
- // Triangles must be indexed at offsets 0, 1, 2.
- // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2.
- scalar_type lanes[3] = {
- load_attrib_scalar<scalar_type>(va, src),
- load_attrib_scalar<scalar_type>(va, src + va.stride),
- load_attrib_scalar<scalar_type>(va, src + va.stride * 2)};
- attrib = (T){lanes[0], lanes[1], lanes[2], lanes[2]};
- break;
- }
- default:
- // Quads must be successive triangles indexed at offsets 0, 1, 2, 2,
- // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so
- // that the points form a convex path that can be traversed by the
- // rasterizer.
- attrib = (T){load_attrib_scalar<scalar_type>(va, src),
- load_attrib_scalar<scalar_type>(va, src + va.stride),
- load_attrib_scalar<scalar_type>(va, src + va.stride * 3),
- load_attrib_scalar<scalar_type>(va, src + va.stride * 2)};
- break;
- }
+ attrib = (T){
+ load_attrib_scalar<scalar_type>(va, src),
+ load_attrib_scalar<scalar_type>(va, src + va.stride),
+ load_attrib_scalar<scalar_type>(va, src + va.stride * 2 +
+ (count > 3 ? va.stride : 0)),
+ load_attrib_scalar<scalar_type>(va, src + va.stride * 2)
+ };
}
}
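A small illustration of the lane order the quad path above produces (corner labels are only to show the winding):

    // WR indexes a quad as two triangles 0,1,2 and 2,1,3, so v1 and v2 lie on
    // the shared diagonal while v0 and v3 are opposite corners. The flat load
    // above fills the SIMD lanes as
    //   lane 0: v0, lane 1: v1, lane 2: v3, lane 3: v2
    // i.e. a walk around the quad's perimeter (v0 -> v1 -> v3 -> v2) rather
    // than across the diagonal, giving the convex path the rasterizer expects.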
@@ -1076,6 +807,7 @@ void Enable(GLenum cap) {
switch (cap) {
case GL_BLEND:
ctx->blend = true;
+ blend_key = ctx->blend_key;
break;
case GL_DEPTH_TEST:
ctx->depthtest = true;
@@ -1090,6 +822,7 @@ void Disable(GLenum cap) {
switch (cap) {
case GL_BLEND:
ctx->blend = false;
+ blend_key = BLEND_KEY_NONE;
break;
case GL_DEPTH_TEST:
ctx->depthtest = false;
@@ -1103,18 +836,10 @@ void Disable(GLenum cap) {
GLenum GetError() { return GL_NO_ERROR; }
static const char* const extensions[] = {
- "GL_ARB_blend_func_extended",
- "GL_ARB_clear_texture",
- "GL_ARB_copy_image",
- "GL_ARB_draw_instanced",
- "GL_ARB_explicit_attrib_location",
- "GL_ARB_instanced_arrays",
- "GL_ARB_invalidate_subdata",
- "GL_ARB_texture_storage",
- "GL_EXT_timer_query",
- "GL_KHR_blend_equation_advanced",
- "GL_KHR_blend_equation_advanced_coherent",
- "GL_APPLE_rgb_422",
+ "GL_ARB_blend_func_extended", "GL_ARB_copy_image",
+ "GL_ARB_draw_instanced", "GL_ARB_explicit_attrib_location",
+ "GL_ARB_instanced_arrays", "GL_ARB_invalidate_subdata",
+ "GL_ARB_texture_storage", "GL_EXT_timer_query",
};
void GetIntegerv(GLenum pname, GLint* params) {
@@ -1128,7 +853,7 @@ void GetIntegerv(GLenum pname, GLint* params) {
params[0] = 1 << 15;
break;
case GL_MAX_ARRAY_TEXTURE_LAYERS:
- params[0] = 0;
+ params[0] = 1 << 15;
break;
case GL_READ_FRAMEBUFFER_BINDING:
params[0] = ctx->read_framebuffer_binding;
@@ -1145,12 +870,6 @@ void GetIntegerv(GLenum pname, GLint* params) {
case GL_NUM_EXTENSIONS:
params[0] = sizeof(extensions) / sizeof(extensions[0]);
break;
- case GL_MAJOR_VERSION:
- params[0] = 3;
- break;
- case GL_MINOR_VERSION:
- params[0] = 2;
- break;
default:
debugf("unhandled glGetIntegerv parameter %x\n", pname);
assert(false);
@@ -1177,8 +896,6 @@ const char* GetString(GLenum name) {
return "Software WebRender";
case GL_VERSION:
return "3.2";
- case GL_SHADING_LANGUAGE_VERSION:
- return "1.50";
default:
debugf("unhandled glGetString parameter %x\n", name);
assert(false);
@@ -1254,23 +971,17 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) {
return a;
}
-// Generate a hashed blend key based on blend func and equation state. This
-// allows all the blend state to be processed down to a blend key that can be
-// dealt with inside a single switch statement.
-static void hash_blend_key() {
- GLenum srgb = ctx->blendfunc_srgb;
- GLenum drgb = ctx->blendfunc_drgb;
- GLenum sa = ctx->blendfunc_sa;
- GLenum da = ctx->blendfunc_da;
- GLenum equation = ctx->blend_equation;
+void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) {
+ ctx->blendfunc_srgb = srgb;
+ ctx->blendfunc_drgb = drgb;
+ sa = remap_blendfunc(srgb, sa);
+ da = remap_blendfunc(drgb, da);
+ ctx->blendfunc_sa = sa;
+ ctx->blendfunc_da = da;
+
#define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20))
- // Basic non-separate blend funcs use the two-argument form
int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0);
- // Separate alpha blend funcs use the 4 argument hash
if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da);
- // Any blend equation other than the default func_add ignores the funcs and
- // instead generates a one-argument hash based on the equation
- if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0);
switch (hash) {
#define MAP_BLEND_KEY(...) \
case HASH_BLEND_KEY(__VA_ARGS__): \
@@ -1278,22 +989,14 @@ static void hash_blend_key() {
break;
FOR_EACH_BLEND_KEY(MAP_BLEND_KEY)
default:
- debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb,
- sa, da, equation);
+ debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da);
assert(false);
break;
}
-}
-void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) {
- ctx->blendfunc_srgb = srgb;
- ctx->blendfunc_drgb = drgb;
- sa = remap_blendfunc(srgb, sa);
- da = remap_blendfunc(drgb, da);
- ctx->blendfunc_sa = sa;
- ctx->blendfunc_da = da;
-
- hash_blend_key();
+ if (ctx->blend) {
+ blend_key = ctx->blend_key;
+ }
}
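One concrete instance of the hash (standard GL enum values, shown only as an illustration): GL_ONE is 0x1 and GL_ONE_MINUS_SRC_ALPHA is 0x0303, so the common premultiplied-alpha pair hashes as:

    // HASH_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0)
    //   = (0x1 << 4) | 0x0303 = 0x0313
    // The alpha funcs match the RGB funcs, so no separate-alpha bits are OR'd
    // in, and the switch maps 0x0313 to BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA).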
void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
@@ -1302,12 +1005,8 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
}
void BlendEquation(GLenum mode) {
- assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX ||
- (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR));
- if (mode != ctx->blend_equation) {
- ctx->blend_equation = mode;
- hash_blend_key();
- }
+ assert(mode == GL_FUNC_ADD);
+ ctx->blend_equation = mode;
}
void DepthMask(GLboolean flag) { ctx->depthmask = flag; }
@@ -1328,10 +1027,8 @@ void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) {
}
void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
- ctx->clearcolor[0] = r;
- ctx->clearcolor[1] = g;
- ctx->clearcolor[2] = b;
- ctx->clearcolor[3] = a;
+ I32 c = round_pixel((Float){b, g, r, a});
+ ctx->clearcolor = bit_cast<uint32_t>(CONVERT(c, U8));
}
void ClearDepth(GLdouble depth) { ctx->cleardepth = depth; }
@@ -1369,6 +1066,7 @@ void DeleteBuffer(GLuint n) {
unlink(ctx->pixel_pack_buffer_binding, n);
unlink(ctx->pixel_unpack_buffer_binding, n);
unlink(ctx->array_buffer_binding, n);
+ unlink(ctx->element_array_buffer_binding, n);
}
}
@@ -1434,45 +1132,26 @@ void DeleteProgram(GLuint n) {
void LinkProgram(GLuint program) {
Program& p = ctx->programs[program];
assert(p.impl);
- if (!p.impl) {
- return;
- }
assert(p.impl->interpolants_size() <= sizeof(Interpolants));
if (!p.vert_impl) p.vert_impl = p.impl->get_vertex_shader();
if (!p.frag_impl) p.frag_impl = p.impl->get_fragment_shader();
}
-GLint GetLinkStatus(GLuint program) {
- if (auto* p = ctx->programs.find(program)) {
- return p->impl ? 1 : 0;
- }
- return 0;
-}
-
void BindAttribLocation(GLuint program, GLuint index, char* name) {
Program& p = ctx->programs[program];
assert(p.impl);
- if (!p.impl) {
- return;
- }
p.impl->bind_attrib(name, index);
}
GLint GetAttribLocation(GLuint program, char* name) {
Program& p = ctx->programs[program];
assert(p.impl);
- if (!p.impl) {
- return -1;
- }
return p.impl->get_attrib(name);
}
GLint GetUniformLocation(GLuint program, char* name) {
Program& p = ctx->programs[program];
assert(p.impl);
- if (!p.impl) {
- return -1;
- }
GLint loc = p.impl->get_uniform(name);
// debugf("location: %d\n", loc);
return loc;
@@ -1482,15 +1161,7 @@ static uint64_t get_time_value() {
#ifdef __MACH__
return mach_absolute_time();
#elif defined(_WIN32)
- LARGE_INTEGER time;
- static bool have_frequency = false;
- static LARGE_INTEGER frequency;
- if (!have_frequency) {
- QueryPerformanceFrequency(&frequency);
- have_frequency = true;
- }
- QueryPerformanceCounter(&time);
- return time.QuadPart * 1000000000ULL / frequency.QuadPart;
+ return uint64_t(clock()) * (1000000000ULL / CLOCKS_PER_SEC);
#else
return ({
struct timespec tp;
@@ -1583,113 +1254,60 @@ void PixelStorei(GLenum name, GLint param) {
static GLenum remap_internal_format(GLenum format) {
switch (format) {
case GL_DEPTH_COMPONENT:
- return GL_DEPTH_COMPONENT24;
+ return GL_DEPTH_COMPONENT16;
case GL_RGBA:
return GL_RGBA8;
case GL_RED:
return GL_R8;
- case GL_RG:
- return GL_RG8;
- case GL_RGB_422_APPLE:
- return GL_RGB_RAW_422_APPLE;
default:
return format;
}
}
-} // extern "C"
-
-static bool format_requires_conversion(GLenum external_format,
- GLenum internal_format) {
- switch (external_format) {
- case GL_RGBA:
- return internal_format == GL_RGBA8;
- default:
- return false;
- }
-}
-
-static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src,
- int width) {
- for (; width >= 4; width -= 4, dest += 4, src += 4) {
- U32 p = unaligned_load<U32>(src);
- U32 rb = p & 0x00FF00FF;
- unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16));
- }
- for (; width > 0; width--, dest++, src++) {
- uint32_t p = *src;
- uint32_t rb = p & 0x00FF00FF;
- *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16);
- }
-}
-
-static void convert_copy(GLenum external_format, GLenum internal_format,
- uint8_t* dst_buf, size_t dst_stride,
- const uint8_t* src_buf, size_t src_stride,
- size_t width, size_t height) {
- switch (external_format) {
- case GL_RGBA:
- if (internal_format == GL_RGBA8) {
- for (; height; height--) {
- copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf,
- width);
- dst_buf += dst_stride;
- src_buf += src_stride;
- }
- return;
- }
- break;
- default:
- break;
- }
- size_t row_bytes = width * bytes_for_internal_format(internal_format);
- for (; height; height--) {
- memcpy(dst_buf, src_buf, row_bytes);
- dst_buf += dst_stride;
- src_buf += src_stride;
+void TexStorage3D(GLenum target, GLint levels, GLenum internal_format,
+ GLsizei width, GLsizei height, GLsizei depth) {
+ assert(levels == 1);
+ Texture& t = ctx->textures[ctx->get_binding(target)];
+ internal_format = remap_internal_format(internal_format);
+ bool changed = false;
+ if (t.width != width || t.height != height || t.depth != depth ||
+ t.internal_format != internal_format) {
+ changed = true;
+ t.internal_format = internal_format;
+ t.width = width;
+ t.height = height;
+ t.depth = depth;
}
+ t.disable_delayed_clear();
+ t.allocate(changed);
}
-static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width,
- GLsizei height, void* buf = nullptr,
- GLsizei stride = 0, GLsizei min_width = 0,
- GLsizei min_height = 0) {
- GLenum internal_format = remap_internal_format(external_format);
+static void set_tex_storage(Texture& t, GLenum internal_format,
+ GLsizei width, GLsizei height,
+ bool should_free = true, void* buf = nullptr,
+ GLsizei min_width = 0, GLsizei min_height = 0) {
+ internal_format = remap_internal_format(internal_format);
bool changed = false;
- if (t.width != width || t.height != height ||
+ if (t.width != width || t.height != height || t.depth != 0 ||
t.internal_format != internal_format) {
changed = true;
t.internal_format = internal_format;
t.width = width;
t.height = height;
+ t.depth = 0;
}
- // If we changed from an internally managed buffer to an externally supplied
- // one, or vice versa, ensure that we clean up the old buffer state.
- // However, if we have to convert the data from a non-native format, then
- // always treat it as internally managed since we will need to copy to an
- // internally managed native format buffer.
- bool should_free = buf == nullptr || format_requires_conversion(
- external_format, internal_format);
- if (t.should_free() != should_free) {
- changed = true;
- t.cleanup();
+ if (t.should_free() != should_free || buf != nullptr) {
+ if (t.should_free()) {
+ t.cleanup();
+ }
t.set_should_free(should_free);
- }
- // If now an external buffer, explicitly set it...
- if (!should_free) {
- t.set_buffer(buf, stride);
+ t.buf = (char*)buf;
+ t.buf_size = 0;
}
t.disable_delayed_clear();
t.allocate(changed, min_width, min_height);
- // If we have a buffer that needs format conversion, then do that now.
- if (buf && should_free) {
- convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(),
- (const uint8_t*)buf, stride, width, height);
- }
}
-extern "C" {
-
void TexStorage2D(GLenum target, GLint levels, GLenum internal_format,
GLsizei width, GLsizei height) {
assert(levels == 1);
@@ -1701,19 +1319,12 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) {
if (format == GL_RED && ty == GL_UNSIGNED_BYTE) {
return GL_R8;
} else if ((format == GL_RGBA || format == GL_BGRA) &&
- (ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV)) {
+ ty == GL_UNSIGNED_BYTE) {
return GL_RGBA8;
} else if (format == GL_RGBA && ty == GL_FLOAT) {
return GL_RGBA32F;
} else if (format == GL_RGBA_INTEGER && ty == GL_INT) {
return GL_RGBA32I;
- } else if (format == GL_RG && ty == GL_UNSIGNED_BYTE) {
- return GL_RG8;
- } else if (format == GL_RGB_422_APPLE &&
- ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
- return GL_RGB_RAW_422_APPLE;
- } else if (format == GL_RED && ty == GL_UNSIGNED_SHORT) {
- return GL_R16;
} else {
debugf("unknown internal format for format %x, type %x\n", format, ty);
assert(false);
@@ -1721,6 +1332,20 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) {
}
}
+static inline void copy_bgra8_to_rgba8(uint32_t* dest, uint32_t* src,
+ int width) {
+ for (; width >= 4; width -= 4, dest += 4, src += 4) {
+ U32 p = unaligned_load<U32>(src);
+ U32 rb = p & 0x00FF00FF;
+ unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16));
+ }
+ for (; width > 0; width--, dest++, src++) {
+ uint32_t p = *src;
+ uint32_t rb = p & 0x00FF00FF;
+ *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16);
+ }
+}
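On a single pixel, the swizzle above reads as follows (a worked check, not part of the change): with p = 0xAARRGGBB, rb = p & 0x00FF00FF keeps red and blue, and (p & 0xFF00FF00) | (rb << 16) | (rb >> 16) swaps them back in as 0xAABBGGRR while alpha and green stay put.

    static_assert(((0x11223344u & 0xFF00FF00u) |
                   ((0x11223344u & 0x00FF00FFu) << 16) |
                   ((0x11223344u & 0x00FF00FFu) >> 16)) == 0x11443322u,
                  "red/blue swap leaves alpha and green in place");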
+
static Buffer* get_pixel_pack_buffer() {
return ctx->pixel_pack_buffer_binding
? &ctx->buffers[ctx->pixel_pack_buffer_binding]
@@ -1750,10 +1375,7 @@ static void* get_pixel_unpack_buffer_data(void* data) {
void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset,
GLsizei width, GLsizei height, GLenum format, GLenum ty,
void* data) {
- if (level != 0) {
- assert(false);
- return;
- }
+ if (level != 0) { assert(false); return; }
data = get_pixel_unpack_buffer_data(data);
if (!data) return;
Texture& t = ctx->textures[ctx->get_binding(target)];
@@ -1765,33 +1387,84 @@ void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset,
GLsizei row_length =
ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width;
assert(t.internal_format == internal_format_for_data(format, ty));
- int src_bpp = format_requires_conversion(format, t.internal_format)
- ? bytes_for_internal_format(format)
- : t.bpp();
- if (!src_bpp || !t.buf) return;
- convert_copy(format, t.internal_format,
- (uint8_t*)t.sample_ptr(xoffset, yoffset), t.stride(),
- (const uint8_t*)data, row_length * src_bpp, width, height);
+ int bpp = t.bpp();
+ if (!bpp || !t.buf) return;
+ size_t dest_stride = t.stride(bpp);
+ char* dest = t.sample_ptr(xoffset, yoffset, 0, bpp, dest_stride);
+ char* src = (char*)data;
+ for (int y = 0; y < height; y++) {
+ if (t.internal_format == GL_RGBA8 && format != GL_BGRA) {
+ copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width);
+ } else {
+ memcpy(dest, src, width * bpp);
+ }
+ dest += dest_stride;
+ src += row_length * bpp;
+ }
}
void TexImage2D(GLenum target, GLint level, GLint internal_format,
GLsizei width, GLsizei height, GLint border, GLenum format,
GLenum ty, void* data) {
- if (level != 0) {
- assert(false);
- return;
- }
+ if (level != 0) { assert(false); return; }
assert(border == 0);
TexStorage2D(target, 1, internal_format, width, height);
TexSubImage2D(target, 0, 0, 0, width, height, format, ty, data);
}
+void TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset,
+ GLint zoffset, GLsizei width, GLsizei height, GLsizei depth,
+ GLenum format, GLenum ty, void* data) {
+ if (level != 0) { assert(false); return; }
+ data = get_pixel_unpack_buffer_data(data);
+ if (!data) return;
+ Texture& t = ctx->textures[ctx->get_binding(target)];
+ prepare_texture(t);
+ assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width);
+ GLsizei row_length =
+ ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width;
+ if (format == GL_BGRA) {
+ assert(ty == GL_UNSIGNED_BYTE);
+ assert(t.internal_format == GL_RGBA8);
+ } else {
+ assert(t.internal_format == internal_format_for_data(format, ty));
+ }
+ int bpp = t.bpp();
+ if (!bpp || !t.buf) return;
+ char* src = (char*)data;
+ assert(xoffset + width <= t.width);
+ assert(yoffset + height <= t.height);
+ assert(zoffset + depth <= t.depth);
+ size_t dest_stride = t.stride(bpp);
+ for (int z = 0; z < depth; z++) {
+ char* dest = t.sample_ptr(xoffset, yoffset, zoffset + z, bpp, dest_stride);
+ for (int y = 0; y < height; y++) {
+ if (t.internal_format == GL_RGBA8 && format != GL_BGRA) {
+ copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width);
+ } else {
+ memcpy(dest, src, width * bpp);
+ }
+ dest += dest_stride;
+ src += row_length * bpp;
+ }
+ }
+}
+
+void TexImage3D(GLenum target, GLint level, GLint internal_format,
+ GLsizei width, GLsizei height, GLsizei depth, GLint border,
+ GLenum format, GLenum ty, void* data) {
+ if (level != 0) { assert(false); return; }
+ assert(border == 0);
+ TexStorage3D(target, 1, internal_format, width, height, depth);
+ TexSubImage3D(target, 0, 0, 0, 0, width, height, depth, format, ty, data);
+}
+
void GenerateMipmap(UNUSED GLenum target) {
// TODO: support mipmaps
}
-void SetTextureParameter(GLuint texid, GLenum pname, GLint param) {
- Texture& t = ctx->textures[texid];
+void TexParameteri(GLenum target, GLenum pname, GLint param) {
+ Texture& t = ctx->textures[ctx->get_binding(target)];
switch (pname) {
case GL_TEXTURE_WRAP_S:
assert(param == GL_CLAMP_TO_EDGE);
@@ -1810,10 +1483,6 @@ void SetTextureParameter(GLuint texid, GLenum pname, GLint param) {
}
}
-void TexParameteri(GLenum target, GLenum pname, GLint param) {
- SetTextureParameter(ctx->get_binding(target), pname, param);
-}
-
void GenTextures(int n, GLuint* result) {
for (int i = 0; i < n; i++) {
Texture t;
@@ -1839,7 +1508,9 @@ void GenRenderbuffers(int n, GLuint* result) {
void Renderbuffer::on_erase() {
for (auto* fb : ctx->framebuffers) {
if (fb) {
- unlink(fb->color_attachment, texture);
+ if (unlink(fb->color_attachment, texture)) {
+ fb->layer = 0;
+ }
unlink(fb->depth_attachment, texture);
}
}
@@ -1875,11 +1546,10 @@ void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width,
}
switch (internal_format) {
case GL_DEPTH_COMPONENT:
- case GL_DEPTH_COMPONENT16:
case GL_DEPTH_COMPONENT24:
case GL_DEPTH_COMPONENT32:
- // Force depth format to 24 bits...
- internal_format = GL_DEPTH_COMPONENT24;
+ // Force depth format to 16 bits...
+ internal_format = GL_DEPTH_COMPONENT16;
break;
}
set_tex_storage(ctx->textures[r.texture], internal_format, width, height);
@@ -1963,8 +1633,7 @@ void VertexAttribDivisor(GLuint index, GLuint divisor) {
va.divisor = divisor;
}
-void BufferData(GLenum target, GLsizeiptr size, void* data,
- UNUSED GLenum usage) {
+void BufferData(GLenum target, GLsizeiptr size, void* data, UNUSED GLenum usage) {
Buffer& b = ctx->buffers[ctx->get_binding(target)];
if (b.allocate(size)) {
ctx->validate_vertex_array = true;
@@ -2004,23 +1673,17 @@ GLboolean UnmapBuffer(GLenum target) {
void Uniform1i(GLint location, GLint V0) {
// debugf("tex: %d\n", (int)ctx->textures.size);
- if (vertex_shader) {
- vertex_shader->set_uniform_1i(location, V0);
- }
+ vertex_shader->set_uniform_1i(location, V0);
}
void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) {
assert(count == 1);
- if (vertex_shader) {
- vertex_shader->set_uniform_4fv(location, v);
- }
+ vertex_shader->set_uniform_4fv(location, v);
}
void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose,
const GLfloat* value) {
assert(count == 1);
assert(!transpose);
- if (vertex_shader) {
- vertex_shader->set_uniform_matrix4fv(location, value);
- }
+ vertex_shader->set_uniform_matrix4fv(location, value);
}
void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget,
@@ -2031,7 +1694,24 @@ void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget,
Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)];
if (attachment == GL_COLOR_ATTACHMENT0) {
fb.color_attachment = texture;
+ fb.layer = 0;
+ } else if (attachment == GL_DEPTH_ATTACHMENT) {
+ fb.depth_attachment = texture;
+ } else {
+ assert(0);
+ }
+}
+
+void FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture,
+ GLint level, GLint layer) {
+ assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER);
+ assert(level == 0);
+ Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)];
+ if (attachment == GL_COLOR_ATTACHMENT0) {
+ fb.color_attachment = texture;
+ fb.layer = layer;
} else if (attachment == GL_DEPTH_ATTACHMENT) {
+ assert(layer == 0);
fb.depth_attachment = texture;
} else {
assert(0);
@@ -2046,6 +1726,7 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment,
Renderbuffer& rb = ctx->renderbuffers[renderbuffer];
if (attachment == GL_COLOR_ATTACHMENT0) {
fb.color_attachment = rb.texture;
+ fb.layer = 0;
} else if (attachment == GL_DEPTH_ATTACHMENT) {
fb.depth_attachment = rb.texture;
} else {
@@ -2055,18 +1736,11 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment,
} // extern "C"
-static inline Framebuffer* get_framebuffer(GLenum target,
- bool fallback = false) {
+static inline Framebuffer* get_framebuffer(GLenum target) {
if (target == GL_FRAMEBUFFER) {
target = GL_DRAW_FRAMEBUFFER;
}
- Framebuffer* fb = ctx->framebuffers.find(ctx->get_binding(target));
- if (fallback && !fb) {
- // If the specified framebuffer isn't found and a fallback is requested,
- // use the default framebuffer.
- fb = &ctx->framebuffers[0];
- }
- return fb;
+ return ctx->framebuffers.find(ctx->get_binding(target));
}
template <typename T>
@@ -2092,7 +1766,9 @@ static inline uint32_t clear_chunk(uint16_t value) {
return uint32_t(value) | (uint32_t(value) << 16);
}
-static inline uint32_t clear_chunk(uint32_t value) { return value; }
+static inline uint32_t clear_chunk(uint32_t value) {
+ return value;
+}
template <typename T>
static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) {
@@ -2115,22 +1791,20 @@ static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) {
}
template <typename T>
-static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0,
- int skip_end = 0) {
+static void clear_buffer(Texture& t, T value, int layer, IntRect bb,
+ int skip_start = 0, int skip_end = 0) {
if (!t.buf) return;
skip_start = max(skip_start, bb.x0);
skip_end = max(skip_end, skip_start);
assert(sizeof(T) == t.bpp());
- size_t stride = t.stride();
- // When clearing multiple full-width rows, collapse them into a single large
- // "row" to avoid redundant setup from clearing each row individually. Note
- // that we can only safely do this if the stride is tightly packed.
- if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end &&
- (t.should_free() || stride == t.width * sizeof(T))) {
+ size_t stride = t.stride(sizeof(T));
+ // When clearing multiple full-width rows, collapse them into a single
+ // large "row" to avoid redundant setup from clearing each row individually.
+ if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end) {
bb.x1 += (stride / sizeof(T)) * (bb.height() - 1);
bb.y1 = bb.y0 + 1;
}
- T* buf = (T*)t.sample_ptr(bb.x0, bb.y0);
+ T* buf = (T*)t.sample_ptr(bb.x0, bb.y0, layer, sizeof(T), stride);
uint32_t chunk = clear_chunk(value);
for (int rows = bb.height(); rows > 0; rows--) {
if (bb.x0 < skip_start) {
@@ -2144,12 +1818,20 @@ static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0,
}
template <typename T>
+static inline void clear_buffer(Texture& t, T value, int layer = 0) {
+ IntRect bb = ctx->apply_scissor(t.bounds());
+ if (bb.width() > 0) {
+ clear_buffer<T>(t, value, layer, bb);
+ }
+}
+
+template <typename T>
static inline void force_clear_row(Texture& t, int y, int skip_start = 0,
int skip_end = 0) {
assert(t.buf != nullptr);
assert(sizeof(T) == t.bpp());
assert(skip_start <= skip_end);
- T* buf = (T*)t.sample_ptr(0, y);
+ T* buf = (T*)t.sample_ptr(0, y, 0, sizeof(T));
uint32_t chunk = clear_chunk((T)t.clear_val);
if (skip_start > 0) {
clear_row<T>(buf, skip_start, t.clear_val, chunk);
@@ -2188,9 +1870,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) {
while (mask) {
int count = __builtin_ctz(mask);
if (count > 0) {
- clear_buffer<T>(t, t.clear_val,
- IntRect{0, start, t.width, start + count}, skip_start,
- skip_end);
+ clear_buffer<T>(t, t.clear_val, 0,
+ IntRect{0, start, t.width, start + count},
+ skip_start, skip_end);
t.delay_clear -= count;
start += count;
mask >>= count;
@@ -2201,9 +1883,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) {
}
int count = (i + 1) * 32 - start;
if (count > 0) {
- clear_buffer<T>(t, t.clear_val,
- IntRect{0, start, t.width, start + count}, skip_start,
- skip_end);
+ clear_buffer<T>(t, t.clear_val, 0,
+ IntRect{0, start, t.width, start + count},
+ skip_start, skip_end);
t.delay_clear -= count;
}
}
@@ -2220,7 +1902,7 @@ static void prepare_texture(Texture& t, const IntRect* skip) {
case GL_R8:
force_clear<uint8_t>(t, skip);
break;
- case GL_RG8:
+ case GL_DEPTH_COMPONENT16:
force_clear<uint16_t>(t, skip);
break;
default:
@@ -2230,53 +1912,31 @@ static void prepare_texture(Texture& t, const IntRect* skip) {
}
}
-// Setup a clear on a texture. This may either force an immediate clear or
-// potentially punt to a delayed clear, if applicable.
-template <typename T>
-static void request_clear(Texture& t, T value, const IntRect& scissor) {
- // If the clear would require a scissor, force clear anything outside
- // the scissor, and then immediately clear anything inside the scissor.
- if (!scissor.contains(t.offset_bounds())) {
- IntRect skip = scissor - t.offset;
- force_clear<T>(t, &skip);
- clear_buffer<T>(t, value, skip.intersection(t.bounds()));
- } else {
- // Do delayed clear for 2D texture without scissor.
- t.enable_delayed_clear(value);
- }
-}
-
-template <typename T>
-static inline void request_clear(Texture& t, T value) {
- // If scissoring is enabled, use the scissor rect. Otherwise, just scissor to
- // the entire texture bounds.
- request_clear(t, value, ctx->scissortest ? ctx->scissor : t.offset_bounds());
-}
-
extern "C" {
-void InitDefaultFramebuffer(int x, int y, int width, int height, int stride,
- void* buf) {
+void InitDefaultFramebuffer(int width, int height) {
Framebuffer& fb = ctx->framebuffers[0];
if (!fb.color_attachment) {
GenTextures(1, &fb.color_attachment);
+ fb.layer = 0;
}
- // If the dimensions or buffer properties changed, we need to reallocate
- // the underlying storage for the color buffer texture.
Texture& colortex = ctx->textures[fb.color_attachment];
- set_tex_storage(colortex, GL_RGBA8, width, height, buf, stride);
- colortex.offset = IntPoint(x, y);
+ if (colortex.width != width || colortex.height != height) {
+ colortex.cleanup();
+ set_tex_storage(colortex, GL_RGBA8, width, height);
+ }
if (!fb.depth_attachment) {
GenTextures(1, &fb.depth_attachment);
}
- // Ensure dimensions of the depth buffer match the color buffer.
Texture& depthtex = ctx->textures[fb.depth_attachment];
- set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height);
- depthtex.offset = IntPoint(x, y);
+ if (depthtex.width != width || depthtex.height != height) {
+ depthtex.cleanup();
+ set_tex_storage(depthtex, GL_DEPTH_COMPONENT16, width, height);
+ }
}
void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width,
- int32_t* height, int32_t* stride) {
+ int32_t* height) {
Framebuffer* fb = ctx->framebuffers.find(fbo);
if (!fb || !fb->color_attachment) {
return nullptr;
@@ -2285,33 +1945,16 @@ void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width,
if (flush) {
prepare_texture(colortex);
}
- assert(colortex.offset == IntPoint(0, 0));
- if (width) {
- *width = colortex.width;
- }
- if (height) {
- *height = colortex.height;
- }
- if (stride) {
- *stride = colortex.stride();
- }
- return colortex.buf ? colortex.sample_ptr(0, 0) : nullptr;
-}
-
-void ResolveFramebuffer(GLuint fbo) {
- Framebuffer* fb = ctx->framebuffers.find(fbo);
- if (!fb || !fb->color_attachment) {
- return;
- }
- Texture& colortex = ctx->textures[fb->color_attachment];
- prepare_texture(colortex);
+ *width = colortex.width;
+ *height = colortex.height;
+ return colortex.buf ? colortex.sample_ptr(0, 0, fb->layer) : nullptr;
}
void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width,
- GLsizei height, GLsizei stride, void* buf,
- GLsizei min_width, GLsizei min_height) {
+ GLsizei height, void* buf, GLsizei min_width,
+ GLsizei min_height) {
Texture& t = ctx->textures[texid];
- set_tex_storage(t, internal_format, width, height, buf, stride, min_width,
+ set_tex_storage(t, internal_format, width, height, !buf, buf, min_width,
min_height);
}
@@ -2323,170 +1966,57 @@ GLenum CheckFramebufferStatus(GLenum target) {
return GL_FRAMEBUFFER_COMPLETE;
}
-void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset,
- GLint zoffset, GLsizei width, GLsizei height,
- GLsizei depth, GLenum format, GLenum type,
- const void* data) {
- if (level != 0) {
- assert(false);
- return;
- }
- Texture& t = ctx->textures[texture];
- assert(!t.locked);
- if (width <= 0 || height <= 0 || depth <= 0) {
- return;
- }
- assert(zoffset == 0 && depth == 1);
- IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height};
- if (t.internal_format == GL_DEPTH_COMPONENT24) {
- uint32_t value = 0xFFFFFF;
- switch (format) {
- case GL_DEPTH_COMPONENT:
- switch (type) {
- case GL_DOUBLE:
- value = uint32_t(*(const GLdouble*)data * 0xFFFFFF);
- break;
- case GL_FLOAT:
- value = uint32_t(*(const GLfloat*)data * 0xFFFFFF);
- break;
- default:
- assert(false);
- break;
- }
- break;
- default:
- assert(false);
- break;
- }
- if (t.cleared() && !scissor.contains(t.offset_bounds())) {
- // If we need to scissor the clear and the depth buffer was already
- // initialized, then just fill runs for that scissor area.
- t.fill_depth_runs(value, scissor);
- } else {
- // Otherwise, the buffer is either uninitialized or the clear would
- // encompass the entire buffer. If uninitialized, we can safely fill
- // the entire buffer with any value and thus ignore any scissoring.
- t.init_depth_runs(value);
- }
- return;
- }
-
- uint32_t color = 0xFF000000;
- switch (type) {
- case GL_FLOAT: {
- const GLfloat* f = (const GLfloat*)data;
- Float v = {0.0f, 0.0f, 0.0f, 1.0f};
- switch (format) {
- case GL_RGBA:
- v.w = f[3]; // alpha
- FALLTHROUGH;
- case GL_RGB:
- v.z = f[2]; // blue
- FALLTHROUGH;
- case GL_RG:
- v.y = f[1]; // green
- FALLTHROUGH;
- case GL_RED:
- v.x = f[0]; // red
- break;
- default:
- assert(false);
- break;
- }
- color = bit_cast<uint32_t>(CONVERT(round_pixel(v), U8));
- break;
- }
- case GL_UNSIGNED_BYTE: {
- const GLubyte* b = (const GLubyte*)data;
- switch (format) {
- case GL_RGBA:
- color = (color & ~0xFF000000) | (uint32_t(b[3]) << 24); // alpha
- FALLTHROUGH;
- case GL_RGB:
- color = (color & ~0x00FF0000) | (uint32_t(b[2]) << 16); // blue
- FALLTHROUGH;
- case GL_RG:
- color = (color & ~0x0000FF00) | (uint32_t(b[1]) << 8); // green
- FALLTHROUGH;
- case GL_RED:
- color = (color & ~0x000000FF) | uint32_t(b[0]); // red
- break;
- default:
- assert(false);
- break;
- }
- break;
- }
- default:
- assert(false);
- break;
- }
-
- switch (t.internal_format) {
- case GL_RGBA8:
- // Clear color needs to swizzle to BGRA.
- request_clear<uint32_t>(t,
- (color & 0xFF00FF00) |
- ((color << 16) & 0xFF0000) |
- ((color >> 16) & 0xFF),
- scissor);
- break;
- case GL_R8:
- request_clear<uint8_t>(t, uint8_t(color & 0xFF), scissor);
- break;
- case GL_RG8:
- request_clear<uint16_t>(t, uint16_t(color & 0xFFFF), scissor);
- break;
- default:
- assert(false);
- break;
- }
-}
-
-void ClearTexImage(GLuint texture, GLint level, GLenum format, GLenum type,
- const void* data) {
- Texture& t = ctx->textures[texture];
- IntRect scissor = t.offset_bounds();
- ClearTexSubImage(texture, level, scissor.x0, scissor.y0, 0, scissor.width(),
- scissor.height(), 1, format, type, data);
+static inline bool clear_requires_scissor(Texture& t) {
+ return ctx->scissortest && !ctx->scissor.contains(t.bounds());
}
void Clear(GLbitfield mask) {
- Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true);
+ Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER);
if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) {
Texture& t = ctx->textures[fb.color_attachment];
- IntRect scissor = ctx->scissortest
- ? ctx->scissor.intersection(t.offset_bounds())
- : t.offset_bounds();
- ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0,
- scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT,
- ctx->clearcolor);
+ if (t.internal_format == GL_RGBA8) {
+ uint32_t color = ctx->clearcolor;
+ // If the clear would require a scissor, force clear anything outside
+ // the scissor, and then immediately clear anything inside the scissor.
+ if (clear_requires_scissor(t)) {
+ force_clear<uint32_t>(t, &ctx->scissor);
+ clear_buffer<uint32_t>(t, color, fb.layer);
+ } else if (t.depth > 1) {
+ // Delayed clear is not supported on texture arrays.
+ t.disable_delayed_clear();
+ clear_buffer<uint32_t>(t, color, fb.layer);
+ } else {
+ // Do delayed clear for 2D texture without scissor.
+ t.enable_delayed_clear(color);
+ }
+ } else if (t.internal_format == GL_R8) {
+ uint8_t color = uint8_t((ctx->clearcolor >> 16) & 0xFF);
+ if (clear_requires_scissor(t)) {
+ force_clear<uint8_t>(t, &ctx->scissor);
+ clear_buffer<uint8_t>(t, color, fb.layer);
+ } else if (t.depth > 1) {
+ t.disable_delayed_clear();
+ clear_buffer<uint8_t>(t, color, fb.layer);
+ } else {
+ t.enable_delayed_clear(color);
+ }
+ } else {
+ assert(false);
+ }
}
if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) {
Texture& t = ctx->textures[fb.depth_attachment];
- IntRect scissor = ctx->scissortest
- ? ctx->scissor.intersection(t.offset_bounds())
- : t.offset_bounds();
- ClearTexSubImage(fb.depth_attachment, 0, scissor.x0, scissor.y0, 0,
- scissor.width(), scissor.height(), 1, GL_DEPTH_COMPONENT,
- GL_DOUBLE, &ctx->cleardepth);
+ assert(t.internal_format == GL_DEPTH_COMPONENT16);
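+    // Scale the clear depth from [0,1] to 16 bits and bias it by -0x8000 so it
+    // matches the signed 16-bit values used by the depth tests.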
+ uint16_t depth = uint16_t(0xFFFF * ctx->cleardepth) - 0x8000;
+ if (clear_requires_scissor(t)) {
+ force_clear<uint16_t>(t, &ctx->scissor);
+ clear_buffer<uint16_t>(t, depth);
+ } else {
+ t.enable_delayed_clear(depth);
+ }
}
}
-void ClearColorRect(GLuint fbo, GLint xoffset, GLint yoffset, GLsizei width,
- GLsizei height, GLfloat r, GLfloat g, GLfloat b,
- GLfloat a) {
- GLfloat color[] = {r, g, b, a};
- Framebuffer& fb = ctx->framebuffers[fbo];
- Texture& t = ctx->textures[fb.color_attachment];
- IntRect scissor =
- IntRect{xoffset, yoffset, xoffset + width, yoffset + height}.intersection(
- t.offset_bounds());
- ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0,
- scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT,
- color);
-}
-
void InvalidateFramebuffer(GLenum target, GLsizei num_attachments,
const GLenum* attachments) {
Framebuffer* fb = get_framebuffer(target);
@@ -2497,7 +2027,7 @@ void InvalidateFramebuffer(GLenum target, GLsizei num_attachments,
switch (attachments[i]) {
case GL_DEPTH_ATTACHMENT: {
Texture& t = ctx->textures[fb->depth_attachment];
- t.set_cleared(false);
+ t.disable_delayed_clear();
break;
}
case GL_COLOR_ATTACHMENT0: {
@@ -2516,58 +2046,40 @@ void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format,
Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER);
if (!fb) return;
assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER ||
- format == GL_BGRA || format == GL_RG);
+ format == GL_BGRA);
Texture& t = ctx->textures[fb->color_attachment];
if (!t.buf) return;
prepare_texture(t);
// debugf("read pixels %d, %d, %d, %d from fb %d with format %x\n", x, y,
// width, height, ctx->read_framebuffer_binding, t.internal_format);
- x -= t.offset.x;
- y -= t.offset.y;
- assert(x >= 0 && y >= 0);
assert(x + width <= t.width);
assert(y + height <= t.height);
if (internal_format_for_data(format, type) != t.internal_format) {
debugf("mismatched format for read pixels: %x vs %x\n", t.internal_format,
internal_format_for_data(format, type));
assert(false);
- return;
- }
- // Only support readback conversions that are reversible
- assert(!format_requires_conversion(format, t.internal_format) ||
- bytes_for_internal_format(format) == t.bpp());
- uint8_t* dest = (uint8_t*)data;
- size_t destStride = width * t.bpp();
- if (y < 0) {
- dest += -y * destStride;
- height += y;
- y = 0;
- }
- if (y + height > t.height) {
- height = t.height - y;
- }
- if (x < 0) {
- dest += -x * t.bpp();
- width += x;
- x = 0;
}
- if (x + width > t.width) {
- width = t.width - x;
- }
- if (width <= 0 || height <= 0) {
- return;
+ int bpp = t.bpp();
+ char* dest = (char*)data;
+ size_t src_stride = t.stride(bpp);
+ char* src = t.sample_ptr(x, y, fb->layer, bpp, src_stride);
+ for (; height > 0; height--) {
+ if (t.internal_format == GL_RGBA8 && format != GL_BGRA) {
+ copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width);
+ } else {
+ memcpy(dest, src, width * bpp);
+ }
+ dest += width * bpp;
+ src += src_stride;
}
- convert_copy(format, t.internal_format, dest, destStride,
- (const uint8_t*)t.sample_ptr(x, y), t.stride(), width, height);
}
void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel,
GLint srcX, GLint srcY, GLint srcZ, GLuint dstName,
- GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX,
- GLint dstY, GLint dstZ, GLsizei srcWidth,
- GLsizei srcHeight, GLsizei srcDepth) {
+ GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, GLint dstY,
+ GLint dstZ, GLsizei srcWidth, GLsizei srcHeight,
+ GLsizei srcDepth) {
assert(srcLevel == 0 && dstLevel == 0);
- assert(srcZ == 0 && srcDepth == 1 && dstZ == 0);
if (srcTarget == GL_RENDERBUFFER) {
Renderbuffer& rb = ctx->renderbuffers[srcName];
srcName = rb.texture;
@@ -2581,44 +2093,532 @@ void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel,
prepare_texture(srctex);
Texture& dsttex = ctx->textures[dstName];
if (!dsttex.buf) return;
- assert(!dsttex.locked);
IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight};
prepare_texture(dsttex, &skip);
assert(srctex.internal_format == dsttex.internal_format);
assert(srcWidth >= 0);
assert(srcHeight >= 0);
+ assert(srcDepth >= 0);
assert(srcX + srcWidth <= srctex.width);
assert(srcY + srcHeight <= srctex.height);
+ assert(srcZ + srcDepth <= max(srctex.depth, 1));
assert(dstX + srcWidth <= dsttex.width);
assert(dstY + srcHeight <= dsttex.height);
+ assert(dstZ + srcDepth <= max(dsttex.depth, 1));
int bpp = srctex.bpp();
- int src_stride = srctex.stride();
- int dest_stride = dsttex.stride();
- char* dest = dsttex.sample_ptr(dstX, dstY);
- char* src = srctex.sample_ptr(srcX, srcY);
- for (int y = 0; y < srcHeight; y++) {
- memcpy(dest, src, srcWidth * bpp);
- dest += dest_stride;
- src += src_stride;
+ int src_stride = srctex.stride(bpp);
+ int dest_stride = dsttex.stride(bpp);
+ for (int z = 0; z < srcDepth; z++) {
+ char* dest = dsttex.sample_ptr(dstX, dstY, dstZ + z, bpp, dest_stride);
+ char* src = srctex.sample_ptr(srcX, srcY, srcZ + z, bpp, src_stride);
+ for (int y = 0; y < srcHeight; y++) {
+ memcpy(dest, src, srcWidth * bpp);
+ dest += dest_stride;
+ src += src_stride;
+ }
}
}
-void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset,
- GLint yoffset, GLint x, GLint y, GLsizei width,
+void CopyTexSubImage3D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset,
+ GLint zoffset, GLint x, GLint y, GLsizei width,
GLsizei height) {
assert(level == 0);
Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER);
if (!fb) return;
- CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D, 0, x, y, 0,
- ctx->get_binding(target), GL_TEXTURE_2D, 0, xoffset, yoffset,
- 0, width, height, 1);
+ CopyImageSubData(fb->color_attachment, GL_TEXTURE_3D, 0, x, y, fb->layer,
+ ctx->get_binding(target), GL_TEXTURE_3D, 0, xoffset, yoffset,
+ zoffset, width, height, 1);
+}
+
+void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset,
+ GLint x, GLint y, GLsizei width, GLsizei height) {
+ assert(level == 0);
+ Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER);
+ if (!fb) return;
+ CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D_ARRAY, 0, x, y,
+ fb->layer, ctx->get_binding(target), GL_TEXTURE_2D_ARRAY, 0,
+ xoffset, yoffset, 0, width, height, 1);
}
} // extern "C"
-#include "blend.h"
-#include "composite.h"
-#include "swgl_ext.h"
+using PackedRGBA8 = V16<uint8_t>;
+using WideRGBA8 = V16<uint16_t>;
+using HalfRGBA8 = V8<uint16_t>;
+
+static inline WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); }
+
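+// Repack 16-bit intermediate pixels into packed 8-bit RGBA; the SSE2 and NEON
+// paths use saturating pack instructions to clamp values to the 0..255 range.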
+static inline PackedRGBA8 pack(WideRGBA8 p) {
+#if USE_SSE2
+ return _mm_packus_epi16(lowHalf(p), highHalf(p));
+#elif USE_NEON
+ return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p)));
+#else
+ return CONVERT(p, PackedRGBA8);
+#endif
+}
+
+static inline HalfRGBA8 packRGBA8(I32 a, I32 b) {
+#if USE_SSE2
+ return _mm_packs_epi32(a, b);
+#elif USE_NEON
+ return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b));
+#else
+ return CONVERT(combine(a, b), HalfRGBA8);
+#endif
+}
+
+using PackedR8 = V4<uint8_t>;
+using WideR8 = V4<uint16_t>;
+
+static inline WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); }
+
+static inline WideR8 packR8(I32 a) {
+#if USE_SSE2
+ return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a)));
+#elif USE_NEON
+ return vqmovun_s32(a);
+#else
+ return CONVERT(a, WideR8);
+#endif
+}
+
+static inline PackedR8 pack(WideR8 p) {
+#if USE_SSE2
+ auto m = expand(p);
+ auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m));
+ return SHUFFLE(r, r, 0, 1, 2, 3);
+#elif USE_NEON
+ return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p))));
+#else
+ return CONVERT(p, PackedR8);
+#endif
+}
+
+using ZMask4 = V4<int16_t>;
+using ZMask8 = V8<int16_t>;
+
+static inline PackedRGBA8 unpack(ZMask4 mask, uint32_t*) {
+ return bit_cast<PackedRGBA8>(mask.xxyyzzww);
+}
+
+static inline WideR8 unpack(ZMask4 mask, uint8_t*) {
+ return bit_cast<WideR8>(mask);
+}
+
+#if USE_SSE2
+# define ZMASK_NONE_PASSED 0xFFFF
+# define ZMASK_ALL_PASSED 0
+static inline uint32_t zmask_code(ZMask8 mask) {
+ return _mm_movemask_epi8(mask);
+}
+static inline uint32_t zmask_code(ZMask4 mask) {
+ return zmask_code(mask.xyzwxyzw);
+}
+#else
+using ZMask4Code = V4<uint8_t>;
+using ZMask8Code = V8<uint8_t>;
+# define ZMASK_NONE_PASSED 0xFFFFFFFFU
+# define ZMASK_ALL_PASSED 0
+static inline uint32_t zmask_code(ZMask4 mask) {
+ return bit_cast<uint32_t>(CONVERT(mask, ZMask4Code));
+}
+static inline uint32_t zmask_code(ZMask8 mask) {
+ return zmask_code(
+ ZMask4((U16(lowHalf(mask)) >> 12) | (U16(highHalf(mask)) << 4)));
+}
+#endif
+
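+// Test 8 consecutive 16-bit depth samples against a constant Z value. Returns
+// 0 if every pixel fails the test, -1 if every pixel passes, or 1 with outmask
+// marking the failed pixels when the results are mixed. If MASK is set, the
+// passing pixels have their depth values written back.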
+template <int FUNC, bool MASK>
+static ALWAYS_INLINE int check_depth8(uint16_t z, uint16_t* zbuf,
+ ZMask8& outmask) {
+ ZMask8 dest = unaligned_load<ZMask8>(zbuf);
+ ZMask8 src = int16_t(z);
+ // Invert the depth test to check which pixels failed and should be discarded.
+ ZMask8 mask = FUNC == GL_LEQUAL ?
+ // GL_LEQUAL: Not(LessEqual) = Greater
+ ZMask8(src > dest)
+ :
+ // GL_LESS: Not(Less) = GreaterEqual
+ ZMask8(src >= dest);
+ switch (zmask_code(mask)) {
+ case ZMASK_NONE_PASSED:
+ return 0;
+ case ZMASK_ALL_PASSED:
+ if (MASK) {
+ unaligned_store(zbuf, src);
+ }
+ return -1;
+ default:
+ if (MASK) {
+ unaligned_store(zbuf, (mask & dest) | (~mask & src));
+ }
+ outmask = mask;
+ return 1;
+ }
+}
+
+template <bool FULL_SPANS, bool DISCARD>
+static ALWAYS_INLINE bool check_depth4(ZMask4 src, uint16_t* zbuf,
+ ZMask4& outmask, int span = 0) {
+ ZMask4 dest = unaligned_load<ZMask4>(zbuf);
+ // Invert the depth test to check which pixels failed and should be discarded.
+ ZMask4 mask = ctx->depthfunc == GL_LEQUAL
+ ?
+ // GL_LEQUAL: Not(LessEqual) = Greater
+ ZMask4(src > dest)
+ :
+ // GL_LESS: Not(Less) = GreaterEqual
+ ZMask4(src >= dest);
+ if (!FULL_SPANS) {
+ mask |= ZMask4(span) < ZMask4{1, 2, 3, 4};
+ }
+ if (zmask_code(mask) == ZMASK_NONE_PASSED) {
+ return false;
+ }
+ if (!DISCARD && ctx->depthmask) {
+ unaligned_store(zbuf, (mask & dest) | (~mask & src));
+ }
+ outmask = mask;
+ return true;
+}
+
+template <bool FULL_SPANS, bool DISCARD>
+static ALWAYS_INLINE bool check_depth4(uint16_t z, uint16_t* zbuf,
+ ZMask4& outmask, int span = 0) {
+ return check_depth4<FULL_SPANS, DISCARD>(ZMask4(int16_t(z)), zbuf, outmask,
+ span);
+}
+
+template <typename T>
+static inline ZMask4 packZMask4(T a) {
+#if USE_SSE2
+ return lowHalf(bit_cast<ZMask8>(_mm_packs_epi32(a, a)));
+#elif USE_NEON
+ return vqmovn_s32(a);
+#else
+ return CONVERT(a, ZMask4);
+#endif
+}
+
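+// Convert gl_FragCoord.z values in [0,1] into the depth buffer's 16-bit
+// format: scaled to 0xFFFF and biased by -0x8000 to fit the signed range.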
+static ALWAYS_INLINE ZMask4 packDepth() {
+ return packZMask4(cast(fragment_shader->gl_FragCoord.z * 0xFFFF) - 0x8000);
+}
+
+static ALWAYS_INLINE void discard_depth(ZMask4 src, uint16_t* zbuf,
+ ZMask4 mask) {
+ if (ctx->depthmask) {
+ ZMask4 dest = unaligned_load<ZMask4>(zbuf);
+ mask |= packZMask4(fragment_shader->isPixelDiscarded);
+ unaligned_store(zbuf, (mask & dest) | (~mask & src));
+ }
+}
+
+static ALWAYS_INLINE void discard_depth(uint16_t z, uint16_t* zbuf,
+ ZMask4 mask) {
+ discard_depth(ZMask4(int16_t(z)), zbuf, mask);
+}
+
+static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) {
+ ivec4 i = round_pixel(v);
+ HalfRGBA8 xz = packRGBA8(i.z, i.x);
+ HalfRGBA8 yw = packRGBA8(i.y, i.w);
+ HalfRGBA8 xy = zipLow(xz, yw);
+ HalfRGBA8 zw = zipHigh(xz, yw);
+ HalfRGBA8 lo = zip2Low(xy, zw);
+ HalfRGBA8 hi = zip2High(xy, zw);
+ return combine(lo, hi);
+}
+
+static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) {
+ I32 i = round_pixel((Float){v.z, v.y, v.x, v.w});
+ HalfRGBA8 c = packRGBA8(i, i);
+ return combine(c, c);
+}
+
+static inline WideRGBA8 pack_pixels_RGBA8() {
+ return pack_pixels_RGBA8(fragment_shader->gl_FragColor);
+}
+
+template <typename V>
+static inline PackedRGBA8 pack_span(uint32_t*, const V& v) {
+ return pack(pack_pixels_RGBA8(v));
+}
+
+static inline PackedRGBA8 pack_span(uint32_t*) {
+ return pack(pack_pixels_RGBA8());
+}
+
+// (x*y + x) >> 8, cheap approximation of (x*y) / 255
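+// For example, muldiv255(255, 128) = (255*128 + 255) >> 8 = 32895 >> 8 = 128,
+// which matches the exact result of 255*128/255.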
+template <typename T>
+static inline T muldiv255(T x, T y) {
+ return (x * y + x) >> 8;
+}
+
+// Byte-wise addition for when x or y is a signed 8-bit value stored in the
+// low byte of a larger type T only with zeroed-out high bits, where T is
+// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used
+// upon signed operands, using up all the precision in a 16 bit integer, and
+// potentially losing the sign bit in the last >> 8 shift. Due to the
+// properties of two's complement arithmetic, even though we've discarded the
+// sign bit, we can still represent a negative number under addition (without
+// requiring any extra sign bits), just that any negative number will behave
+// like a large unsigned number under addition, generating a single carry bit
+// on overflow that we need to discard. Thus, just doing a byte-wise add will
+// overflow without the troublesome carry, giving us only the remaining 8 low
+// bits we actually need while keeping the high bits at zero.
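+// For example, with uint16_t lanes, addlow(0x00F0, 0x0020) yields 0x0010: the
+// byte-wise add wraps 0xF0 + 0x20 around to 0x10 and discards the carry, which
+// is the correct result for (-16) + 32 = 16 in two's complement.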
+template <typename T>
+static inline T addlow(T x, T y) {
+ typedef VectorType<uint8_t, sizeof(T)> bytes;
+ return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y));
+}
+
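+// Broadcast each pixel's alpha component across all four of its channels.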
+static inline WideRGBA8 alphas(WideRGBA8 c) {
+ return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15);
+}
+
+static inline WideRGBA8 blend_pixels_RGBA8(PackedRGBA8 pdst, WideRGBA8 src) {
+ WideRGBA8 dst = unpack(pdst);
+ const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF,
+ 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0,
+ 0xFFFF, 0xFFFF, 0xFFFF, 0};
+ const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF,
+ 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF};
+ const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255,
+ 0, 0, 0, 255, 0, 0, 0, 255};
+ switch (blend_key) {
+ case BLEND_KEY_NONE:
+ return src;
+ case BLEND_KEY(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE):
+ // dst + src.a*(src.rgb1 - dst.rgb0)
+ // use addlow for signed overflow
+ return addlow(dst,
+ muldiv255(alphas(src), (src | ALPHA_OPAQUE) - (dst & RGB_MASK)));
+ case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
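+      // src + dst*(1 - src.a): standard premultiplied-alpha "over" blending.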
+ return src + dst - muldiv255(dst, alphas(src));
+ case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR):
+ return dst - muldiv255(dst, src);
+ case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE):
+ return dst - (muldiv255(dst, src) & RGB_MASK);
+ case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA):
+ return dst - muldiv255(dst, alphas(src));
+ case BLEND_KEY(GL_ZERO, GL_SRC_COLOR):
+ return muldiv255(src, dst);
+ case BLEND_KEY(GL_ONE, GL_ONE):
+ return src + dst;
+ case BLEND_KEY(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA):
+ return src + dst - (muldiv255(dst, src) & ALPHA_MASK);
+ case BLEND_KEY(GL_ONE, GL_ZERO):
+ return src;
+ case BLEND_KEY(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE):
+ // src*(1-dst.a) + dst*1 = src - src*dst.a + dst
+ return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK);
+ case BLEND_KEY(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR):
+ // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst)
+ // use addlow for signed overflow
+ return addlow(dst,
+ muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst));
+ case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): {
+ WideRGBA8 secondary =
+ pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor);
+ return src + dst - muldiv255(dst, secondary);
+ }
+ default:
+ UNREACHABLE;
+ // return src;
+ }
+}
+
+template <bool DISCARD>
+static inline void discard_output(uint32_t* buf, PackedRGBA8 mask) {
+ PackedRGBA8 dst = unaligned_load<PackedRGBA8>(buf);
+ WideRGBA8 r = pack_pixels_RGBA8();
+ if (blend_key) r = blend_pixels_RGBA8(dst, r);
+ if (DISCARD) mask |= bit_cast<PackedRGBA8>(fragment_shader->isPixelDiscarded);
+ unaligned_store(buf, (mask & dst) | (~mask & pack(r)));
+}
+
+template <bool DISCARD>
+static inline void discard_output(uint32_t* buf) {
+ discard_output<DISCARD>(buf, 0);
+}
+
+template <>
+inline void discard_output<false>(uint32_t* buf) {
+ WideRGBA8 r = pack_pixels_RGBA8();
+ if (blend_key) r = blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), r);
+ unaligned_store(buf, pack(r));
+}
+
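+// Produce a mask that is all-ones for pixel lanes at index >= span, so that
+// partial spans leave the trailing destination pixels untouched.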
+static inline PackedRGBA8 span_mask_RGBA8(int span) {
+ return bit_cast<PackedRGBA8>(I32(span) < I32{1, 2, 3, 4});
+}
+
+static inline PackedRGBA8 span_mask(uint32_t*, int span) {
+ return span_mask_RGBA8(span);
+}
+
+static inline WideR8 pack_pixels_R8(Float c) {
+ return packR8(round_pixel(c));
+}
+
+static inline WideR8 pack_pixels_R8() {
+ return pack_pixels_R8(fragment_shader->gl_FragColor.x);
+}
+
+template <typename C>
+static inline PackedR8 pack_span(uint8_t*, C c) {
+ return pack(pack_pixels_R8(c));
+}
+
+static inline PackedR8 pack_span(uint8_t*) { return pack(pack_pixels_R8()); }
+
+static inline WideR8 blend_pixels_R8(WideR8 dst, WideR8 src) {
+ switch (blend_key) {
+ case BLEND_KEY_NONE:
+ return src;
+ case BLEND_KEY(GL_ZERO, GL_SRC_COLOR):
+ return muldiv255(src, dst);
+ case BLEND_KEY(GL_ONE, GL_ONE):
+ return src + dst;
+ case BLEND_KEY(GL_ONE, GL_ZERO):
+ return src;
+ default:
+ UNREACHABLE;
+ // return src;
+ }
+}
+
+template <bool DISCARD>
+static inline void discard_output(uint8_t* buf, WideR8 mask) {
+ WideR8 dst = unpack(unaligned_load<PackedR8>(buf));
+ WideR8 r = pack_pixels_R8();
+ if (blend_key) r = blend_pixels_R8(dst, r);
+ if (DISCARD) mask |= packR8(fragment_shader->isPixelDiscarded);
+ unaligned_store(buf, pack((mask & dst) | (~mask & r)));
+}
+
+template <bool DISCARD>
+static inline void discard_output(uint8_t* buf) {
+ discard_output<DISCARD>(buf, 0);
+}
+
+template <>
+inline void discard_output<false>(uint8_t* buf) {
+ WideR8 r = pack_pixels_R8();
+ if (blend_key) r = blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), r);
+ unaligned_store(buf, pack(r));
+}
+
+static inline WideR8 span_mask_R8(int span) {
+ return bit_cast<WideR8>(WideR8(span) < WideR8{1, 2, 3, 4});
+}
+
+static inline WideR8 span_mask(uint8_t*, int span) {
+ return span_mask_R8(span);
+}
+
+template <bool DISCARD, bool W, typename P, typename M>
+static inline void commit_output(P* buf, M mask) {
+ fragment_shader->run<W>();
+ discard_output<DISCARD>(buf, mask);
+}
+
+template <bool DISCARD, bool W, typename P>
+static inline void commit_output(P* buf) {
+ fragment_shader->run<W>();
+ discard_output<DISCARD>(buf);
+}
+
+template <bool DISCARD, bool W, typename P>
+static inline void commit_output(P* buf, int span) {
+ commit_output<DISCARD, W>(buf, span_mask(buf, span));
+}
+
+template <bool DISCARD, bool W, typename P, typename Z>
+static inline void commit_output(P* buf, Z z, uint16_t* zbuf) {
+ ZMask4 zmask;
+ if (check_depth4<true, DISCARD>(z, zbuf, zmask)) {
+ commit_output<DISCARD, W>(buf, unpack(zmask, buf));
+ if (DISCARD) {
+ discard_depth(z, zbuf, zmask);
+ }
+ } else {
+ fragment_shader->skip<W>();
+ }
+}
+
+template <bool DISCARD, bool W, typename P, typename Z>
+static inline void commit_output(P* buf, Z z, uint16_t* zbuf, int span) {
+ ZMask4 zmask;
+ if (check_depth4<false, DISCARD>(z, zbuf, zmask, span)) {
+ commit_output<DISCARD, W>(buf, unpack(zmask, buf));
+ if (DISCARD) {
+ discard_depth(z, zbuf, zmask);
+ }
+ }
+}
+
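+// Commit a packed 4-pixel span to the color buffer, blending against the
+// existing destination pixels when a blend mode is active.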
+static inline void commit_span(uint32_t* buf, PackedRGBA8 r) {
+ if (blend_key)
+ r = pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), unpack(r)));
+ unaligned_store(buf, r);
+}
+
+UNUSED static inline void commit_solid_span(uint32_t* buf, PackedRGBA8 r,
+ int len) {
+ if (blend_key) {
+ auto src = unpack(r);
+ for (uint32_t* end = &buf[len]; buf < end; buf += 4) {
+ unaligned_store(
+ buf, pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), src)));
+ }
+ } else {
+ fill_n(buf, len, bit_cast<U32>(r).x);
+ }
+}
+
+UNUSED static inline void commit_texture_span(uint32_t* buf, uint32_t* src,
+ int len) {
+ if (blend_key) {
+ for (uint32_t* end = &buf[len]; buf < end; buf += 4, src += 4) {
+ PackedRGBA8 r = unaligned_load<PackedRGBA8>(src);
+ unaligned_store(buf, pack(blend_pixels_RGBA8(
+ unaligned_load<PackedRGBA8>(buf), unpack(r))));
+ }
+ } else {
+ memcpy(buf, src, len * sizeof(uint32_t));
+ }
+}
+
+static inline void commit_span(uint8_t* buf, PackedR8 r) {
+ if (blend_key)
+ r = pack(blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), unpack(r)));
+ unaligned_store(buf, r);
+}
+
+UNUSED static inline void commit_solid_span(uint8_t* buf, PackedR8 r, int len) {
+ if (blend_key) {
+ auto src = unpack(r);
+ for (uint8_t* end = &buf[len]; buf < end; buf += 4) {
+ unaligned_store(buf, pack(blend_pixels_R8(
+ unpack(unaligned_load<PackedR8>(buf)), src)));
+ }
+ } else {
+ fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(r));
+ }
+}
+
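+// Let the shader's draw_span specialization draw as much of the span as it
+// can, advance the interpolants past the 4-pixel chunks it drew, then run the
+// generic shader on any remaining 4-pixel chunks.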
+#define DISPATCH_DRAW_SPAN(self, buf, len) do { \
+ int drawn = self->draw_span(buf, len); \
+ if (drawn) self->step_interp_inputs(drawn >> 2); \
+ for (buf += drawn; drawn < len; drawn += 4, buf += 4) { \
+ run(self); \
+ commit_span(buf, pack_span(buf)); \
+ } \
+} while (0)
+
+#include "texture.h"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
@@ -2627,14 +2627,942 @@ void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset,
#pragma GCC diagnostic ignored "-Wunused-variable"
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#ifdef __clang__
-# pragma GCC diagnostic ignored "-Wunused-private-field"
+#pragma GCC diagnostic ignored "-Wunused-private-field"
#else
-# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
#endif
#include "load_shader.h"
#pragma GCC diagnostic pop
-#include "rasterize.h"
+typedef vec2_scalar Point2D;
+typedef vec4_scalar Point3D;
+
+struct ClipRect {
+ float x0;
+ float y0;
+ float x1;
+ float y1;
+
+ ClipRect(const IntRect& i) : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
+ ClipRect(Texture& t) : ClipRect(ctx->apply_scissor(t.bounds())) {}
+
+ template <typename P>
+ bool overlaps(int nump, const P* p) const {
+ // Generate a mask of which side of the clip rect all of a polygon's points
+ // fall inside of. This is a cheap conservative estimate of whether the
+ // bounding box of the polygon might overlap the clip rect, rather than an
+ // exact test that would require multiple slower line intersections.
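+    // Bit 1 records that some point has x < x1 and bit 2 that some point has
+    // x > x0; bits 4 and 8 do the same for y. Only if all four bits are set
+    // can the polygon overlap the clip rect.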
+ int sides = 0;
+ for (int i = 0; i < nump; i++) {
+ sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
+ sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
+ }
+ return sides == 0xF;
+ }
+};
+
+// Helper function for drawing 8-pixel wide chunks of a span with a depth buffer.
+// Using 8-pixel chunks maximizes use of 16-bit depth values in a 128-bit wide
+// SIMD register. However, since fragment shaders process only 4 pixels per
+// invocation, we need to run the fragment shader twice for every 8-pixel batch
+// of results we get from the depth test. Perspective is not supported.
+template <int FUNC, bool MASK, typename P>
+static inline void draw_depth_span(uint16_t z, P* buf, uint16_t* depth,
+ int span) {
+ int skip = 0;
+ // Check if the fragment shader has an optimized draw specialization.
+ if (fragment_shader->has_draw_span(buf)) {
+ // The loop tries to accumulate runs of pixels that passed (len) and
+ // runs of pixels that failed (skip). This allows it to pass the largest
+ // possible span in between changes in depth pass or fail status to the
+    // fragment shader's draw specialization.
+ int len = 0;
+ do {
+ ZMask8 zmask;
+ // Process depth in 8-pixel chunks.
+ switch (check_depth8<FUNC, MASK>(z, depth, zmask)) {
+ case 0: // All pixels failed the depth test.
+ if (len) {
+ // Flush out passed pixels.
+ fragment_shader->draw_span(buf - len, len);
+ len = 0;
+ }
+ // Accumulate 2 skipped chunks.
+ skip += 2;
+ break;
+ case -1: // All pixels passed the depth test.
+ if (skip) {
+            // Flush out any skipped chunks.
+ fragment_shader->skip(skip);
+ skip = 0;
+ }
+ // Accumulate 8 passed pixels.
+ len += 8;
+ break;
+ default: // Mixture of pass and fail results.
+ if (len) {
+ // Flush out any passed pixels.
+ fragment_shader->draw_span(buf - len, len);
+ len = 0;
+ } else if (skip) {
+ // Flush out any skipped chunks.
+ fragment_shader->skip(skip);
+ skip = 0;
+ }
+ // Run fragment shader on first 4 depth results.
+ commit_output<false, false>(buf, unpack(lowHalf(zmask), buf));
+ // Run fragment shader on next 4 depth results.
+ commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf));
+ break;
+ }
+ // Advance to next 8 pixels...
+ buf += 8;
+ depth += 8;
+ span -= 8;
+ } while (span >= 8);
+ // Flush out any remaining passed pixels.
+ if (len) {
+ fragment_shader->draw_span(buf - len, len);
+ }
+ } else {
+ // No draw specialization, so we can use a simpler loop here that just
+    // accumulates depth failures, but otherwise invokes the fragment shader
+    // immediately whenever the depth test passes.
+ do {
+ ZMask8 zmask;
+ // Process depth in 8-pixel chunks.
+ switch (check_depth8<FUNC, MASK>(z, depth, zmask)) {
+ case 0: // All pixels failed the depth test.
+ // Accumulate 2 skipped chunks.
+ skip += 2;
+ break;
+ case -1: // All pixels passed the depth test.
+ if (skip) {
+ // Flush out any skipped chunks.
+ fragment_shader->skip(skip);
+ skip = 0;
+ }
+ // Run the fragment shader for two 4-pixel chunks.
+ commit_output<false, false>(buf);
+ commit_output<false, false>(buf + 4);
+ break;
+ default: // Mixture of pass and fail results.
+ if (skip) {
+ // Flush out any skipped chunks.
+ fragment_shader->skip(skip);
+ skip = 0;
+ }
+ // Run fragment shader on first 4 depth results.
+ commit_output<false, false>(buf, unpack(lowHalf(zmask), buf));
+ // Run fragment shader on next 4 depth results.
+ commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf));
+ break;
+ }
+ // Advance to next 8 pixels...
+ buf += 8;
+ depth += 8;
+ span -= 8;
+ } while (span >= 8);
+ }
+ // Flush out any remaining skipped chunks.
+ if (skip) {
+ fragment_shader->skip(skip);
+ }
+}
+
+// Draw a simple span in 4-pixel wide chunks, optionally using depth.
+template <bool DISCARD, bool W, typename P, typename Z>
+static ALWAYS_INLINE void draw_span(P* buf, uint16_t* depth, int span, Z z) {
+ if (depth) {
+ // Depth testing is enabled. If perspective is used, Z values will vary
+    // across the span, so we use packDepth to generate 16-bit Z values suitable
+ // for depth testing based on current values from gl_FragCoord.z.
+ // Otherwise, for the no-perspective case, we just use the provided Z.
+ // Process 4-pixel chunks first.
+ for (; span >= 4; span -= 4, buf += 4, depth += 4) {
+ commit_output<DISCARD, W>(buf, z(), depth);
+ }
+ // If there are any remaining pixels, do a partial chunk.
+ if (span > 0) {
+ commit_output<DISCARD, W>(buf, z(), depth, span);
+ }
+ } else {
+ // Process 4-pixel chunks first.
+ for (; span >= 4; span -= 4, buf += 4) {
+ commit_output<DISCARD, W>(buf);
+ }
+ // If there are any remaining pixels, do a partial chunk.
+ if (span > 0) {
+ commit_output<DISCARD, W>(buf, span);
+ }
+ }
+}
+
+// Draw spans for each row of a given quad (or triangle) with a constant Z
+// value. The quad is assumed convex. It is clipped to fall within the given
+// clip rect. In short, this function rasterizes a quad by first finding a
+// top-most starting point and then from there tracing down the left and right
+// sides of this quad until it hits the bottom, outputting a span between the
+// current left and right positions at each row along the way. Points must be
+// ordered consistently, either CW or CCW; both windings are handled
+// equivalently.
+template <typename P>
+static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z,
+ Interpolants interp_outs[4],
+ Texture& colortex, int layer,
+ Texture& depthtex,
+ const ClipRect& clipRect) {
+ // Only triangles and convex quads supported.
+ assert(nump == 3 || nump == 4);
+ Point2D l0, r0, l1, r1;
+ int l0i, r0i, l1i, r1i;
+ {
+ // Find the index of the top-most (smallest Y) point from which
+ // rasterization can start.
+ int top = nump > 3 && p[3].y < p[2].y
+ ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3)
+ : (p[1].y < p[3].y ? 1 : 3))
+ : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2)
+ : (p[1].y < p[2].y ? 1 : 2));
+ // Helper to find next index in the points array, walking forward.
+#define NEXT_POINT(idx) \
+ ({ \
+ int cur = (idx) + 1; \
+ cur < nump ? cur : 0; \
+ })
+ // Helper to find the previous index in the points array, walking backward.
+#define PREV_POINT(idx) \
+ ({ \
+ int cur = (idx)-1; \
+ cur >= 0 ? cur : nump - 1; \
+ })
+ // Start looking for "left"-side and "right"-side descending edges starting
+ // from the determined top point.
+ int next = NEXT_POINT(top);
+ int prev = PREV_POINT(top);
+ if (p[top].y == p[next].y) {
+ // If the next point is on the same row as the top, then advance one more
+ // time to the next point and use that as the "left" descending edge.
+ l0i = next;
+ l1i = NEXT_POINT(next);
+      // Assume top and prev form a descending "right" edge; otherwise this is
+      // a collapsed polygon and we harmlessly bail out further below.
+ r0i = top;
+ r1i = prev;
+ } else if (p[top].y == p[prev].y) {
+ // If the prev point is on the same row as the top, then advance to the
+ // prev again and use that as the "right" descending edge.
+ // Assume top and next form a non-empty descending "left" edge.
+ l0i = top;
+ l1i = next;
+ r0i = prev;
+ r1i = PREV_POINT(prev);
+ } else {
+ // Both next and prev are on distinct rows from top, so both "left" and
+ // "right" edges are non-empty/descending.
+ l0i = r0i = top;
+ l1i = next;
+ r1i = prev;
+ }
+ // Load the points from the indices.
+ l0 = p[l0i]; // Start of left edge
+ r0 = p[r0i]; // End of left edge
+ l1 = p[l1i]; // Start of right edge
+ r1 = p[r1i]; // End of right edge
+ // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1:
+ // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i,
+ // r1.x, r1.y);
+ }
+
+ struct Edge
+ {
+ float yScale;
+ float xSlope;
+ float x;
+ Interpolants interpSlope;
+ Interpolants interp;
+
+ Edge(float y, const Point2D& p0, const Point2D& p1,
+ const Interpolants& i0, const Interpolants& i1) :
+ // Inverse Y scale for slope calculations. Avoid divide on 0-length edge.
+ // Later checks below ensure that Y <= p1.y, or otherwise we don't use
+ // this edge. We just need to guard against Y == p1.y == p0.y. In that
+ // case, Y - p0.y == 0 and will cancel out the slopes below, except if
+ // yScale is Inf for some reason (or worse, NaN), which 1/(p1.y-p0.y)
+ // might produce if we don't bound it.
+ yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+ // Calculate dX/dY slope
+ xSlope((p1.x - p0.x) * yScale),
+ // Initialize current X based on Y and slope
+ x(p0.x + (y - p0.y) * xSlope),
+ // Calculate change in interpolants per change in Y
+ interpSlope((i1 - i0) * yScale),
+ // Initialize current interpolants based on Y and slope
+ interp(i0 + (y - p0.y) * interpSlope)
+ {}
+
+ void nextRow() {
+ // step current X and interpolants to next row from slope
+ x += xSlope;
+ interp += interpSlope;
+ }
+ };
+
+ // Vertex selection above should result in equal left and right start rows
+ assert(l0.y == r0.y);
+ // Find the start y, clip to within the clip rect, and round to row center.
+ float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+ // Initialize left and right edges from end points and start Y
+ Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ // Get pointer to color buffer and depth buffer at current Y
+ P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P));
+ uint16_t* fdepth =
+ (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t));
+ // Loop along advancing Ys, rasterizing spans at each row
+ float checkY = min(min(l1.y, r1.y), clipRect.y1);
+ for (;;) {
+ // Check if we maybe passed edge ends or outside clip rect...
+ if (y > checkY) {
+ // If we're outside the clip rect, we're done.
+ if (y > clipRect.y1) break;
+ // Helper to find the next non-duplicate vertex that doesn't loop back.
+#define STEP_EDGE(e0i, e0, e1i, e1, STEP_POINT, end) \
+ for (;;) { \
+ /* Set new start of edge to be end of old edge */ \
+ e0i = e1i; \
+ e0 = e1; \
+ /* Set new end of edge to next point */ \
+ e1i = STEP_POINT(e1i); \
+ e1 = p[e1i]; \
+ /* If the edge is descending, use it. */ \
+ if (e1.y > e0.y) break; \
+ /* If the edge is ascending or crossed the end, we're done. */ \
+ if (e1.y < e0.y || e0i == end) return; \
+ /* Otherwise, it's a duplicate, so keep searching. */ \
+ }
+ // Check if Y advanced past the end of the left edge
+ if (y > l1.y) {
+ // Step to next left edge past Y and reset edge interpolants.
+ do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y);
+ left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ }
+ // Check if Y advanced past the end of the right edge
+ if (y > r1.y) {
+ // Step to next right edge past Y and reset edge interpolants.
+ do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y);
+ right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ }
+ // Reset check condition for next time around.
+ checkY = min(min(l1.y, r1.y), clipRect.y1);
+ }
+ // lx..rx form the bounds of the span. WR does not use backface culling,
+ // so we need to use min/max to support the span in either orientation.
+ // Clip the span to fall within the clip rect and then round to nearest
+ // column.
+ int startx = int(max(min(left.x, right.x), clipRect.x0) + 0.5f);
+ int endx = int(min(max(left.x, right.x), clipRect.x1) + 0.5f);
+ // Check if span is non-empty.
+ int span = endx - startx;
+ if (span > 0) {
+ ctx->shaded_rows++;
+ ctx->shaded_pixels += span;
+ // Advance color/depth buffer pointers to the start of the span.
+ P* buf = fbuf + startx;
+      // Check if we will need to use the depth buffer or discard on this span.
+ uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr;
+ bool use_discard = fragment_shader->use_discard();
+ if (depthtex.delay_clear) {
+ // Delayed clear is enabled for the depth buffer. Check if this row
+ // needs to be cleared.
+ int yi = int(y);
+ uint32_t& mask = depthtex.cleared_rows[yi / 32];
+ if ((mask & (1 << (yi & 31))) == 0) {
+          // The depth buffer is uninitialized on this row, but we know it will
+          // eventually be cleared entirely to the clear value. This lets us quickly
+ // check the constant Z value of the quad against the clear Z to know
+ // if the entire span passes or fails the depth test all at once.
+ switch (ctx->depthfunc) {
+ case GL_LESS:
+ if (int16_t(z) < int16_t(depthtex.clear_val))
+ break;
+ else
+ goto next_span;
+ case GL_LEQUAL:
+ if (int16_t(z) <= int16_t(depthtex.clear_val))
+ break;
+ else
+ goto next_span;
+ }
+ // If we got here, we passed the depth test.
+ if (ctx->depthmask) {
+ // Depth writes are enabled, so we need to initialize depth.
+ mask |= 1 << (yi & 31);
+ depthtex.delay_clear--;
+ if (use_discard) {
+ // if discard is enabled, we don't know what pixels may be
+ // written to, so we have to clear the entire row.
+ force_clear_row<uint16_t>(depthtex, yi);
+ } else {
+ // Otherwise, we only need to clear the pixels that fall outside
+ // the current span on this row.
+ if (startx > 0 || endx < depthtex.width) {
+ force_clear_row<uint16_t>(depthtex, yi, startx, endx);
+ }
+ // Fill in the span's Z values with constant Z.
+ clear_buffer<uint16_t>(depthtex, z, 0,
+ IntRect{startx, yi, endx, yi + 1});
+ // We already passed the depth test, so no need to test depth
+ // any more.
+ depth = nullptr;
+ }
+ } else {
+ // No depth writes, so don't clear anything, and no need to test.
+ depth = nullptr;
+ }
+ }
+ }
+ if (colortex.delay_clear) {
+        // Delayed clear is enabled for the color buffer. Check if this row
+        // needs to be cleared.
+ int yi = int(y);
+ uint32_t& mask = colortex.cleared_rows[yi / 32];
+ if ((mask & (1 << (yi & 31))) == 0) {
+ mask |= 1 << (yi & 31);
+ colortex.delay_clear--;
+ if (depth || blend_key || use_discard) {
+ // If depth test, blending, or discard is used, old color values
+ // might be sampled, so we need to clear the entire row to fill it.
+ force_clear_row<P>(colortex, yi);
+ } else if (startx > 0 || endx < colortex.width) {
+ // Otherwise, we only need to clear the row outside of the span.
+ // The fragment shader will fill the row within the span itself.
+ force_clear_row<P>(colortex, yi, startx, endx);
+ }
+ }
+ }
+ // Initialize fragment shader interpolants to current span position.
+ fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+ fragment_shader->gl_FragCoord.y = y;
+ {
+ // Change in interpolants is difference between current right and left
+ // edges per the change in right and left X.
+ Interpolants step =
+ (right.interp - left.interp) * (1.0f / (right.x - left.x));
+ // Advance current interpolants to X at start of span.
+ Interpolants o = left.interp + step * (startx + 0.5f - left.x);
+ fragment_shader->init_span(&o, &step, 4.0f);
+ }
+ if (!use_discard) {
+ // Fast paths for the case where fragment discard is not used.
+ if (depth) {
+ // If depth is used, we want to process spans in 8-pixel chunks to
+ // maximize sampling and testing 16-bit depth values within the 128-
+ // bit width of a SIMD register.
+ if (span >= 8) {
+ // Specializations for supported depth functions depending on
+ // whether depth writes are enabled.
+ if (ctx->depthfunc == GL_LEQUAL) {
+ if (ctx->depthmask)
+ draw_depth_span<GL_LEQUAL, true>(z, buf, depth, span);
+ else
+ draw_depth_span<GL_LEQUAL, false>(z, buf, depth, span);
+ } else {
+ if (ctx->depthmask)
+ draw_depth_span<GL_LESS, true>(z, buf, depth, span);
+ else
+ draw_depth_span<GL_LESS, false>(z, buf, depth, span);
+ }
+ // Advance buffers past processed chunks.
+ buf += span & ~7;
+ depth += span & ~7;
+ span &= 7;
+ }
+ } else {
+ // Check if the fragment shader has an optimized draw specialization.
+ if (span >= 4 && fragment_shader->has_draw_span(buf)) {
+ // Draw specialization expects 4-pixel chunks.
+ int len = span & ~3;
+ fragment_shader->draw_span(buf, len);
+ buf += len;
+ span &= 3;
+ }
+ }
+ draw_span<false, false>(buf, depth, span, [=]{ return z; });
+ } else {
+ // If discard is used, then use slower fallbacks. This should be rare.
+ // Just needs to work, doesn't need to be too fast yet...
+ draw_span<true, false>(buf, depth, span, [=]{ return z; });
+ }
+ }
+ next_span:
+ // Advance Y and edge interpolants to next row.
+ y++;
+ left.nextRow();
+ right.nextRow();
+ // Advance buffers to next row.
+ fbuf += colortex.stride(sizeof(P)) / sizeof(P);
+ fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t);
+ }
+}
+
+// Draw perspective-correct spans for a convex quad that has been clipped to
+// the near and far Z planes, possibly producing a clipped convex polygon with
+// more than 4 sides. This assumes the Z value will vary across the spans and
+// requires interpolants to factor in W values. This tends to be slower than
+// the simpler 2D draw_quad_spans above, especially since we can't optimize the
+// depth test easily when Z values vary, and should be used only rarely if
+// possible.
+template <typename P>
+static inline void draw_perspective_spans(int nump, Point3D* p,
+ Interpolants* interp_outs,
+ Texture& colortex, int layer,
+ Texture& depthtex,
+ const ClipRect& clipRect) {
+ Point3D l0, r0, l1, r1;
+ int l0i, r0i, l1i, r1i;
+ {
+ // Find the index of the top-most point (smallest Y) from which
+ // rasterization can start.
+ int top = 0;
+ for (int i = 1; i < nump; i++) {
+ if (p[i].y < p[top].y) {
+ top = i;
+ }
+ }
+ // Find left-most top point, the start of the left descending edge.
+ // Advance forward in the points array, searching at most nump points
+ // in case the polygon is flat.
+ l0i = top;
+ for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
+ l0i = i;
+ }
+ if (l0i == nump - 1) {
+ for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
+ l0i = i;
+ }
+ }
+ // Find right-most top point, the start of the right descending edge.
+ // Advance backward in the points array, searching at most nump points.
+ r0i = top;
+ for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
+ r0i = i;
+ }
+ if (r0i == 0) {
+ for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
+ r0i = i;
+ }
+ }
+ // End of left edge is next point after left edge start.
+ l1i = NEXT_POINT(l0i);
+ // End of right edge is prev point after right edge start.
+ r1i = PREV_POINT(r0i);
+ l0 = p[l0i]; // Start of left edge
+ r0 = p[r0i]; // End of left edge
+ l1 = p[l1i]; // Start of right edge
+ r1 = p[r1i]; // End of right edge
+ }
+
+ struct Edge
+ {
+ float yScale;
+    // Current coordinates for the edge. Whereas in the 2D case of
+    // draw_quad_spans it is enough to just track the X coordinate as we
+    // advance along the rows, in the perspective case we also need to keep
+    // track of Z and W. For simplicity, we just use the full 3D point to
+    // track all these coordinates.
+ Point3D pSlope;
+ Point3D p;
+ Interpolants interpSlope;
+ Interpolants interp;
+
+ Edge(float y, const Point3D& p0, const Point3D& p1,
+ const Interpolants& i0, const Interpolants& i1) :
+ // Inverse Y scale for slope calculations. Avoid divide on 0-length edge.
+ yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+      // Calculate the change in position (X, Z, and W) per change in Y
+ pSlope((p1 - p0) * yScale),
+ // Initialize current coords based on Y and slope
+ p(p0 + (y - p0.y) * pSlope),
+ // Crucially, these interpolants must be scaled by the point's 1/w value,
+ // which allows linear interpolation in a perspective-correct manner.
+ // This will be canceled out inside the fragment shader later.
+ // Calculate change in interpolants per change in Y
+ interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
+ // Initialize current interpolants based on Y and slope
+ interp(i0 * p0.w + (y - p0.y) * interpSlope)
+ {}
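+    // Illustrative note: because Point3D::w stores 1/W after projection,
+    // interp effectively carries attribute/W, which interpolates linearly in
+    // screen space; the fragment shader later uses the interpolated
+    // gl_FragCoord.w to cancel that 1/W factor and recover the true value.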
+
+ float x() const { return p.x; }
+ vec2_scalar zw() const { return {p.z, p.w}; }
+
+ void nextRow() {
+ // step current coords and interpolants to next row from slope
+ p += pSlope;
+ interp += interpSlope;
+ }
+ };
+
+ // Vertex selection above should result in equal left and right start rows
+ assert(l0.y == r0.y);
+ // Find the start y, clip to within the clip rect, and round to row center.
+ float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+ // Initialize left and right edges from end points and start Y
+ Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ // Get pointer to color buffer and depth buffer at current Y
+ P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P));
+ uint16_t* fdepth =
+ (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t));
+ // Loop along advancing Ys, rasterizing spans at each row
+ float checkY = min(min(l1.y, r1.y), clipRect.y1);
+ for (;;) {
+ // Check if we maybe passed edge ends or outside clip rect...
+ if (y > checkY) {
+ // If we're outside the clip rect, we're done.
+ if (y > clipRect.y1) break;
+ // Check if Y advanced past the end of the left edge
+ if (y > l1.y) {
+ // Step to next left edge past Y and reset edge interpolants.
+ do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y);
+ left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+ }
+ // Check if Y advanced past the end of the right edge
+ if (y > r1.y) {
+ // Step to next right edge past Y and reset edge interpolants.
+ do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y);
+ right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+ }
+ // Reset check condition for next time around.
+ checkY = min(min(l1.y, r1.y), clipRect.y1);
+ }
+    // left.x()..right.x() form the bounds of the span. WR does not use
+    // backface culling, so we need to use min/max to support the span in
+    // either orientation.
+ // Clip the span to fall within the clip rect and then round to nearest
+ // column.
+ int startx = int(max(min(left.x(), right.x()), clipRect.x0) + 0.5f);
+ int endx = int(min(max(left.x(), right.x()), clipRect.x1) + 0.5f);
+ // Check if span is non-empty.
+ int span = endx - startx;
+ if (span > 0) {
+ ctx->shaded_rows++;
+ ctx->shaded_pixels += span;
+ // Advance color/depth buffer pointers to the start of the span.
+ P* buf = fbuf + startx;
+      // Check if we need to use the depth buffer or discard on this span.
+ uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr;
+ bool use_discard = fragment_shader->use_discard();
+ if (depthtex.delay_clear) {
+ // Delayed clear is enabled for the depth buffer. Check if this row
+ // needs to be cleared.
+ int yi = int(y);
+ uint32_t& mask = depthtex.cleared_rows[yi / 32];
+ if ((mask & (1 << (yi & 31))) == 0) {
+ mask |= 1 << (yi & 31);
+ depthtex.delay_clear--;
+ // Since Z varies across the span, it's easier to just clear the
+ // row and rely on later depth testing. If necessary, this could be
+ // optimized to test against the start and end Z values of the span
+ // here.
+ force_clear_row<uint16_t>(depthtex, yi);
+ }
+ }
+ if (colortex.delay_clear) {
+ // Delayed clear is enabled for the color buffer. Check if needs clear.
+ int yi = int(y);
+ uint32_t& mask = colortex.cleared_rows[yi / 32];
+ if ((mask & (1 << (yi & 31))) == 0) {
+ mask |= 1 << (yi & 31);
+ colortex.delay_clear--;
+ if (depth || blend_key || use_discard) {
+ // If depth test, blending, or discard is used, old color values
+ // might be sampled, so we need to clear the entire row to fill it.
+ force_clear_row<P>(colortex, yi);
+ } else if (startx > 0 || endx < colortex.width) {
+ // Otherwise, we only need to clear the row outside of the span.
+ // The fragment shader will fill the row within the span itself.
+ force_clear_row<P>(colortex, yi, startx, endx);
+ }
+ }
+ }
+ // Initialize fragment shader interpolants to current span position.
+ fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+ fragment_shader->gl_FragCoord.y = y;
+ {
+ // Calculate the fragment Z and W change per change in fragment X step.
+ vec2_scalar stepZW =
+ (right.zw() - left.zw()) * (1.0f / (right.x() - left.x()));
+ // Calculate initial Z and W values for span start.
+ vec2_scalar zw = left.zw() + stepZW * (startx + 0.5f - left.x());
+ // Set fragment shader's Z and W values so that it can use them to
+ // cancel out the 1/w baked into the interpolants.
+ fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
+ fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
+ fragment_shader->stepZW = stepZW * 4.0f;
+ // Change in interpolants is difference between current right and left
+ // edges per the change in right and left X. The left and right
+        // interpolant values were previously multiplied by 1/w, so the step and
+ // initial span values take this into account.
+ Interpolants step =
+ (right.interp - left.interp) * (1.0f / (right.x() - left.x()));
+ // Advance current interpolants to X at start of span.
+ Interpolants o = left.interp + step * (startx + 0.5f - left.x());
+ fragment_shader->init_span<true>(&o, &step, 4.0f);
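+        // The 4.0f factors above are assumed to match the 4-pixel chunk that
+        // each fragment shader step advances in draw_span.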
+ }
+ if (!use_discard) {
+ // No discard is used. Common case.
+ draw_span<false, true>(buf, depth, span, packDepth);
+ } else {
+ // Discard is used. Rare.
+ draw_span<true, true>(buf, depth, span, packDepth);
+ }
+ }
+ // Advance Y and edge interpolants to next row.
+ y++;
+ left.nextRow();
+ right.nextRow();
+ // Advance buffers to next row.
+ fbuf += colortex.stride(sizeof(P)) / sizeof(P);
+ fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t);
+ }
+}
+
+// Clip a primitive against both sides of a view-frustum axis, producing
+// intermediate vertexes with interpolated attributes that will no longer
+// intersect the selected axis planes. This assumes the primitive is convex
+// and should produce at most N+2 vertexes for each invocation (only in the
+// worst case where one point falls outside on each of the opposite sides
+// with the rest of the points inside).
+template <XYZW AXIS>
+static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
+ Interpolants* outInterp) {
+ int numClip = 0;
+ Point3D prev = p[nump - 1];
+ Interpolants prevInterp = interp[nump - 1];
+ float prevCoord = prev.select(AXIS);
+ // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
+ // if so, remember which side it is outside of.
+ int prevSide = prevCoord < -prev.w ? -1 : (prevCoord > prev.w ? 1 : 0);
+ // Loop through points, finding edges that cross the planes by evaluating
+ // the side at each point.
+ for (int i = 0; i < nump; i++) {
+ Point3D cur = p[i];
+ Interpolants curInterp = interp[i];
+ float curCoord = cur.select(AXIS);
+ int curSide = curCoord < -cur.w ? -1 : (curCoord > cur.w ? 1 : 0);
+ // Check if the previous and current end points are on different sides.
+ if (curSide != prevSide) {
+ // One of the edge's end points is outside the plane with the other
+ // inside the plane. Find the offset where it crosses the plane and
+ // adjust the point and interpolants to there.
+ if (prevSide) {
+ // Edge that was previously outside crosses inside.
+ // Evaluate plane equation for previous and current end-point
+ // based on previous side and calculate relative offset.
+ assert(numClip < nump + 2);
+ float prevDist = prevCoord - prevSide * prev.w;
+ float curDist = curCoord - prevSide * cur.w;
+ float k = prevDist / (prevDist - curDist);
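+        // Sketch of the derivation: dist(t) = coord(t) - prevSide * w(t) is
+        // linear in t with dist(0) = prevDist and dist(1) = curDist, so the
+        // crossing dist(k) = 0 occurs at k = prevDist / (prevDist - curDist).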
+ outP[numClip] = prev + (cur - prev) * k;
+ outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+ numClip++;
+ }
+ if (curSide) {
+ // Edge that was previously inside crosses outside.
+ // Evaluate plane equation for previous and current end-point
+ // based on current side and calculate relative offset.
+ assert(numClip < nump + 2);
+ float prevDist = prevCoord - curSide * prev.w;
+ float curDist = curCoord - curSide * cur.w;
+ float k = prevDist / (prevDist - curDist);
+ outP[numClip] = prev + (cur - prev) * k;
+ outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+ numClip++;
+ }
+ }
+ if (!curSide) {
+ // The current end point is inside the plane, so output point unmodified.
+ assert(numClip < nump + 2);
+ outP[numClip] = cur;
+ outInterp[numClip] = curInterp;
+ numClip++;
+ }
+ prev = cur;
+ prevInterp = curInterp;
+ prevCoord = curCoord;
+ prevSide = curSide;
+ }
+ return numClip;
+}
+
+// Helper function to dispatch to perspective span drawing with points that
+// have already been transformed and clipped.
+static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
+ Interpolants* interp_clip,
+ Texture& colortex, int layer,
+ Texture& depthtex) {
+  // If polygon is outside clip rect, nothing to draw.
+ ClipRect clipRect(colortex);
+ if (!clipRect.overlaps(nump, p_clip)) {
+ return;
+ }
+
+ // Finally draw perspective-correct spans for the polygon.
+ if (colortex.internal_format == GL_RGBA8) {
+ draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex,
+ layer, depthtex, clipRect);
+ } else if (colortex.internal_format == GL_R8) {
+ draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex,
+ layer, depthtex, clipRect);
+ } else {
+ assert(false);
+ }
+}
+
+// Draws a perspective-correct 3D primitive with varying Z value, as opposed
+// to a simple 2D planar primitive with a constant Z value that could be
+// trivially Z rejected. This requires clipping the primitive against the near
+// and far planes to ensure it stays within the valid Z-buffer range. The Z
+// and W of each fragment of the primitives are interpolated across the
+// generated spans and then depth-tested as appropriate.
+// Additionally, vertex attributes must be interpolated with perspective-
+// correction by dividing by W before interpolation, and then later multiplied
+// by W again to produce the final correct attribute value for each fragment.
+// This process is expensive and should be avoided if possible for primitive
+// batches that are known ahead of time to not need perspective-correction.
+static void draw_perspective(int nump,
+ Interpolants interp_outs[4],
+ Texture& colortex, int layer,
+ Texture& depthtex) {
+ // Convert output of vertex shader to screen space.
+ vec4 pos = vertex_shader->gl_Position;
+ vec3_scalar scale =
+ vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f;
+ vec3_scalar offset =
+ vec3_scalar(ctx->viewport.x0, ctx->viewport.y0, 0.0f) + scale;
+ if (test_none(pos.z <= -pos.w || pos.z >= pos.w)) {
+ // No points cross the near or far planes, so no clipping required.
+ // Just divide coords by W and convert to viewport.
+ Float w = 1.0f / pos.w;
+ vec3 screen = pos.sel(X, Y, Z) * w * scale + offset;
+ Point3D p[4] = {
+ {screen.x.x, screen.y.x, screen.z.x, w.x},
+ {screen.x.y, screen.y.y, screen.z.y, w.y},
+ {screen.x.z, screen.y.z, screen.z.z, w.z},
+ {screen.x.w, screen.y.w, screen.z.w, w.w}
+ };
+ draw_perspective_clipped(nump, p, interp_outs, colortex, layer, depthtex);
+ } else {
+ // Points cross the near or far planes, so we need to clip.
+ // Start with the original 3 or 4 points...
+ Point3D p[4] = {
+ {pos.x.x, pos.y.x, pos.z.x, pos.w.x},
+ {pos.x.y, pos.y.y, pos.z.y, pos.w.y},
+ {pos.x.z, pos.y.z, pos.z.z, pos.w.z},
+ {pos.x.w, pos.y.w, pos.z.w, pos.w.w}
+ };
+ // Clipping can expand the points by 1 for each of 6 view frustum planes.
+ Point3D p_clip[4 + 6];
+ Interpolants interp_clip[4 + 6];
+ // Clip against near and far Z planes.
+ nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip);
+ // If no points are left inside the view frustum, there's nothing to draw.
+ if (nump < 3) {
+ return;
+ }
+ // After clipping against only the near and far planes, we might still
+ // produce points where W = 0, exactly at the camera plane. OpenGL specifies
+ // that for clip coordinates, points must satisfy:
+ // -W <= X <= W
+ // -W <= Y <= W
+ // -W <= Z <= W
+ // When Z = W = 0, this is trivially satisfied, but when we transform and
+ // divide by W below it will produce a divide by 0. Usually we want to only
+ // clip Z to avoid the extra work of clipping X and Y. We can still project
+ // points that fall outside the view frustum X and Y so long as Z is valid.
+ // The span drawing code will then ensure X and Y are clamped to viewport
+    // boundaries. However, in the Z = W = 0 case, clipping against X and Y
+    // can sometimes push W further inside the view frustum so that it is no
+    // longer 0, allowing us to finally proceed with projecting the points to
+    // the screen.
+ for (int i = 0; i < nump; i++) {
+ // Found an invalid W, so need to clip against X and Y...
+ if (p_clip[i].w <= 0.0f) {
+ // Ping-pong p_clip -> p_tmp -> p_clip.
+ Point3D p_tmp[4 + 6];
+ Interpolants interp_tmp[4 + 6];
+ nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp);
+ if (nump < 3) return;
+ nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip);
+ if (nump < 3) return;
+      // After clipping against the X and Y planes, there are still points
+      // left to draw, so proceed with projection now...
+ break;
+ }
+ }
+ // Divide coords by W and convert to viewport.
+ for (int i = 0; i < nump; i++) {
+ float w = 1.0f / p_clip[i].w;
+ p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w);
+ }
+ draw_perspective_clipped(nump, p_clip, interp_clip, colortex, layer,
+ depthtex);
+ }
+}
+
+static void draw_quad(int nump, Texture& colortex, int layer,
+ Texture& depthtex) {
+ // Run vertex shader once for the primitive's vertices.
+  // Only 4 sets of vertex interpolants are needed here; the perspective path
+  // allocates its own expanded storage when clipping against the near and far
+  // planes.
+ Interpolants interp_outs[4];
+ vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants));
+ vec4 pos = vertex_shader->gl_Position;
+ // Check if any vertex W is different from another. If so, use perspective.
+ if (test_any(pos.w != pos.w.x)) {
+ draw_perspective(nump, interp_outs, colortex, layer, depthtex);
+ return;
+ }
+
+ // Convert output of vertex shader to screen space.
+ // Divide coords by W and convert to viewport.
+ float w = 1.0f / pos.w.x;
+ vec2 screen =
+ (pos.sel(X, Y) * w + 1) * 0.5f *
+ vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
+ vec2_scalar(ctx->viewport.x0, ctx->viewport.y0);
+ Point2D p[4] = {{screen.x.x, screen.y.x},
+ {screen.x.y, screen.y.y},
+ {screen.x.z, screen.y.z},
+ {screen.x.w, screen.y.w}};
+
+  // If quad is outside clip rect, nothing to draw.
+ ClipRect clipRect(colortex);
+ if (!clipRect.overlaps(nump, p)) {
+ return;
+ }
+
+ // Since the quad is assumed 2D, Z is constant across the quad.
+ float screenZ = (pos.z.x * w + 1) * 0.5f;
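+  // pos.z/w is NDC Z in [-1, 1]; this remaps it to [0, 1] for the depth range
+  // check below.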
+ if (screenZ < 0 || screenZ > 1) {
+ // Z values would cross the near or far plane, so just bail.
+ return;
+ }
+ // Since Z doesn't need to be interpolated, just set the fragment shader's
+ // Z and W values here, once and for all fragment shader invocations.
+ // SSE2 does not support unsigned comparison, so bias Z to be negative.
+ uint16_t z = uint16_t(0xFFFF * screenZ) - 0x8000;
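+  // Illustrative mapping: screenZ == 0 gives 0x0000 - 0x8000, i.e. -32768 as
+  // int16_t, and screenZ == 1 gives 0xFFFF - 0x8000 == 32767, so signed 16-bit
+  // comparisons order depth values correctly.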
+ fragment_shader->gl_FragCoord.z = screenZ;
+ fragment_shader->gl_FragCoord.w = w;
+
+ // Finally draw 2D spans for the quad. Currently only supports drawing to
+ // RGBA8 and R8 color buffers.
+ if (colortex.internal_format == GL_RGBA8) {
+ draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, layer,
+ depthtex, clipRect);
+ } else if (colortex.internal_format == GL_R8) {
+ draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, layer, depthtex,
+ clipRect);
+ } else {
+ assert(false);
+ }
+}
void VertexArray::validate() {
int last_enabled = -1;
@@ -2653,32 +3581,78 @@ void VertexArray::validate() {
max_attrib = last_enabled;
}
+template <typename INDEX>
+static inline void draw_elements(GLsizei count, GLsizei instancecount,
+ Buffer& indices_buf, size_t offset,
+ VertexArray& v, Texture& colortex, int layer,
+ Texture& depthtex) {
+ assert((offset & (sizeof(INDEX) - 1)) == 0);
+ INDEX* indices = (INDEX*)(indices_buf.buf + offset);
+ count = min(count,
+ (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
+ // Triangles must be indexed at offsets 0, 1, 2.
+ // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
+ if (count == 6 && indices[1] == indices[0] + 1 &&
+ indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
+ assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
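+    // Illustrative: an index stream {7, 8, 9, 9, 8, 10} matches this pattern
+    // with base vertex 7, so vertices 7..10 are drawn as a single quad.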
+ // Fast path - since there is only a single quad, we only load per-vertex
+ // attribs once for all instances, as they won't change across instances
+ // or within an instance.
+ vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
+ draw_quad(4, colortex, layer, depthtex);
+ for (GLsizei instance = 1; instance < instancecount; instance++) {
+ vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
+ draw_quad(4, colortex, layer, depthtex);
+ }
+ } else {
+ for (GLsizei instance = 0; instance < instancecount; instance++) {
+ for (GLsizei i = 0; i + 3 <= count; i += 3) {
+ if (indices[i + 1] != indices[i] + 1 ||
+ indices[i + 2] != indices[i] + 2) {
+ continue;
+ }
+ int nump = 3;
+ if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
+ assert(indices[i + 3] == indices[i] + 2 &&
+ indices[i + 4] == indices[i] + 1);
+ nump = 4;
+ i += 3;
+ }
+ vertex_shader->load_attribs(v.attribs, indices[i], instance, nump);
+ draw_quad(nump, colortex, layer, depthtex);
+ }
+ }
+ }
+}
+
extern "C" {
void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
- GLintptr offset, GLsizei instancecount) {
- if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader ||
- !fragment_shader) {
+ void* indicesptr, GLsizei instancecount) {
+ assert(mode == GL_TRIANGLES);
+ assert(type == GL_UNSIGNED_SHORT || type == GL_UNSIGNED_INT);
+ if (count <= 0 || instancecount <= 0) {
return;
}
- Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true);
- if (!fb.color_attachment) {
- return;
- }
+ Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER);
Texture& colortex = ctx->textures[fb.color_attachment];
if (!colortex.buf) {
return;
}
- assert(!colortex.locked);
assert(colortex.internal_format == GL_RGBA8 ||
colortex.internal_format == GL_R8);
Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0];
if (depthtex.buf) {
- assert(depthtex.internal_format == GL_DEPTH_COMPONENT24);
+ assert(depthtex.internal_format == GL_DEPTH_COMPONENT16);
assert(colortex.width == depthtex.width &&
colortex.height == depthtex.height);
- assert(colortex.offset == depthtex.offset);
+ }
+
+ Buffer& indices_buf = ctx->buffers[ctx->element_array_buffer_binding];
+ size_t offset = (size_t)indicesptr;
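+  // As in core GL, with an element array buffer bound the indices "pointer"
+  // is interpreted as a byte offset into that buffer.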
+ if (!indices_buf.buf || offset >= indices_buf.size) {
+ return;
}
// debugf("current_vertex_array %d\n", ctx->current_vertex_array);
@@ -2689,8 +3663,8 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
v.validate();
}
-#ifdef PRINT_TIMINGS
- uint64_t start = get_time_value();
+#ifndef NDEBUG
+ // uint64_t start = get_time_value();
#endif
ctx->shaded_rows = 0;
@@ -2698,43 +3672,14 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
vertex_shader->init_batch();
- switch (type) {
- case GL_UNSIGNED_SHORT:
- assert(mode == GL_TRIANGLES);
- draw_elements<uint16_t>(count, instancecount, offset, v, colortex,
- depthtex);
- break;
- case GL_UNSIGNED_INT:
- assert(mode == GL_TRIANGLES);
- draw_elements<uint32_t>(count, instancecount, offset, v, colortex,
- depthtex);
- break;
- case GL_NONE:
- // Non-standard GL extension - if element type is GL_NONE, then we don't
- // use any element buffer and behave as if DrawArrays was called instead.
- for (GLsizei instance = 0; instance < instancecount; instance++) {
- switch (mode) {
- case GL_LINES:
- for (GLsizei i = 0; i + 2 <= count; i += 2) {
- vertex_shader->load_attribs(v.attribs, offset + i, instance, 2);
- draw_quad(2, colortex, depthtex);
- }
- break;
- case GL_TRIANGLES:
- for (GLsizei i = 0; i + 3 <= count; i += 3) {
- vertex_shader->load_attribs(v.attribs, offset + i, instance, 3);
- draw_quad(3, colortex, depthtex);
- }
- break;
- default:
- assert(false);
- break;
- }
- }
- break;
- default:
- assert(false);
- break;
+ if (type == GL_UNSIGNED_SHORT) {
+ draw_elements<uint16_t>(count, instancecount, indices_buf, offset, v,
+ colortex, fb.layer, depthtex);
+ } else if (type == GL_UNSIGNED_INT) {
+ draw_elements<uint32_t>(count, instancecount, indices_buf, offset, v,
+ colortex, fb.layer, depthtex);
+ } else {
+ assert(false);
}
if (ctx->samples_passed_query) {
@@ -2742,66 +3687,329 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
q.value += ctx->shaded_pixels;
}
-#ifdef PRINT_TIMINGS
- uint64_t end = get_time_value();
- printf(
- "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, "
- "%fns/pixel)\n",
- double(end - start) / (1000. * 1000.),
- ctx->programs[ctx->current_program].impl->get_name(), instancecount,
- ctx->shaded_pixels, ctx->shaded_rows,
- double(ctx->shaded_pixels) / ctx->shaded_rows,
- double(end - start) / max(ctx->shaded_pixels, 1));
+#ifndef NDEBUG
+ // uint64_t end = get_time_value();
+ // debugf("draw(%d): %fms for %d pixels in %d rows (avg %f pixels/row, %f
+ // ns/pixel)\n", instancecount, double(end - start)/(1000.*1000.),
+ // ctx->shaded_pixels, ctx->shaded_rows,
+ // double(ctx->shaded_pixels)/ctx->shaded_rows, double(end -
+ // start)/max(ctx->shaded_pixels, 1));
#endif
}
-void Finish() {
-#ifdef PRINT_TIMINGS
- printf("Finish\n");
-#endif
+} // extern "C"
+
+template <typename P>
+static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
+ int span) {
+ int frac = 0;
+ for (P* end = dst + span; dst < end; dst++) {
+ *dst = *src;
+ // Step source according to width ratio.
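+    // Example: with srcWidth == 2 and dstWidth == 3, frac reaches 2, then 4,
+    // then 3 across the three dest pixels, so src advances after the 2nd and
+    // 3rd pixels and the span samples source indices 0, 0, 1.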
+ for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
+ src++;
+ }
+ }
}
-void MakeCurrent(Context* c) {
- if (ctx == c) {
+static void scale_blit(Texture& srctex, const IntRect& srcReq, int srcZ,
+ Texture& dsttex, const IntRect& dstReq, int dstZ,
+ bool invertY) {
+ // Cache scaling ratios
+ int srcWidth = srcReq.width();
+ int srcHeight = srcReq.height();
+ int dstWidth = dstReq.width();
+ int dstHeight = dstReq.height();
+ // Compute valid dest bounds
+ IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+ // Compute valid source bounds
+ // Scale source to dest, rounding inward to avoid sampling outside source
+ IntRect srcBounds = srctex.sample_bounds(srcReq)
+ .scale(srcWidth, srcHeight, dstWidth, dstHeight, true);
+ // Limit dest sampling bounds to overlap source bounds
+ dstBounds.intersect(srcBounds);
+ // Check if sampling bounds are empty
+ if (dstBounds.is_empty()) {
return;
}
- ctx = c;
- setup_program(ctx ? ctx->current_program : 0);
+ // Compute final source bounds from clamped dest sampling bounds
+ srcBounds = IntRect(dstBounds)
+ .scale(dstWidth, dstHeight, srcWidth, srcHeight);
+ // Calculate source and dest pointers from clamped offsets
+ int bpp = srctex.bpp();
+ int srcStride = srctex.stride(bpp);
+ int destStride = dsttex.stride(bpp);
+ char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY);
+ char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ);
+ // Inverted Y must step downward along dest rows
+ if (invertY) {
+ destStride = -destStride;
+ }
+ int span = dstBounds.width();
+ int frac = 0;
+ for (int rows = dstBounds.height(); rows > 0; rows--) {
+ if (srcWidth == dstWidth) {
+ // No scaling, so just do a fast copy.
+ memcpy(dest, src, span * bpp);
+ } else {
+ // Do scaling with different source and dest widths.
+ switch (bpp) {
+ case 1:
+ scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span);
+ break;
+ case 2:
+ scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span);
+ break;
+ case 4:
+ scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ dest += destStride;
+ // Step source according to height ratio.
+ for (frac += srcHeight; frac >= dstHeight; frac -= dstHeight) {
+ src += srcStride;
+ }
+ }
+}
+
+static void linear_row(uint32_t* dest, int span, const vec2_scalar& srcUV,
+ float srcDU, int srcZOffset, sampler2DArray sampler) {
+ vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+ for (; span >= 4; span -= 4) {
+ auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+ unaligned_store(dest, srcpx);
+ dest += 4;
+ uv.x += 4 * srcDU;
+ }
+ if (span > 0) {
+ auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+ auto mask = span_mask_RGBA8(span);
+ auto dstpx = unaligned_load<PackedRGBA8>(dest);
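+    // For the <4 pixel tail, the mask lanes keep the existing dest pixels and
+    // the remaining lanes take the freshly sampled source pixels.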
+ unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+ }
}
-Context* CreateContext() { return new Context; }
+static void linear_row(uint8_t* dest, int span, const vec2_scalar& srcUV,
+ float srcDU, int srcZOffset, sampler2DArray sampler) {
+ vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+ for (; span >= 4; span -= 4) {
+ auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+ unaligned_store(dest, pack(srcpx));
+ dest += 4;
+ uv.x += 4 * srcDU;
+ }
+ if (span > 0) {
+ auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+ auto mask = span_mask_R8(span);
+ auto dstpx = unpack(unaligned_load<PackedR8>(dest));
+ unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx)));
+ }
+}
-void ReferenceContext(Context* c) {
- if (!c) {
+static void linear_blit(Texture& srctex, const IntRect& srcReq, int srcZ,
+ Texture& dsttex, const IntRect& dstReq, int dstZ,
+ bool invertY) {
+ assert(srctex.internal_format == GL_RGBA8 ||
+ srctex.internal_format == GL_R8);
+ // Compute valid dest bounds
+ IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+ // Check if sampling bounds are empty
+ if (dstBounds.is_empty()) {
return;
}
- ++c->references;
+ // Initialize sampler for source texture
+ sampler2DArray_impl sampler;
+ init_sampler(&sampler, srctex);
+ init_depth(&sampler, srctex);
+ sampler.filter = TextureFilter::LINEAR;
+ // Compute source UVs
+ int srcZOffset = srcZ * sampler.height_stride;
+ vec2_scalar srcUV(srcReq.x0, srcReq.y0);
+ vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
+ float(srcReq.height()) / dstReq.height());
+ // Skip to clamped source start
+ srcUV += srcDUV * vec2_scalar(dstBounds.x0, dstBounds.y0);
+ // Offset source UVs to texel centers and scale by lerp precision
+ srcUV = linearQuantize(srcUV + 0.5f, 128);
+ srcDUV *= 128.0f;
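+  // Note: quantizing UVs to 1/128th-texel units is assumed to give the linear
+  // filter a 7-bit subtexel fraction to lerp with.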
+ // Calculate dest pointer from clamped offsets
+ int bpp = dsttex.bpp();
+ int destStride = dsttex.stride(bpp);
+ char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY);
+ // Inverted Y must step downward along dest rows
+ if (invertY) {
+ destStride = -destStride;
+ }
+ int span = dstBounds.width();
+ for (int rows = dstBounds.height(); rows > 0; rows--) {
+ switch (bpp) {
+ case 1:
+ linear_row((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+ &sampler);
+ break;
+ case 4:
+ linear_row((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+ &sampler);
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ dest += destStride;
+ srcUV.y += srcDUV.y;
+ }
}
-void DestroyContext(Context* c) {
- if (!c) {
+extern "C" {
+
+void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+ GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+ GLbitfield mask, GLenum filter) {
+ assert(mask == GL_COLOR_BUFFER_BIT);
+ Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
+ if (!srcfb || srcfb->layer < 0) return;
+ Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
+ if (!dstfb || dstfb->layer < 0) return;
+ Texture& srctex = ctx->textures[srcfb->color_attachment];
+ if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return;
+ Texture& dsttex = ctx->textures[dstfb->color_attachment];
+ if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return;
+ if (srctex.internal_format != dsttex.internal_format) {
+ assert(false);
return;
}
- assert(c->references > 0);
- --c->references;
- if (c->references > 0) {
+ // Force flipped Y onto dest coordinates
+ if (srcY1 < srcY0) {
+ swap(srcY0, srcY1);
+ swap(dstY0, dstY1);
+ }
+ bool invertY = dstY1 < dstY0;
+ if (invertY) {
+ swap(dstY0, dstY1);
+ }
+ IntRect srcReq = {srcX0, srcY0, srcX1, srcY1};
+ IntRect dstReq = {dstX0, dstY0, dstX1, dstY1};
+ if (srcReq.is_empty() || dstReq.is_empty()) {
return;
}
- if (ctx == c) {
- MakeCurrent(nullptr);
+ prepare_texture(srctex);
+ prepare_texture(dsttex, &dstReq);
+ if (!srcReq.same_size(dstReq) && filter == GL_LINEAR &&
+ (srctex.internal_format == GL_RGBA8 ||
+ srctex.internal_format == GL_R8)) {
+ linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+ invertY);
+ } else {
+ scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+ invertY);
}
- delete c;
}
-size_t ReportMemory(size_t (*size_of_op)(void*)) {
- size_t size = 0;
+void Finish() {}
+
+void MakeCurrent(void* ctx_ptr) {
+ ctx = (Context*)ctx_ptr;
if (ctx) {
- for (auto& t : ctx->textures) {
- if (t && t->should_free()) {
- size += size_of_op(t->buf);
+ setup_program(ctx->current_program);
+ blend_key = ctx->blend ? ctx->blend_key : BLEND_KEY_NONE;
+ } else {
+ setup_program(0);
+ blend_key = BLEND_KEY_NONE;
+ }
+}
+
+void* CreateContext() { return new Context; }
+
+void DestroyContext(void* ctx_ptr) {
+ if (!ctx_ptr) {
+ return;
+ }
+ if (ctx == ctx_ptr) {
+ MakeCurrent(nullptr);
+ }
+ delete (Context*)ctx_ptr;
+}
+
+void Composite(GLuint srcId, GLint srcX, GLint srcY, GLsizei srcWidth,
+ GLsizei srcHeight, GLint dstX, GLint dstY, GLboolean opaque,
+ GLboolean flip) {
+ Framebuffer& fb = ctx->framebuffers[0];
+ if (!fb.color_attachment) {
+ return;
+ }
+ Texture& srctex = ctx->textures[srcId];
+ if (!srctex.buf) return;
+ prepare_texture(srctex);
+ Texture& dsttex = ctx->textures[fb.color_attachment];
+ if (!dsttex.buf) return;
+ assert(srctex.bpp() == 4);
+ const int bpp = 4;
+ size_t src_stride = srctex.stride(bpp);
+ size_t dest_stride = dsttex.stride(bpp);
+ if (srcY < 0) {
+ dstY -= srcY;
+ srcHeight += srcY;
+ srcY = 0;
+ }
+ if (dstY < 0) {
+ srcY -= dstY;
+ srcHeight += dstY;
+ dstY = 0;
+ }
+ if (srcY + srcHeight > srctex.height) {
+ srcHeight = srctex.height - srcY;
+ }
+ if (dstY + srcHeight > dsttex.height) {
+ srcHeight = dsttex.height - dstY;
+ }
+ IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight};
+ prepare_texture(dsttex, &skip);
+ char* dest = dsttex.sample_ptr(dstX, flip ? dsttex.height - 1 - dstY : dstY,
+ fb.layer, bpp, dest_stride);
+ char* src = srctex.sample_ptr(srcX, srcY, 0, bpp, src_stride);
+ if (flip) {
+ dest_stride = -dest_stride;
+ }
+ if (opaque) {
+ for (int y = 0; y < srcHeight; y++) {
+ memcpy(dest, src, srcWidth * bpp);
+ dest += dest_stride;
+ src += src_stride;
+ }
+ } else {
+ for (int y = 0; y < srcHeight; y++) {
+ char* end = src + srcWidth * bpp;
+ while (src + 4 * bpp <= end) {
+ WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+ WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
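+        // Premultiplied-alpha OVER: dst' = src + dst * (1 - src.a), where
+        // muldiv255(dstpx, alphas(srcpx)) computes dst * src.a / 255.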
+ PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+ unaligned_store(dest, r);
+ src += 4 * bpp;
+ dest += 4 * bpp;
}
+ if (src < end) {
+ WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+ WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+ U32 r = bit_cast<U32>(
+ pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))));
+ unaligned_store(dest, r.x);
+ if (src + bpp < end) {
+ unaligned_store(dest + bpp, r.y);
+ if (src + 2 * bpp < end) {
+ unaligned_store(dest + 2 * bpp, r.z);
+ }
+ }
+ dest += end - src;
+ src = end;
+ }
+ dest += dest_stride - srcWidth * bpp;
+ src += src_stride - srcWidth * bpp;
}
}
- return size;
}
+
} // extern "C"
diff --git a/third_party/webrender/swgl/src/gl_defs.h b/third_party/webrender/swgl/src/gl_defs.h
index 22219366ecf..c7e87230a3d 100644
--- a/third_party/webrender/swgl/src/gl_defs.h
+++ b/third_party/webrender/swgl/src/gl_defs.h
@@ -15,27 +15,20 @@ typedef float GLfloat;
typedef double GLdouble;
typedef uint32_t GLenum;
-typedef uint8_t GLboolean;
+typedef int32_t GLboolean;
typedef uint32_t GLbitfield;
typedef int32_t GLsizei;
typedef size_t GLsizeiptr;
typedef intptr_t GLintptr;
-#define GL_FALSE 0
-#define GL_TRUE 1
-
-#define GL_NONE 0
-
#define GL_NO_ERROR 0
#define GL_RGBA32F 0x8814
#define GL_RGBA8 0x8058
#define GL_R8 0x8229
-#define GL_R16 0x822A
#define GL_RGBA32I 0x8D82
#define GL_BGRA8 0x93A1
-#define GL_RG8 0x822B
#define GL_BYTE 0x1400
#define GL_UNSIGNED_BYTE 0x1401
@@ -44,7 +37,6 @@ typedef intptr_t GLintptr;
#define GL_INT 0x1404
#define GL_UNSIGNED_INT 0x1405
#define GL_FLOAT 0x1406
-#define GL_DOUBLE 0x1408
#define GL_RED 0x1903
#define GL_GREEN 0x1904
@@ -54,7 +46,6 @@ typedef intptr_t GLintptr;
#define GL_RGBA 0x1908
#define GL_RGBA_INTEGER 0x8D99
#define GL_BGRA 0x80E1
-#define GL_RG 0x8227
#define GL_DEPTH_COMPONENT 0x1902
#define GL_DEPTH_COMPONENT16 0x81A5
@@ -155,8 +146,6 @@ typedef intptr_t GLintptr;
#define GL_ONE_MINUS_SRC1_ALPHA 0x88FB
#define GL_FUNC_ADD 0x8006
-#define GL_MIN 0x8007
-#define GL_MAX 0x8008
#define GL_NEVER 0x0200
#define GL_LESS 0x0201
@@ -176,9 +165,6 @@ typedef intptr_t GLintptr;
#define GL_VERSION 0x1F02
#define GL_EXTENSIONS 0x1F03
#define GL_NUM_EXTENSIONS 0x821D
-#define GL_MINOR_VERSION 0x821C
-#define GL_MAJOR_VERSION 0x821B
-#define GL_SHADING_LANGUAGE_VERSION 0x8B8C
#define GL_POINTS 0x0000
#define GL_LINES 0x0001
@@ -188,29 +174,3 @@ typedef intptr_t GLintptr;
#define GL_TRIANGLE_STRIP 0x0005
#define GL_TRIANGLE_FAN 0x0006
#define GL_QUADS 0x0007
-
-#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367
-
-#define GL_RGB_422_APPLE 0x8A1F
-#define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA
-#define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB
-#define GL_RGB_RAW_422_APPLE 0x8A51
-
-#define GL_MULTIPLY_KHR 0x9294
-#define GL_SCREEN_KHR 0x9295
-#define GL_OVERLAY_KHR 0x9296
-#define GL_DARKEN_KHR 0x9297
-#define GL_LIGHTEN_KHR 0x9298
-#define GL_COLORDODGE_KHR 0x9299
-#define GL_COLORBURN_KHR 0x929A
-#define GL_HARDLIGHT_KHR 0x929B
-#define GL_SOFTLIGHT_KHR 0x929C
-#define GL_DIFFERENCE_KHR 0x929E
-#define GL_EXCLUSION_KHR 0x92A0
-#define GL_HSL_HUE_KHR 0x92AD
-#define GL_HSL_SATURATION_KHR 0x92AE
-#define GL_HSL_COLOR_KHR 0x92AF
-#define GL_HSL_LUMINOSITY_KHR 0x92B0
-
-#define SWGL_BLEND_DROP_SHADOW 0xB001
-#define SWGL_BLEND_SUBPIXEL_TEXT 0xB002
diff --git a/third_party/webrender/swgl/src/glsl.h b/third_party/webrender/swgl/src/glsl.h
index bec63858b0d..cdedb43d567 100644
--- a/third_party/webrender/swgl/src/glsl.h
+++ b/third_party/webrender/swgl/src/glsl.h
@@ -2,45 +2,14 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+// Some of this is copied from Skia and is governed by a BSD-style license
+// Every function in this file should be marked static and inline using SI.
#define SI ALWAYS_INLINE static
#include "vector_type.h"
namespace glsl {
-enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8, RG8, R16, YUV422 };
-
-enum TextureFilter { NEAREST, LINEAR };
-
-struct samplerCommon {
- uint32_t* buf = nullptr;
- uint32_t stride = 0; // in units of BPP if < 4, or dwords if BPP >= 4
- uint32_t height = 0;
- uint32_t width = 0;
- TextureFormat format = TextureFormat::RGBA8;
-};
-
-struct samplerFilter {
- TextureFilter filter = TextureFilter::NEAREST;
-};
-
-struct sampler2D_impl : samplerCommon, samplerFilter {};
-typedef sampler2D_impl* sampler2D;
-
-typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8;
-typedef struct sampler2DRG8_impl : sampler2D_impl{} * sampler2DRG8;
-typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8;
-typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F;
-
-struct isampler2D_impl : samplerCommon {};
-typedef isampler2D_impl* isampler2D;
-
-struct isampler2DRGBA32I_impl : isampler2D_impl {};
-typedef isampler2DRGBA32I_impl* isampler2DRGBA32I;
-
-struct sampler2DRect_impl : samplerCommon, samplerFilter {};
-typedef sampler2DRect_impl* sampler2DRect;
-
#if USE_SSE2
SI bool test_all(Bool cond) { return _mm_movemask_ps(cond) == 0xF; }
SI bool test_any(Bool cond) { return _mm_movemask_ps(cond) != 0; }
@@ -49,14 +18,9 @@ SI bool test_none(Bool cond) { return _mm_movemask_ps(cond) == 0; }
SI bool test_all(Bool cond) {
return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0xFFFFFFFFU;
}
-SI bool test_any(Bool cond) {
- return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0;
-}
-SI bool test_none(Bool cond) {
- return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0;
-}
+SI bool test_any(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0; }
+SI bool test_none(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0; }
#endif
-SI bool test_equal(Bool cond) { return test_none(cond != cond.x); }
float make_float(float n) { return n; }
@@ -110,23 +74,17 @@ struct vec4;
struct ivec2;
SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; }
-SI int32_t if_then_else(bool c, int32_t t, int32_t e) { return c ? t : e; }
SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; }
SI Float if_then_else(I32 c, float t, float e) {
- return bit_cast<Float>((c & bit_cast<I32>(Float(t))) |
- (~c & bit_cast<I32>(Float(e))));
+ return bit_cast<Float>((c & bit_cast<I32>(Float(t))) | (~c & bit_cast<I32>(Float(e))));
}
SI I32 if_then_else(I32 c, int32_t t, int32_t e) {
return (c & I32(t)) | (~c & I32(e));
}
-SI U32 if_then_else(I32 c, U32 t, U32 e) {
- return bit_cast<U32>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e)));
-}
-
SI Float if_then_else(I32 c, Float t, Float e) {
return bit_cast<Float>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e)));
}
@@ -137,10 +95,7 @@ SI Bool if_then_else(I32 c, Bool t, Bool e) { return (c & t) | (~c & e); }
SI Bool if_then_else(int32_t c, Bool t, Bool e) { return c ? t : e; }
-SI I16 if_then_else(I16 c, I16 t, I16 e) { return (c & t) | (~c & e); }
-
-template <typename T>
-SI void swap(T& a, T& b) {
+template <typename T> SI void swap(T& a, T& b) {
T t(a);
a = b;
b = t;
@@ -201,37 +156,7 @@ SI Float sqrt(Float v) {
#endif
}
-SI float recip(float x) {
-#if USE_SSE2
- return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x)));
-#else
- return 1.0f / x;
-#endif
-}
-
-// Use a fast vector reciprocal approximation when available. This should only
-// be used in cases where it is okay that the approximation is imprecise -
-// essentially visually correct but numerically wrong. Otherwise just rely on
-// however the compiler would implement slower division if the platform doesn't
-// provide a convenient intrinsic.
-SI Float recip(Float v) {
-#if USE_SSE2
- return _mm_rcp_ps(v);
-#elif USE_NEON
- Float e = vrecpeq_f32(v);
- return vrecpsq_f32(v, e) * e;
-#else
- return 1.0f / v;
-#endif
-}
-
-SI float inversesqrt(float x) {
-#if USE_SSE2
- return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
-#else
- return 1.0f / sqrtf(x);
-#endif
-}
+SI float inversesqrt(float x) { return 1.0f / sqrtf(x); }
SI Float inversesqrt(Float v) {
#if USE_SSE2
@@ -269,45 +194,18 @@ enum XYZW {
A = 3,
};
-struct bvec4_scalar;
-
struct bvec2_scalar {
bool x;
bool y;
bvec2_scalar() : bvec2_scalar(false) {}
- IMPLICIT constexpr bvec2_scalar(bool a) : x(a), y(a) {}
+ constexpr bvec2_scalar(bool a) : x(a), y(a) {}
constexpr bvec2_scalar(bool x, bool y) : x(x), y(y) {}
-
- bool& select(XYZW c) {
- switch (c) {
- case X:
- return x;
- case Y:
- return y;
- default:
- UNREACHABLE;
- }
- }
- bool sel(XYZW c1) { return select(c1); }
-
- bvec2_scalar sel(XYZW c1, XYZW c2) {
- return bvec2_scalar(select(c1), select(c2));
- }
- bvec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
-};
-
-struct bvec2_scalar1 {
- bool x;
-
- IMPLICIT constexpr bvec2_scalar1(bool a) : x(a) {}
-
- operator bvec2_scalar() const { return bvec2_scalar(x); }
};
struct bvec2 {
bvec2() : bvec2(0) {}
- IMPLICIT bvec2(Bool a) : x(a), y(a) {}
+ bvec2(Bool a) : x(a), y(a) {}
bvec2(Bool x, Bool y) : x(x), y(y) {}
Bool& select(XYZW c) {
switch (c) {
@@ -321,15 +219,13 @@ struct bvec2 {
}
Bool sel(XYZW c1) { return select(c1); }
- bvec2 sel(XYZW c1, XYZW c2) { return bvec2(select(c1), select(c2)); }
-
bvec2 operator~() { return bvec2(~x, ~y); }
Bool x;
Bool y;
};
-bvec2_scalar1 make_bvec2(bool n) { return bvec2_scalar1(n); }
+bvec2_scalar make_bvec2(bool n) { return bvec2_scalar{n, n}; }
bvec2_scalar make_bvec2(bool x, bool y) { return bvec2_scalar{x, y}; }
@@ -353,8 +249,8 @@ struct vec2_scalar {
float y;
constexpr vec2_scalar() : vec2_scalar(0.0f) {}
- IMPLICIT constexpr vec2_scalar(float a) : x(a), y(a) {}
- IMPLICIT constexpr vec2_scalar(int a) : x(a), y(a) {}
+ constexpr vec2_scalar(float a) : x(a), y(a) {}
+ constexpr vec2_scalar(int a) : x(a), y(a) {}
constexpr vec2_scalar(float x, float y) : x(x), y(y) {}
float& select(XYZW c) {
@@ -390,9 +286,6 @@ struct vec2_scalar {
friend vec2_scalar operator*(vec2_scalar a, vec2_scalar b) {
return vec2_scalar(a.x * b.x, a.y * b.y);
}
- friend vec2_scalar operator/(vec2_scalar a, float b) {
- return vec2_scalar(a.x / b, a.y / b);
- }
friend vec2_scalar operator/(vec2_scalar a, vec2_scalar b) {
return vec2_scalar(a.x / b.x, a.y / b.y);
}
@@ -415,12 +308,6 @@ struct vec2_scalar {
return *this;
}
- vec2_scalar operator/=(vec2_scalar a) {
- x /= a.x;
- y /= a.y;
- return *this;
- }
-
vec2_scalar operator+=(vec2_scalar a) {
x += a.x;
y += a.y;
@@ -469,12 +356,12 @@ struct vec2 {
typedef float element_type;
constexpr vec2() : vec2(Float(0.0f)) {}
- IMPLICIT constexpr vec2(Float a) : x(a), y(a) {}
+ constexpr vec2(Float a) : x(a), y(a) {}
vec2(Float x, Float y) : x(x), y(y) {}
- IMPLICIT constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {}
+ constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {}
constexpr vec2(vec2_scalar s0, vec2_scalar s1, vec2_scalar s2, vec2_scalar s3)
: x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}) {}
- explicit vec2(ivec2 a);
+ vec2(ivec2 a);
Float x;
Float y;
@@ -583,7 +470,6 @@ vec2 operator*(vec2_scalar a, Float b) { return vec2(a.x * b, a.y * b); }
vec2 operator*(Float a, vec2_scalar b) { return vec2(a * b.x, a * b.y); }
SI vec2 min(vec2 a, vec2 b) { return vec2(min(a.x, b.x), min(a.y, b.y)); }
-SI vec2 min(vec2 a, Float b) { return vec2(min(a.x, b), min(a.y, b)); }
SI vec2_scalar min(vec2_scalar a, vec2_scalar b) {
return vec2_scalar{min(a.x, b.x), min(a.y, b.y)};
@@ -599,12 +485,8 @@ vec2 step(vec2 edge, vec2 x) {
return vec2(step(edge.x, x.x), step(edge.y, x.y));
}
-vec2_scalar step(vec2_scalar edge, vec2_scalar x) {
- return vec2_scalar(step(edge.x, x.x), step(edge.y, x.y));
-}
-
-SI vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); }
-SI vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); }
+vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); }
+vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); }
SI vec2_scalar max(vec2_scalar a, vec2_scalar b) {
return vec2_scalar{max(a.x, b.x), max(a.y, b.y)};
@@ -617,31 +499,9 @@ Float length(vec2 a) { return sqrt(a.x * a.x + a.y * a.y); }
float length(vec2_scalar a) { return hypotf(a.x, a.y); }
-template <typename A, typename B>
-SI auto distance(A a, B b) {
- return length(a - b);
-}
+SI Float distance(vec2 a, vec2 b) { return length(a - b); }
-template <typename T>
-SI T normalize(T a) {
- return a / length(a);
-}
-
-SI vec2 sqrt(vec2 a) { return vec2(sqrt(a.x), sqrt(a.y)); }
-
-SI vec2_scalar sqrt(vec2_scalar a) { return vec2_scalar(sqrt(a.x), sqrt(a.y)); }
-
-SI vec2 recip(vec2 a) { return vec2(recip(a.x), recip(a.y)); }
-
-SI vec2_scalar recip(vec2_scalar a) {
- return vec2_scalar(recip(a.x), recip(a.y));
-}
-
-SI vec2 inversesqrt(vec2 a) { return vec2(inversesqrt(a.x), inversesqrt(a.y)); }
-
-SI vec2_scalar inversesqrt(vec2_scalar a) {
- return vec2_scalar(inversesqrt(a.x), inversesqrt(a.y));
-}
+SI vec2 normalize(vec2 a) { return a / length(a); }
#define abs __glsl_abs
@@ -657,13 +517,6 @@ Float abs(Float v) {
#endif
}
-float sign(float a) { return copysignf(1.0f, a); }
-
-Float sign(Float v) {
- return bit_cast<Float>((bit_cast<I32>(v) & 0x80000000) |
- bit_cast<I32>(Float(1.0f)));
-}
-
Float cast(U32 v) { return CONVERT((I32)v, Float); }
Float cast(I32 v) { return CONVERT((I32)v, Float); }
I32 cast(Float v) { return CONVERT(v, I32); }
@@ -725,22 +578,17 @@ SI I32 roundfast(Float v, Float scale) {
#endif
}
-template <typename T>
-SI auto round_pixel(T v, float scale = 255.0f) {
- return roundfast(v, scale);
-}
+template <typename T> SI auto round_pixel(T v) { return roundfast(v, 255.0f); }
#define round __glsl_round
float round(float a) { return roundf(a); }
-Float round(Float v) { return floor(v + 0.5f); }
-
float fract(float a) { return a - floor(a); }
-Float fract(Float v) { return v - floor(v); }
+Float round(Float v) { return floor(v + 0.5f); }
-vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); }
+Float fract(Float v) { return v - floor(v); }
// X derivatives can be approximated by dFdx(x) = x[1] - x[0].
// Y derivatives are not easily available since we operate in terms of X spans
@@ -748,15 +596,11 @@ vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); }
// uniform scaling, and thus abs(dFdx(p.x)) + abs(dFdy(p.x)) = abs(dFdx(p.x)) +
// abs(dFdx(p.y)) which mirrors abs(dFdx(p.y)) + abs(dFdy(p.y)) = abs(dFdx(p.y))
// + abs(dFdx(p.x)).
-vec2_scalar fwidth(vec2 p) {
+vec2 fwidth(vec2 p) {
Float d = abs(SHUFFLE(p.x, p.y, 1, 1, 5, 5) - SHUFFLE(p.x, p.y, 0, 0, 4, 4));
- return vec2_scalar(d.x + d.z);
+ return vec2(d.xyxy + d.zwzw);
}
-float dFdx(Float x) { return x.y - x.x; }
-
-vec2_scalar dFdx(vec2 p) { return vec2_scalar(dFdx(p.x), dFdx(p.y)); }
-
// See
// http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
Float approx_log2(Float x) {
@@ -768,7 +612,6 @@ Float approx_log2(Float x) {
return e - 124.225514990f - 1.498030302f * m -
1.725879990f / (0.3520887068f + m);
}
-
Float approx_pow2(Float x) {
Float f = fract(x);
return bit_cast<Float>(
@@ -776,41 +619,16 @@ Float approx_pow2(Float x) {
27.728023300f / (4.84252568f - f)));
}
-#define pow __glsl_pow
-
-SI float pow(float x, float y) { return powf(x, y); }
-
+// From skia
Float pow(Float x, Float y) {
return if_then_else((x == 0) | (x == 1), x, approx_pow2(approx_log2(x) * y));
}
-#define exp __glsl_exp
-
-SI float exp(float x) { return expf(x); }
-
Float exp(Float y) {
- float l2e = 1.4426950408889634074f;
- return approx_pow2(l2e * y);
+ float x = 2.718281828459045235360287471352;
+ return approx_pow2(log2f(x) * y);
}
-#define exp2 __glsl_exp2
-
-SI float exp2(float x) { return exp2f(x); }
-
-Float exp2(Float x) { return approx_pow2(x); }
-
-#define log __glsl_log
-
-SI float log(float x) { return logf(x); }
-
-Float log(Float x) { return approx_log2(x) * 0.69314718f; }
-
-#define log2 __glsl_log2
-
-SI float log2(float x) { return log2f(x); }
-
-Float log2(Float x) { return approx_log2(x); }
-
struct ivec4;
struct ivec2_scalar {
@@ -820,7 +638,7 @@ struct ivec2_scalar {
int32_t y;
ivec2_scalar() : ivec2_scalar(0) {}
- IMPLICIT constexpr ivec2_scalar(int32_t a) : x(a), y(a) {}
+ constexpr ivec2_scalar(int32_t a) : x(a), y(a) {}
constexpr ivec2_scalar(int32_t x, int32_t y) : x(x), y(y) {}
int32_t& select(XYZW c) {
@@ -838,8 +656,6 @@ struct ivec2_scalar {
return ivec2_scalar{select(c1), select(c2)};
}
- ivec2_scalar operator-() const { return ivec2_scalar{-x, -y}; }
-
ivec2_scalar& operator+=(ivec2_scalar a) {
x += a.x;
y += a.y;
@@ -864,25 +680,17 @@ struct ivec2_scalar {
friend ivec2_scalar operator+(ivec2_scalar a, ivec2_scalar b) {
return ivec2_scalar{a.x + b.x, a.y + b.y};
}
-
- friend ivec2_scalar operator-(ivec2_scalar a, ivec2_scalar b) {
- return ivec2_scalar{a.x - b.x, a.y - b.y};
- }
-
- friend bool operator==(const ivec2_scalar& l, const ivec2_scalar& r) {
- return l.x == r.x && l.y == r.y;
- }
};
struct ivec2 {
typedef int32_t element_type;
ivec2() : ivec2(I32(0)) {}
- IMPLICIT ivec2(I32 a) : x(a), y(a) {}
+ ivec2(I32 a) : x(a), y(a) {}
ivec2(I32 x, I32 y) : x(x), y(y) {}
- IMPLICIT ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {}
+ ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {}
ivec2(U32 x, U32 y) : x(CONVERT(x, I32)), y(CONVERT(y, I32)) {}
- IMPLICIT constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {}
+ constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {}
constexpr ivec2(ivec2_scalar s0, ivec2_scalar s1, ivec2_scalar s2,
ivec2_scalar s3)
: x(I32{s0.x, s1.x, s2.x, s3.x}), y(I32{s0.y, s1.y, s2.y, s3.y}) {}
@@ -973,7 +781,7 @@ struct ivec3_scalar {
int32_t z;
ivec3_scalar() : ivec3_scalar(0) {}
- IMPLICIT constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {}
+ constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {}
constexpr ivec3_scalar(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {}
int32_t& select(XYZW c) {
@@ -996,7 +804,7 @@ struct ivec3_scalar {
struct ivec3 {
ivec3() : ivec3(0) {}
- IMPLICIT ivec3(I32 a) : x(a), y(a), z(a) {}
+ ivec3(I32 a) : x(a), y(a), z(a) {}
ivec3(I32 x, I32 y, I32 z) : x(x), y(y), z(z) {}
ivec3(ivec2 a, I32 b) : x(a.x), y(a.y), z(b) {}
ivec3(vec2 a, Float b) : x(cast(a.x)), y(cast(a.y)), z(cast(b)) {}
@@ -1047,7 +855,7 @@ struct ivec4_scalar {
int32_t w;
ivec4_scalar() : ivec4_scalar(0) {}
- IMPLICIT constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {}
+ constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {}
constexpr ivec4_scalar(int32_t x, int32_t y, int32_t z, int32_t w)
: x(x), y(y), z(z), w(w) {}
@@ -1073,31 +881,16 @@ struct ivec4_scalar {
friend ivec4_scalar operator&(int32_t a, ivec4_scalar b) {
return ivec4_scalar{a & b.x, a & b.y, a & b.z, a & b.w};
}
-
- int32_t& operator[](int index) {
- switch (index) {
- case 0:
- return x;
- case 1:
- return y;
- case 2:
- return z;
- case 3:
- return w;
- default:
- UNREACHABLE;
- }
- }
};
struct ivec4 {
typedef int32_t element_type;
ivec4() : ivec4(I32(0)) {}
- IMPLICIT ivec4(I32 a) : x(a), y(a), z(a), w(a) {}
+ ivec4(I32 a) : x(a), y(a), z(a), w(a) {}
ivec4(I32 x, I32 y, I32 z, I32 w) : x(x), y(y), z(z), w(w) {}
ivec4(ivec2 a, I32 b, I32 c) : x(a.x), y(a.y), z(b), w(c) {}
- IMPLICIT constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
+ constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
constexpr ivec4(ivec4_scalar s0, ivec4_scalar s1, ivec4_scalar s2,
ivec4_scalar s3)
: x(I32{s0.x, s1.x, s2.x, s3.x}),
@@ -1190,21 +983,13 @@ struct bvec3_scalar {
bool z;
bvec3_scalar() : bvec3_scalar(false) {}
- IMPLICIT constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {}
+ constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {}
constexpr bvec3_scalar(bool x, bool y, bool z) : x(x), y(y), z(z) {}
};
-struct bvec3_scalar1 {
- bool x;
-
- IMPLICIT constexpr bvec3_scalar1(bool a) : x(a) {}
-
- operator bvec3_scalar() const { return bvec3_scalar(x); }
-};
-
struct bvec3 {
bvec3() : bvec3(0) {}
- IMPLICIT bvec3(Bool a) : x(a), y(a), z(a) {}
+ bvec3(Bool a) : x(a), y(a), z(a) {}
bvec3(Bool x, Bool y, Bool z) : x(x), y(y), z(z) {}
Bool& select(XYZW c) {
switch (c) {
@@ -1225,8 +1010,6 @@ struct bvec3 {
Bool z;
};
-bvec3_scalar1 make_bvec3(bool n) { return bvec3_scalar1(n); }
-
struct bvec4_scalar {
bool x;
bool y;
@@ -1234,45 +1017,14 @@ struct bvec4_scalar {
bool w;
bvec4_scalar() : bvec4_scalar(false) {}
- IMPLICIT constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {}
+ constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {}
constexpr bvec4_scalar(bool x, bool y, bool z, bool w)
: x(x), y(y), z(z), w(w) {}
-
- bool& select(XYZW c) {
- switch (c) {
- case X:
- return x;
- case Y:
- return y;
- case Z:
- return z;
- case W:
- return w;
- default:
- UNREACHABLE;
- }
- }
- bool sel(XYZW c1) { return select(c1); }
- bvec2_scalar sel(XYZW c1, XYZW c2) {
- return bvec2_scalar(select(c1), select(c2));
- }
-};
-
-bvec4_scalar bvec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
- return bvec4_scalar{select(c1), select(c2), select(c3), select(c4)};
-}
-
-struct bvec4_scalar1 {
- bool x;
-
- IMPLICIT constexpr bvec4_scalar1(bool a) : x(a) {}
-
- operator bvec4_scalar() const { return bvec4_scalar(x); }
};
struct bvec4 {
bvec4() : bvec4(0) {}
- IMPLICIT bvec4(Bool a) : x(a), y(a), z(a), w(a) {}
+ bvec4(Bool a) : x(a), y(a), z(a), w(a) {}
bvec4(Bool x, Bool y, Bool z, Bool w) : x(x), y(y), z(z), w(w) {}
bvec4(bvec2 x, bvec2 y) : x(x.x), y(x.y), z(y.x), w(y.y) {}
Bool& select(XYZW c) {
@@ -1285,8 +1037,6 @@ struct bvec4 {
return z;
case W:
return w;
- default:
- UNREACHABLE;
}
}
Bool sel(XYZW c1) { return select(c1); }
@@ -1297,16 +1047,12 @@ struct bvec4 {
Bool w;
};
-bvec4_scalar1 make_bvec4(bool n) { return bvec4_scalar1(n); }
+bvec4_scalar make_bvec4(bool n) { return bvec4_scalar{n, n, n, n}; }
bvec4_scalar make_bvec4(bool x, bool y, bool z, bool w) {
return bvec4_scalar{x, y, z, w};
}
-bvec4_scalar make_bvec4(bvec2_scalar a, bvec2_scalar b) {
- return bvec4_scalar{a.x, a.y, b.x, b.y};
-}
-
template <typename N>
bvec4 make_bvec4(const N& n) {
return bvec4(n);
@@ -1383,7 +1129,7 @@ struct vec3_scalar {
float z;
constexpr vec3_scalar() : vec3_scalar(0.0f) {}
- IMPLICIT constexpr vec3_scalar(float a) : x(a), y(a), z(a) {}
+ constexpr vec3_scalar(float a) : x(a), y(a), z(a) {}
constexpr vec3_scalar(float x, float y, float z) : x(x), y(y), z(z) {}
float& select(XYZW c) {
@@ -1474,11 +1220,10 @@ struct vec3 {
typedef float element_type;
constexpr vec3() : vec3(Float(0.0f)) {}
- IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {}
+ constexpr vec3(Float a) : x(a), y(a), z(a) {}
constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {}
vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {}
- explicit vec3(vec4);
- IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {}
+ constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {}
constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3)
: x(Float{s0.x, s1.x, s2.x, s3.x}),
y(Float{s0.y, s1.y, s2.y, s3.y}),
@@ -1507,8 +1252,6 @@ struct vec3 {
return vec3(select(c1), select(c2), select(c3));
}
- vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4);
-
vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); }
friend vec3 operator*(vec3 a, Float b) {
@@ -1605,26 +1348,13 @@ vec3 step(vec3 edge, vec3 x) {
return vec3(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z));
}
-vec3_scalar step(vec3_scalar edge, vec3_scalar x) {
- return vec3_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z));
-}
-
SI vec3 min(vec3 a, vec3 b) {
return vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
}
-SI vec3 min(vec3 a, Float b) {
- return vec3(min(a.x, b), min(a.y, b), min(a.z, b));
-}
-SI vec3_scalar min(vec3_scalar a, vec3_scalar b) {
- return vec3_scalar{min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)};
-}
-
SI vec3 max(vec3 a, vec3 b) {
return vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
}
-SI vec3 max(vec3 a, Float b) {
- return vec3(max(a.x, b), max(a.y, b), max(a.z, b));
-}
+
SI vec3_scalar max(vec3_scalar a, vec3_scalar b) {
return vec3_scalar{max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)};
}
@@ -1670,15 +1400,11 @@ struct vec4_scalar {
float w;
constexpr vec4_scalar() : vec4_scalar(0.0f) {}
- IMPLICIT constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {}
+ constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {}
constexpr vec4_scalar(float x, float y, float z, float w)
: x(x), y(y), z(z), w(w) {}
vec4_scalar(vec3_scalar xyz, float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
- static vec4_scalar load_from_ptr(const float* f) {
- return vec4_scalar(f[0], f[1], f[2], f[3]);
- }
-
ALWAYS_INLINE float& select(XYZW c) {
switch (c) {
case X:
@@ -1700,9 +1426,6 @@ struct vec4_scalar {
vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) {
return vec3_scalar{select(c1), select(c2), select(c3)};
}
- vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
- return vec4_scalar{select(c1), select(c2), select(c3), select(c4)};
- }
vec2_scalar_ref lsel(XYZW c1, XYZW c2) {
return vec2_scalar_ref(select(c1), select(c2));
}
@@ -1750,56 +1473,30 @@ struct vec4_scalar {
w /= a.w;
return *this;
}
-
- vec4_scalar& operator*=(vec4_scalar a) {
- x *= a.x;
- y *= a.y;
- z *= a.z;
- w *= a.w;
- return *this;
- }
-
- friend bool operator==(const vec4_scalar& l, const vec4_scalar& r) {
- return l.x == r.x && l.y == r.y && l.z == r.z && l.w == r.w;
- }
-
- friend bool operator!=(const vec4_scalar& l, const vec4_scalar& r) {
- return l.x != r.x || l.y != r.y || l.z != r.z || l.w != r.w;
- }
};
vec4_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
return vec4_scalar{select(c1), select(c2), select(c3), select(c4)};
}
-struct vec4_ref {
- vec4_ref(Float& x, Float& y, Float& z, Float& w) : x(x), y(y), z(z), w(w) {}
- Float& x;
- Float& y;
- Float& z;
- Float& w;
-
- vec4_ref& operator=(const vec4& a);
-};
-
struct vec4 {
typedef struct vec4 vector_type;
typedef float element_type;
constexpr vec4() : vec4(Float(0.0f)) {}
- IMPLICIT constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {}
+ constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {}
vec4(Float x, Float y, Float z, Float w) : x(x), y(y), z(z), w(w) {}
vec4(vec3 xyz, Float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {}
vec4(vec2 xy, vec2 zw) : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {}
vec4(vec2 xy, Float z, Float w) : x(xy.x), y(xy.y), z(z), w(w) {}
vec4(Float x, Float y, vec2 zw) : x(x), y(y), z(zw.x), w(zw.y) {}
- IMPLICIT constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
+ constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {}
constexpr vec4(vec4_scalar s0, vec4_scalar s1, vec4_scalar s2, vec4_scalar s3)
: x(Float{s0.x, s1.x, s2.x, s3.x}),
y(Float{s0.y, s1.y, s2.y, s3.y}),
z(Float{s0.z, s1.z, s2.z, s3.z}),
w(Float{s0.w, s1.w, s2.w, s3.w}) {}
- ALWAYS_INLINE Float& select(XYZW c) {
+ Float& select(XYZW c) {
switch (c) {
case X:
return x;
@@ -1813,29 +1510,18 @@ struct vec4 {
UNREACHABLE;
}
}
- ALWAYS_INLINE Float& sel(XYZW c1) { return select(c1); }
+ Float& sel(XYZW c1) { return select(c1); }
- ALWAYS_INLINE vec2 sel(XYZW c1, XYZW c2) {
- return vec2(select(c1), select(c2));
- }
+ vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); }
- ALWAYS_INLINE vec3 sel(XYZW c1, XYZW c2, XYZW c3) {
+ vec3 sel(XYZW c1, XYZW c2, XYZW c3) {
return vec3(select(c1), select(c2), select(c3));
}
- ALWAYS_INLINE vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) {
+ vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) {
return vec3_ref(select(c1), select(c2), select(c3));
}
- ALWAYS_INLINE vec2_ref lsel(XYZW c1, XYZW c2) {
- return vec2_ref(select(c1), select(c2));
- }
-
- ALWAYS_INLINE vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
- return vec4(select(c1), select(c2), select(c3), select(c4));
- }
- ALWAYS_INLINE vec4_ref lsel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
- return vec4_ref(select(c1), select(c2), select(c3), select(c4));
- }
+ vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); }
Float& operator[](int index) {
switch (index) {
@@ -1957,13 +1643,6 @@ struct vec4 {
w /= a.w;
return *this;
}
- vec4& operator*=(vec4 a) {
- x *= a.x;
- y *= a.y;
- z *= a.z;
- w *= a.w;
- return *this;
- }
vec4& operator*=(Float a) {
x *= a;
y *= a;
@@ -1978,18 +1657,6 @@ struct vec4 {
Float w;
};
-inline vec4_ref& vec4_ref::operator=(const vec4& a) {
- x = a.x;
- y = a.y;
- z = a.z;
- w = a.w;
- return *this;
-}
-
-inline vec4 vec3::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
- return vec4(select(c1), select(c2), select(c3), select(c4));
-}
-
vec4_scalar force_scalar(const vec4& v) {
return vec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z),
force_scalar(v.w)};
@@ -2017,10 +1684,6 @@ vec4_scalar make_vec4(float x, float y, const vec2_scalar& v) {
return vec4_scalar{x, y, v.x, v.y};
}
-ivec4_scalar make_ivec4(const vec4_scalar& v) {
- return ivec4_scalar{int32_t(v.x), int32_t(v.y), int32_t(v.z), int32_t(v.w)};
-}
-
template <typename N>
vec4 make_vec4(const N& n) {
return vec4(n);
@@ -2041,8 +1704,6 @@ vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) {
return vec4(x, y, z, w);
}
-ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {}
-
SI ivec4 roundfast(vec4 v, Float scale) {
return ivec4(roundfast(v.x, scale), roundfast(v.y, scale),
roundfast(v.z, scale), roundfast(v.w, scale));
@@ -2059,14 +1720,6 @@ SI vec4 if_then_else(I32 c, vec4 t, vec4 e) {
SI vec4 if_then_else(int32_t c, vec4 t, vec4 e) { return c ? t : e; }
-SI vec4_scalar if_then_else(int32_t c, vec4_scalar t, vec4_scalar e) {
- return c ? t : e;
-}
-
-SI vec2 clamp(vec2 a, Float minVal, Float maxVal) {
- return vec2(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal));
-}
-
SI vec2 clamp(vec2 a, vec2 minVal, vec2 maxVal) {
return vec2(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y));
}
@@ -2076,56 +1729,20 @@ SI vec2_scalar clamp(vec2_scalar a, vec2_scalar minVal, vec2_scalar maxVal) {
clamp(a.y, minVal.y, maxVal.y)};
}
-SI vec2_scalar clamp(vec2_scalar a, float minVal, float maxVal) {
- return vec2_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)};
-}
-
SI I32 clamp(I32 a, I32 minVal, I32 maxVal) {
a = if_then_else(a < minVal, minVal, a);
return if_then_else(a > maxVal, maxVal, a);
}
-SI vec3 clamp(vec3 a, Float minVal, Float maxVal) {
- return vec3(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal),
- clamp(a.z, minVal, maxVal));
-}
-
SI vec3 clamp(vec3 a, vec3 minVal, vec3 maxVal) {
return vec3(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y),
clamp(a.z, minVal.z, maxVal.z));
}
-SI vec4 clamp(vec4 a, Float minVal, Float maxVal) {
- return vec4(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal),
- clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal));
-}
-
SI vec4 clamp(vec4 a, vec4 minVal, vec4 maxVal) {
return vec4(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y),
clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w));
}
-
-SI vec4_scalar clamp(vec4_scalar a, vec4_scalar minVal, vec4_scalar maxVal) {
- return vec4_scalar{
- clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y),
- clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)};
-}
-
-SI vec4_scalar clamp(vec4_scalar a, float minVal, float maxVal) {
- return vec4_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal),
- clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)};
-}
-
-vec4 step(vec4 edge, vec4 x) {
- return vec4(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z),
- step(edge.w, x.w));
-}
-
-vec4_scalar step(vec4_scalar edge, vec4_scalar x) {
- return vec4_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z),
- step(edge.w, x.w));
-}
-
template <typename T>
auto lessThanEqual(T x, T y) -> decltype(x <= y) {
return x <= y;
@@ -2163,20 +1780,6 @@ SI bvec2 lessThan(vec2 x, vec2 y) {
return bvec2(lessThan(x.x, y.x), lessThan(x.y, y.y));
}
-SI bvec2_scalar lessThan(vec2_scalar x, vec2_scalar y) {
- return bvec2_scalar(lessThan(x.x, y.x), lessThan(x.y, y.y));
-}
-
-SI bvec4 lessThan(vec4 x, vec4 y) {
- return bvec4(lessThan(x.x, y.x), lessThan(x.y, y.y), lessThan(x.z, y.z),
- lessThan(x.w, y.w));
-}
-
-SI bvec4_scalar lessThan(vec4_scalar x, vec4_scalar y) {
- return bvec4_scalar{lessThan(x.x, y.x), lessThan(x.y, y.y),
- lessThan(x.z, y.z), lessThan(x.w, y.w)};
-}
-
template <typename T>
auto greaterThan(T x, T y) -> decltype(x > y) {
return x > y;
@@ -2186,20 +1789,6 @@ bvec2 greaterThan(vec2 x, vec2 y) {
return bvec2(greaterThan(x.x, y.x), greaterThan(x.y, y.y));
}
-bvec2_scalar greaterThan(vec2_scalar x, vec2_scalar y) {
- return bvec2_scalar(greaterThan(x.x, y.x), greaterThan(x.y, y.y));
-}
-
-SI bvec4 greaterThan(vec4 x, vec4 y) {
- return bvec4(greaterThan(x.x, y.x), greaterThan(x.y, y.y),
- greaterThan(x.z, y.z), greaterThan(x.w, y.w));
-}
-
-SI bvec4_scalar greaterThan(vec4_scalar x, vec4_scalar y) {
- return bvec4_scalar{greaterThan(x.x, y.x), greaterThan(x.y, y.y),
- greaterThan(x.z, y.z), greaterThan(x.w, y.w)};
-}
-
template <typename T>
auto greaterThanEqual(T x, T y) -> decltype(x >= y) {
return x >= y;
@@ -2210,29 +1799,51 @@ bvec4 greaterThanEqual(vec4 x, vec4 y) {
greaterThanEqual(x.z, y.z), greaterThanEqual(x.w, y.w));
}
-template <typename T>
-auto equal(T x, T y) -> decltype(x > y) {
- return x == y;
-}
+enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8 };
-bvec2 equal(vec2 x, vec2 y) { return bvec2(equal(x.x, y.x), equal(x.y, y.y)); }
+enum TextureFilter { NEAREST, LINEAR };
-bvec2_scalar equal(vec2_scalar x, vec2_scalar y) {
- return bvec2_scalar(equal(x.x, y.x), equal(x.y, y.y));
-}
+struct samplerCommon {
+ uint32_t* buf = nullptr;
+ uint32_t stride = 0; // in dwords
+ uint32_t height = 0;
+ uint32_t width = 0;
+ TextureFormat format = TextureFormat::RGBA8;
+};
-template <typename T>
-auto notEqual(T x, T y) -> decltype(x > y) {
- return x != y;
-}
+struct samplerDepth {
+ int depth = 0;
+ uint32_t height_stride = 0; // in dwords
+};
-bvec2 notEqual(vec2 x, vec2 y) {
- return bvec2(notEqual(x.x, y.x), notEqual(x.y, y.y));
-}
+struct samplerFilter {
+ TextureFilter filter = TextureFilter::NEAREST;
+};
-bvec2_scalar notEqual(vec2_scalar x, vec2_scalar y) {
- return bvec2_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y));
-}
+struct sampler2DArray_impl : samplerCommon, samplerDepth, samplerFilter {};
+typedef sampler2DArray_impl* sampler2DArray;
+
+typedef struct sampler2DArrayR8_impl : sampler2DArray_impl{} * sampler2DArrayR8;
+typedef struct sampler2DArrayRGBA8_impl : sampler2DArray_impl{} *
+ sampler2DArrayRGBA8;
+typedef struct sampler2DArrayRGBA32F_impl : sampler2DArray_impl{} *
+ sampler2DArrayRGBA32F;
+
+struct sampler2D_impl : samplerCommon, samplerFilter {};
+typedef sampler2D_impl* sampler2D;
+
+typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8;
+typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8;
+typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F;
+
+struct isampler2D_impl : samplerCommon {};
+typedef isampler2D_impl* isampler2D;
+
+struct isampler2DRGBA32I_impl : isampler2D_impl {};
+typedef isampler2DRGBA32I_impl* isampler2DRGBA32I;
+
+struct sampler2DRect_impl : samplerCommon, samplerFilter {};
+typedef sampler2DRect_impl* sampler2DRect;
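Editor's note: the sampler structs added above are just raw views of texture memory: buf points at the pixel data, stride is the row pitch in dwords, and format/filter select the fetch and filtering paths defined later in this patch. A minimal setup sketch (hypothetical helper, not part of the patch; assumes a tightly packed RGBA8 buffer):

static sampler2D_impl make_rgba8_sampler(uint32_t* pixels, uint32_t width,
                                         uint32_t height) {
  sampler2D_impl s;
  s.buf = pixels;
  s.width = width;
  s.height = height;
  // Row pitch is measured in dwords; a tightly packed RGBA8 row is `width`.
  s.stride = width;
  s.format = TextureFormat::RGBA8;
  s.filter = TextureFilter::LINEAR;  // or NEAREST
  return s;
}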
struct mat4_scalar;
@@ -2240,7 +1851,7 @@ struct mat2_scalar {
vec2_scalar data[2];
mat2_scalar() = default;
- IMPLICIT constexpr mat2_scalar(float a) {
+ constexpr mat2_scalar(float a) {
data[0] = vec2_scalar(a);
data[1] = vec2_scalar(a);
}
@@ -2248,7 +1859,7 @@ struct mat2_scalar {
data[0] = a;
data[1] = b;
}
- IMPLICIT mat2_scalar(const mat4_scalar& mat);
+ mat2_scalar(const mat4_scalar& mat);
vec2_scalar& operator[](int index) { return data[index]; }
const vec2_scalar& operator[](int index) const { return data[index]; }
@@ -2286,7 +1897,7 @@ struct mat2 {
const vec2& operator[](int index) const { return data[index]; }
mat2() = default;
- IMPLICIT mat2(Float a) {
+ mat2(Float a) {
data[0] = vec2(a);
data[1] = vec2(a);
}
@@ -2295,8 +1906,8 @@ struct mat2 {
data[0] = a;
data[1] = b;
}
- IMPLICIT mat2(const mat4& mat);
- IMPLICIT constexpr mat2(mat2_scalar s) {
+ mat2(const mat4& mat);
+ constexpr mat2(mat2_scalar s) {
data[0] = vec2(s.data[0]);
data[1] = vec2(s.data[1]);
}
@@ -2350,7 +1961,7 @@ struct mat3_scalar {
data[1] = b;
data[2] = c;
}
- IMPLICIT mat3_scalar(const mat4_scalar& mat);
+ mat3_scalar(const mat4_scalar& mat);
vec3_scalar& operator[](int index) { return data[index]; }
const vec3_scalar& operator[](int index) const { return data[index]; }
@@ -2384,7 +1995,7 @@ struct mat3 {
data[2] = c;
}
- IMPLICIT constexpr mat3(mat3_scalar s) {
+ constexpr mat3(mat3_scalar s) {
data[0] = vec3(s.data[0]);
data[1] = vec3(s.data[1]);
data[2] = vec3(s.data[2]);
@@ -2403,7 +2014,7 @@ struct mat3 {
data[2] = vec3(d7, d8, d9);
}
- IMPLICIT mat3(const mat4& mat);
+ mat3(const mat4& mat);
friend vec3 operator*(mat3 m, vec3 v) {
vec3 u;
@@ -2490,7 +2101,7 @@ struct mat4 {
vec4 data[4];
mat4() = default;
- IMPLICIT constexpr mat4(mat4_scalar s) {
+ constexpr mat4(mat4_scalar s) {
data[0] = vec4(s.data[0]);
data[1] = vec4(s.data[1]);
data[2] = vec4(s.data[2]);
@@ -2522,15 +2133,15 @@ mat3::mat3(const mat4& mat)
vec3(mat[1].x, mat[1].y, mat[1].z),
vec3(mat[2].x, mat[2].y, mat[2].z)) {}
-IMPLICIT mat3_scalar::mat3_scalar(const mat4_scalar& mat)
+mat3_scalar::mat3_scalar(const mat4_scalar& mat)
: mat3_scalar(vec3_scalar(mat[0].x, mat[0].y, mat[0].z),
vec3_scalar(mat[1].x, mat[1].y, mat[1].z),
vec3_scalar(mat[2].x, mat[2].y, mat[2].z)) {}
-IMPLICIT mat2::mat2(const mat4& mat)
+mat2::mat2(const mat4& mat)
: mat2(vec2(mat[0].x, mat[0].y), vec2(mat[1].x, mat[1].y)) {}
-IMPLICIT mat2_scalar::mat2_scalar(const mat4_scalar& mat)
+mat2_scalar::mat2_scalar(const mat4_scalar& mat)
: mat2_scalar(vec2_scalar(mat[0].x, mat[0].y),
vec2_scalar(mat[1].x, mat[1].y)) {}
@@ -2584,6 +2195,256 @@ SI mat4 if_then_else(I32 c, mat4 t, mat4 e) {
SI mat4 if_then_else(int32_t c, mat4 t, mat4 e) { return c ? t : e; }
+SI I32 clampCoord(I32 coord, int limit) {
+#if USE_SSE2
+ return _mm_min_epi16(_mm_max_epi16(coord, _mm_setzero_si128()),
+ _mm_set1_epi32(limit - 1));
+#else
+ return clamp(coord, 0, limit - 1);
+#endif
+}
+SI int clampCoord(int coord, int limit) {
+ return min(max(coord, 0), limit - 1);
+}
+template <typename T, typename S>
+SI T clamp2D(T P, S sampler) {
+ return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)};
+}
+template <typename T>
+SI T clamp2DArray(T P, sampler2DArray sampler) {
+ return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height),
+ clampCoord(P.z, sampler->depth)};
+}
+
+float to_float(uint32_t x) { return x * (1.f / 255.f); }
+
+vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ U32 pixels = {a, b, c, d};
+ return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF),
+ cast(pixels & 0xFF), cast(pixels >> 24)) *
+ (1.0f / 255.0f);
+}
+
+vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) {
+ return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y},
+ Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w});
+}
+
+ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) {
+ return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y},
+ I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w});
+}
+
+vec4_scalar pixel_to_vec4(uint32_t p) {
+ U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24};
+ Float f = cast(i) * (1.0f / 255.0f);
+ return vec4_scalar(f.x, f.y, f.z, f.w);
+}
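Editor's note: a quick sanity check of the packing convention used by pixel_to_vec4 above: bits 16-23 map to x, bits 8-15 to y, bits 0-7 to z, and bits 24-31 to w, each normalized by 1/255.

// Worked example (pure arithmetic, no assumptions beyond the code above):
vec4_scalar c = pixel_to_vec4(0x80FF4020u);
// c.x = 0xFF / 255 =  1.000
// c.y = 0x40 / 255 ~= 0.251
// c.z = 0x20 / 255 ~= 0.125
// c.w = 0x80 / 255 ~= 0.502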
+
+template <typename S>
+SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) {
+ return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y],
+ sampler->buf[offset.z], sampler->buf[offset.w]);
+}
+
+vec4 texelFetchRGBA8(sampler2D sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return fetchOffsetsRGBA8(sampler, offset);
+}
+
+vec4 texelFetchRGBA8(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return fetchOffsetsRGBA8(sampler, offset);
+}
+
+template <typename S>
+SI Float fetchOffsetsR8(S sampler, I32 offset) {
+ U32 i = {
+ ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y],
+ ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]};
+ return cast(i) * (1.0f / 255.0f);
+}
+
+vec4 texelFetchR8(sampler2D sampler, ivec2 P) {
+ I32 offset = P.x + P.y * sampler->stride;
+ return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
+}
+
+vec4 texelFetchR8(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
+}
+
+template <typename S>
+SI vec4 fetchOffsetsFloat(S sampler, I32 offset) {
+ return pixel_float_to_vec4(
+ *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y],
+ *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]);
+}
+
+vec4 texelFetchFloat(sampler2D sampler, ivec2 P) {
+ I32 offset = P.x * 4 + P.y * sampler->stride;
+ return fetchOffsetsFloat(sampler, offset);
+}
+
+SI vec4 texelFetchFloat(sampler2DArray sampler, ivec3 P) {
+ assert(test_all(P.z == P.z.x));
+ I32 offset = P.x * 4 + P.y * sampler->stride + P.z.x * sampler->height_stride;
+ return fetchOffsetsFloat(sampler, offset);
+}
+
+vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ if (sampler->format == TextureFormat::RGBA32F) {
+ return texelFetchFloat(sampler, P);
+ } else if (sampler->format == TextureFormat::RGBA8) {
+ return texelFetchRGBA8(sampler, P);
+ } else {
+ assert(sampler->format == TextureFormat::R8);
+ return texelFetchR8(sampler, P);
+ }
+}
+
+vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return texelFetchFloat(sampler, P);
+}
+
+vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ return texelFetchRGBA8(sampler, P);
+}
+
+vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::R8);
+ return texelFetchR8(sampler, P);
+}
+
+vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ if (sampler->format == TextureFormat::RGBA32F) {
+ return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+ } else {
+ assert(sampler->format == TextureFormat::RGBA8);
+ return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
+ }
+}
+
+vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
+}
+
+vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::R8);
+ return vec4_scalar{
+ to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f,
+ 0.0f, 0.0f};
+}
+
+vec4 texelFetch(sampler2DRect sampler, ivec2 P) {
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ I32 offset = P.x + P.y * sampler->stride;
+ return fetchOffsetsRGBA8(sampler, offset);
+}
+
+SI vec4 texelFetch(sampler2DArray sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ if (sampler->format == TextureFormat::RGBA32F) {
+ return texelFetchFloat(sampler, P);
+ } else if (sampler->format == TextureFormat::R8) {
+ return texelFetchR8(sampler, P);
+ } else {
+ assert(sampler->format == TextureFormat::RGBA8);
+ return texelFetchRGBA8(sampler, P);
+ }
+}
+
+vec4 texelFetch(sampler2DArrayRGBA32F sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return texelFetchFloat(sampler, P);
+}
+
+vec4 texelFetch(sampler2DArrayRGBA8 sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA8);
+ return texelFetchRGBA8(sampler, P);
+}
+
+vec4 texelFetch(sampler2DArrayR8 sampler, ivec3 P, int lod) {
+ assert(lod == 0);
+ P = clamp2DArray(P, sampler);
+ assert(sampler->format == TextureFormat::R8);
+ return texelFetchR8(sampler, P);
+}
+
+template <typename S>
+SI ivec4 fetchOffsetsInt(S sampler, I32 offset) {
+ return pixel_int_to_ivec4(
+ *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y],
+ *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]);
+}
+
+ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32I);
+ I32 offset = P.x * 4 + P.y * sampler->stride;
+ return fetchOffsetsInt(sampler, offset);
+}
+
+ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) {
+ assert(lod == 0);
+ P = clamp2D(P, sampler);
+ assert(sampler->format == TextureFormat::RGBA32I);
+ return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x,
+ int max_x, int min_y, int max_y) {
+ P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x);
+ P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y);
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
+
+SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x,
+ int max_x, int min_y, int max_y) {
+ P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x);
+ P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y);
+ assert(sampler->format == TextureFormat::RGBA32I);
+ return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
+}
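Editor's note: texelFetchPtr clamps P with enough slack that a small neighborhood around it stays in bounds, then returns a raw pointer so the caller can read that neighborhood without per-texel clamping. A usage sketch (here `sampler` and `P` are assumed to be a bound RGBA32F sampler2D and an integer coordinate; not part of the patch):

// Request one texel of slack in +x and +y so a 2x2 footprint is safe.
vec4_scalar* p = texelFetchPtr(sampler, P, 0, 1, 0, 1);
vec4_scalar t00 = p[0];                     // texel at P
vec4_scalar t10 = p[1];                     // one texel to the right
vec4_scalar t01 = p[sampler->stride / 4];   // one row down (stride is in dwords)
vec4_scalar t11 = p[sampler->stride / 4 + 1];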
+
+#define texelFetchOffset(sampler, P, lod, offset) \
+ texelFetch(sampler, (P) + (offset), lod)
+
template <typename T, typename U, typename A,
typename R = typename T::vector_type>
SI R mix(T x, U y, A a) {
@@ -2598,19 +2459,416 @@ SI T mix(T x, T y, float a) {
}
template <typename T>
-SI T mix(T x, T y, vec2_scalar a) {
- return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y)};
+SI T mix(T x, T y, vec4_scalar a) {
+ return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z),
+ mix(x.w, y.w, a.w)};
}
+// Scale texture coords for quantization, subtract offset for filtering
+// (assuming coords already offset to texel centers), and round to nearest
+// 1/scale increment
template <typename T>
-SI T mix(T x, T y, vec3_scalar a) {
- return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z)};
+SI T linearQuantize(T P, float scale) {
+ return P * scale + (0.5f - 0.5f * scale);
}
-template <typename T>
-SI T mix(T x, T y, vec4_scalar a) {
- return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z),
- mix(x.w, y.w, a.w)};
+// Helper version that also scales normalized texture coords for sampler
+template <typename T, typename S>
+SI T linearQuantize(T P, float scale, S sampler) {
+ P.x *= sampler->width;
+ P.y *= sampler->height;
+ return linearQuantize(P, scale);
+}
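Editor's note: a worked example of the quantization above, using the scale of 128 (7 fractional bits) that the non-SSE2 filtering paths below rely on:

// linearQuantize(P, 128) = P * 128 + (0.5 - 64), so truncation recovers the
// top-left texel index in the high bits and the bilinear fraction in the low
// 7 bits. For P = 3.5 (the center of texel 3, in texel units):
float q = linearQuantize(3.5f, 128.0f);  // 3.5 * 128 + (0.5 - 64) = 384.5
int i = int(q);                          // 384
int texel = i >> 7;                      // 3  -> sample exactly at texel 3
int frac = i & 0x7F;                     // 0  -> no blend toward texel 4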
+
+template <typename S>
+vec4 textureLinearRGBA8(S sampler, vec2 P, int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::RGBA8);
+
+#if USE_SSE2
+ ivec2 i(linearQuantize(P, 256, sampler));
+ ivec2 frac = i & (I32)0xFF;
+ i >>= 8;
+
+ // Pack coords so they get clamped into range, and also for later bounding
+ // of fractional coords. Store Y as low-bits for easier access, X as high.
+ __m128i yx = _mm_packs_epi32(i.y, i.x);
+ __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1),
+ _mm_set1_epi32(sampler->width - 1));
+ // Clamp coords to valid range to prevent sampling outside texture.
+ __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw);
+ // Multiply clamped Y by stride and add X offset.
+ __m128i row0 = _mm_madd_epi16(
+ _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()),
+ _mm_set1_epi16(sampler->stride));
+ row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128()));
+ // Add in layer offset if available
+ row0 = _mm_add_epi32(row0, _mm_set1_epi32(zoffset));
+
+ // Check if fractional coords are all zero, in which case skip filtering.
+ __m128i fracyx = _mm_packs_epi32(frac.y, frac.x);
+ if (!_mm_movemask_epi8(_mm_cmpgt_epi16(fracyx, _mm_setzero_si128()))) {
+ return fetchOffsetsRGBA8(sampler, row0);
+ }
+
+ // Check if coords were clamped at all above. If so, need to adjust fractions
+ // to avoid sampling outside the texture on the edges.
+ __m128i yxinside = _mm_andnot_si128(
+ _mm_cmplt_epi16(yx, _mm_setzero_si128()),
+ _mm_cmplt_epi16(yx, hw));
+ // Set fraction to zero when outside.
+ fracyx = _mm_and_si128(fracyx, yxinside);
+ // Store two side-by-side copies of X fraction, as below each pixel value
+ // will be interleaved to be next to the pixel value for the next row.
+ __m128i fracx = _mm_unpackhi_epi16(fracyx, fracyx);
+ // For Y fraction, we need to store 1-fraction before each fraction, as a
+ // madd will be used to weight and collapse all results as last step.
+ __m128i fracy = _mm_unpacklo_epi16(
+ _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx);
+
+ // Ensure we don't sample row off end of texture from added stride.
+ __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride));
+
+ // Load two adjacent pixels on each row and interleave them.
+ // r0,g0,b0,a0,r1,g1,b1,a1 \/ R0,G0,B0,A0,R1,G1,B1,A1
+ // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1
+# define LOAD_LANE(out, idx) \
+ { \
+ uint32_t* buf = &sampler->buf[_mm_cvtsi128_si32( \
+ _mm_shuffle_epi32(row0, _MM_SHUFFLE(idx, idx, idx, idx)))]; \
+ out = _mm_unpacklo_epi8( \
+ _mm_loadl_epi64((__m128i*)buf), \
+ _mm_loadl_epi64((__m128i*)(buf + _mm_extract_epi16(row1, idx)))); \
+ }
+ __m128i x, y, z, w;
+ LOAD_LANE(x, 0)
+ LOAD_LANE(y, 1)
+ LOAD_LANE(z, 2)
+ LOAD_LANE(w, 3)
+# undef LOAD_LANE
+
+ // Need to transpose the data from AoS to SoA format. Best to do this here
+ // while the data is still packed into 8-bit components, requiring fewer
+ // insns.
+ // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1 \/
+ // r2,R2,g2,G2,b2,B2,a2,A2,r3,R3,g3,G3,b3,B3,a3,A3
+ // ... r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2
+ // ... r1,R1,r3,R3,g1,G1,g3,G3,b1,B1,b3,B3,a1,A1,a3,A3
+ __m128i xy0 = _mm_unpacklo_epi16(x, y);
+ __m128i xy1 = _mm_unpackhi_epi16(x, y);
+ __m128i zw0 = _mm_unpacklo_epi16(z, w);
+ __m128i zw1 = _mm_unpackhi_epi16(z, w);
+ // r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2 \/
+ // r4,R4,r6,R6,g4,G4,g6,G6,b4,B4,b6,B6,a4,A4,a6,A6
+ // ... r0,R0,r2,R2,r4,R4,r6,R6,g0,G0,g2,G2,g4,G4,g6,G6
+ // ... b0,B0,b2,B2,b4,B4,b6,B6,a0,A0,a2,A2,a4,A4,a6,A6
+ __m128i rg0 = _mm_unpacklo_epi32(xy0, zw0);
+ __m128i ba0 = _mm_unpackhi_epi32(xy0, zw0);
+ __m128i rg1 = _mm_unpacklo_epi32(xy1, zw1);
+ __m128i ba1 = _mm_unpackhi_epi32(xy1, zw1);
+
+ // Expand packed SoA pixels for each column. Multiply then add columns with
+ // 8-bit precision so we don't carry to high byte of word accidentally. Use
+ // final madd insn to blend interleaved rows and expand result to 32 bits.
+# define FILTER_COMPONENT(out, unpack, src0, src1) \
+ { \
+ __m128i cc0 = unpack(src0, _mm_setzero_si128()); \
+ __m128i cc1 = unpack(src1, _mm_setzero_si128()); \
+ cc0 = _mm_add_epi8( \
+ cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracx), \
+ 8)); \
+ out = _mm_cvtepi32_ps(_mm_madd_epi16(cc0, fracy)); \
+ }
+ __m128 fr, fg, fb, fa;
+ FILTER_COMPONENT(fr, _mm_unpacklo_epi8, rg0, rg1);
+ FILTER_COMPONENT(fg, _mm_unpackhi_epi8, rg0, rg1);
+ FILTER_COMPONENT(fb, _mm_unpacklo_epi8, ba0, ba1);
+ FILTER_COMPONENT(fa, _mm_unpackhi_epi8, ba0, ba1);
+# undef FILTER_COMPONENT
+
+ return vec4(fb, fg, fr, fa) * (1.0f / 0xFF00);
+#else
+ ivec2 i(linearQuantize(P, 128, sampler));
+ ivec2 frac = i & (I32)0x7F;
+ i >>= 7;
+
+ I32 row0 = clampCoord(i.x, sampler->width) +
+ clampCoord(i.y, sampler->height) * sampler->stride + zoffset;
+ I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
+ I32(sampler->stride));
+ I16 fracx =
+ CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16);
+ I16 fracy = CONVERT(frac.y, I16);
+
+ auto a0 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.x]), V8<int16_t>);
+ auto a1 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.x]), V8<int16_t>);
+ a0 += ((a1 - a0) * fracy.x) >> 7;
+
+ auto b0 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.y]), V8<int16_t>);
+ auto b1 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.y]), V8<int16_t>);
+ b0 += ((b1 - b0) * fracy.y) >> 7;
+
+ auto abl = zipLow(a0, b0);
+ auto abh = zipHigh(a0, b0);
+ abl += ((abh - abl) * fracx.xyxyxyxy) >> 7;
+
+ auto c0 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.z]), V8<int16_t>);
+ auto c1 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.z]), V8<int16_t>);
+ c0 += ((c1 - c0) * fracy.z) >> 7;
+
+ auto d0 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.w]), V8<int16_t>);
+ auto d1 =
+ CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.w]), V8<int16_t>);
+ d0 += ((d1 - d0) * fracy.w) >> 7;
+
+ auto cdl = zipLow(c0, d0);
+ auto cdh = zipHigh(c0, d0);
+ cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7;
+
+ auto rg = CONVERT(V8<uint16_t>(zip2Low(abl, cdl)), V8<float>);
+ auto ba = CONVERT(V8<uint16_t>(zip2High(abl, cdl)), V8<float>);
+
+ auto r = lowHalf(rg);
+ auto g = highHalf(rg);
+ auto b = lowHalf(ba);
+ auto a = highHalf(ba);
+ return vec4(b, g, r, a) * (1.0f / 255.0f);
+#endif
+}
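Editor's note: the SIMD code above is easier to follow against a scalar reference of the same bilinear filter. The following is a comparison sketch only (not part of the patch); `ch` selects a byte within the packed dword rather than naming a specific color channel, and edge handling clamps each fetch instead of zeroing fractions the way the vector path does.

static float lerp1(float a, float b, float t) { return a + (b - a) * t; }

static float bilinearChannel(const sampler2D_impl* s, float px, float py,
                             int ch) {
  // px/py are in texel units with 0.5 at texel centers, matching the
  // convention assumed by linearQuantize above.
  float fx = px - 0.5f, fy = py - 0.5f;
  int x0 = int(floorf(fx)), y0 = int(floorf(fy));
  float tx = fx - x0, ty = fy - y0;
  auto texel = [&](int x, int y) -> float {
    x = clampCoord(x, s->width);
    y = clampCoord(y, s->height);
    return float((s->buf[x + y * s->stride] >> (ch * 8)) & 0xFF);
  };
  float top = lerp1(texel(x0, y0), texel(x0 + 1, y0), tx);
  float bot = lerp1(texel(x0, y0 + 1), texel(x0 + 1, y0 + 1), tx);
  return lerp1(top, bot, ty) * (1.0f / 255.0f);
}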
+
+template <typename S>
+static U16 textureLinearPackedR8(S sampler, ivec2 i, int32_t zoffset) {
+ assert(sampler->format == TextureFormat::R8);
+ ivec2 frac = i & (I32)0x7F;
+ i >>= 7;
+
+ I32 row0 = clampCoord(i.x, sampler->width) +
+ clampCoord(i.y, sampler->height) * sampler->stride + zoffset;
+ I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
+ I32(sampler->stride));
+ I16 fracx =
+ CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16);
+ I16 fracy = CONVERT(frac.y, I16);
+
+ uint8_t* buf = (uint8_t*)sampler->buf;
+ auto a0 = unaligned_load<V2<uint8_t> >(&buf[row0.x]);
+ auto b0 = unaligned_load<V2<uint8_t> >(&buf[row0.y]);
+ auto c0 = unaligned_load<V2<uint8_t> >(&buf[row0.z]);
+ auto d0 = unaligned_load<V2<uint8_t> >(&buf[row0.w]);
+ auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>);
+
+ auto a1 = unaligned_load<V2<uint8_t> >(&buf[row1.x]);
+ auto b1 = unaligned_load<V2<uint8_t> >(&buf[row1.y]);
+ auto c1 = unaligned_load<V2<uint8_t> >(&buf[row1.z]);
+ auto d1 = unaligned_load<V2<uint8_t> >(&buf[row1.w]);
+ auto abcd1 = CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>);
+
+ abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;
+
+ abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
+ auto abcdl = lowHalf(abcd0);
+ auto abcdh = highHalf(abcd0);
+ abcdl += ((abcdh - abcdl) * fracx) >> 7;
+
+ return U16(abcdl);
+}
+
+template <typename S>
+vec4 textureLinearR8(S sampler, vec2 P, int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::R8);
+
+#if USE_SSE2
+ ivec2 i(linearQuantize(P, 256, sampler));
+ ivec2 frac = i & (I32)0xFF;
+ i >>= 8;
+
+ // Pack coords so they get clamped into range, and also for later bounding
+ // of fractional coords. Store Y as low-bits for easier access, X as high.
+ __m128i yx = _mm_packs_epi32(i.y, i.x);
+ __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1),
+ _mm_set1_epi32(sampler->width - 1));
+ // Clamp coords to valid range to prevent sampling outside texture.
+ __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw);
+ // Multiply clamped Y by stride and add X offset.
+ __m128i row0 = _mm_madd_epi16(
+ _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()),
+ _mm_set1_epi16(sampler->stride));
+ row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128()));
+ // Add in layer offset if available
+ row0 = _mm_add_epi32(row0, _mm_set1_epi32(zoffset));
+
+ __m128i fracyx = _mm_packs_epi32(frac.y, frac.x);
+
+ // Check if coords were clamped at all above. If so, need to adjust fractions
+ // to avoid sampling outside the texture on the edges.
+ __m128i yxinside = _mm_andnot_si128(
+ _mm_cmplt_epi16(yx, _mm_setzero_si128()),
+ _mm_cmplt_epi16(yx, hw));
+ // Set fraction to zero when outside.
+ fracyx = _mm_and_si128(fracyx, yxinside);
+ // For X fraction, we need to store 1-fraction before each fraction, as a
+ // madd will be used to weight and collapse all results as last step.
+ __m128i fracx = _mm_unpackhi_epi16(
+ _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx);
+ // Store two side-by-side copies of Y fraction, as below each pixel value
+ // will be interleaved to be next to the pixel value for the next column.
+ __m128i fracy = _mm_unpacklo_epi16(fracyx, fracyx);
+
+ // Ensure we don't sample row off end of texture from added stride.
+ __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride));
+
+ // Calculate pointers for first row in each lane
+ uint8_t* buf = (uint8_t*)sampler->buf;
+ uint8_t* buf0 =
+ buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(0, 0, 0, 0)));
+ uint8_t* buf1 =
+ buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(1, 1, 1, 1)));
+ uint8_t* buf2 =
+ buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(2, 2, 2, 2)));
+ uint8_t* buf3 =
+ buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 3, 3, 3)));
+ // Load adjacent columns from first row, pack into register, then expand.
+ __m128i cc0 = _mm_unpacklo_epi8(
+ _mm_setr_epi16(*(uint16_t*)buf0, *(uint16_t*)buf1, *(uint16_t*)buf2,
+ *(uint16_t*)buf3, 0, 0, 0, 0),
+ _mm_setzero_si128());
+ // Load adjacent columns from next row, pack into register, then expand.
+ __m128i cc1 = _mm_unpacklo_epi8(
+ _mm_setr_epi16(*(uint16_t*)(buf0 + _mm_extract_epi16(row1, 0)),
+ *(uint16_t*)(buf1 + _mm_extract_epi16(row1, 1)),
+ *(uint16_t*)(buf2 + _mm_extract_epi16(row1, 2)),
+ *(uint16_t*)(buf3 + _mm_extract_epi16(row1, 3)),
+ 0, 0, 0, 0),
+ _mm_setzero_si128());
+ // Multiply then add rows with 8-bit precision so we don't carry to high byte
+ // of word accidentally. Use final madd insn to blend interleaved columns and
+ // expand result to 32 bits.
+ __m128i cc = _mm_add_epi8(
+ cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracy), 8));
+ __m128 r = _mm_cvtepi32_ps(_mm_madd_epi16(cc, fracx));
+ return vec4((Float)r * (1.0f / 0xFF00), 0.0f, 0.0f, 1.0f);
+#else
+ ivec2 i(linearQuantize(P, 128, sampler));
+ Float r = CONVERT(textureLinearPackedR8(sampler, i, zoffset), Float);
+ return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);
+#endif
+}
+
+template <typename S>
+vec4 textureLinearRGBA32F(S sampler, vec2 P, int32_t zoffset = 0) {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ P.x *= sampler->width;
+ P.y *= sampler->height;
+ P -= 0.5f;
+ vec2 f = floor(P);
+ vec2 r = P - f;
+ ivec2 i(f);
+ ivec2 c = clamp2D(i, sampler);
+ r.x = if_then_else(i.x >= 0 && i.x < sampler->width - 1, r.x, 0.0f);
+ I32 offset0 = c.x * 4 + c.y * sampler->stride + zoffset;
+ I32 offset1 = offset0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
+ I32(sampler->stride));
+
+ Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x],
+ *(Float*)&sampler->buf[offset0.x + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.x],
+ *(Float*)&sampler->buf[offset1.x + 4], r.x),
+ r.y);
+ Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y],
+ *(Float*)&sampler->buf[offset0.y + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.y],
+ *(Float*)&sampler->buf[offset1.y + 4], r.x),
+ r.y);
+ Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z],
+ *(Float*)&sampler->buf[offset0.z + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.z],
+ *(Float*)&sampler->buf[offset1.z + 4], r.x),
+ r.y);
+ Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w],
+ *(Float*)&sampler->buf[offset0.w + 4], r.x),
+ mix(*(Float*)&sampler->buf[offset1.w],
+ *(Float*)&sampler->buf[offset1.w + 4], r.x),
+ r.y);
+ return pixel_float_to_vec4(c0, c1, c2, c3);
+}
+
+SI vec4 texture(sampler2D sampler, vec2 P) {
+ if (sampler->filter == TextureFilter::LINEAR) {
+ if (sampler->format == TextureFormat::RGBA8) {
+ return textureLinearRGBA8(sampler, P);
+ } else if (sampler->format == TextureFormat::R8) {
+ return textureLinearR8(sampler, P);
+ } else {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return textureLinearRGBA32F(sampler, P);
+ }
+ } else {
+ ivec2 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height));
+ return texelFetch(sampler, coord, 0);
+ }
+}
+
+vec4 texture(sampler2DRect sampler, vec2 P) {
+ assert(sampler->format == TextureFormat::RGBA8);
+ if (sampler->filter == TextureFilter::LINEAR) {
+ return textureLinearRGBA8(sampler,
+ P * vec2_scalar{1.0f / sampler->width, 1.0f / sampler->height});
+ } else {
+ ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f));
+ return texelFetch(sampler, coord);
+ }
+}
+
+SI vec4 texture(sampler2DArray sampler, vec3 P) {
+ if (sampler->filter == TextureFilter::LINEAR) {
+ // SSE2 can generate slow code for 32-bit multiply, and we never actually sample
+ // from different layers in one chunk, so do cheaper scalar multiplication instead.
+ assert(test_all(P.z == P.z.x));
+ int32_t zoffset =
+ clampCoord(roundeven(P.z.x, 1.0f), sampler->depth) * sampler->height_stride;
+ if (sampler->format == TextureFormat::RGBA8) {
+ return textureLinearRGBA8(sampler, vec2(P.x, P.y), zoffset);
+ } else if (sampler->format == TextureFormat::R8) {
+ return textureLinearR8(sampler, vec2(P.x, P.y), zoffset);
+ } else {
+ assert(sampler->format == TextureFormat::RGBA32F);
+ return textureLinearRGBA32F(sampler, vec2(P.x, P.y), zoffset);
+ }
+ } else {
+ // just do nearest for now
+ ivec3 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height),
+ roundeven(P.z, 1.0f));
+ return texelFetch(sampler, coord, 0);
+ }
+}
+
+vec4 texture(sampler2DArray sampler, vec3 P, float bias) {
+ assert(bias == 0.0f);
+ return texture(sampler, P);
+}
+
+vec4 textureLod(sampler2DArray sampler, vec3 P, float lod) {
+ assert(lod == 0.0f);
+ return texture(sampler, P);
+}
+
+ivec3_scalar textureSize(sampler2DArray sampler, int) {
+ return ivec3_scalar{int32_t(sampler->width), int32_t(sampler->height),
+ int32_t(sampler->depth)};
+}
+
+ivec2_scalar textureSize(sampler2D sampler, int) {
+ return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
+}
+
+ivec2_scalar textureSize(sampler2DRect sampler) {
+ return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
}
ivec4 ivec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) {
@@ -2675,30 +2933,15 @@ SI T mix(T x, T y, bvec4_scalar a) {
}
template <typename T>
-SI T mix(T x, T y, bvec4_scalar1 a) {
- return a.x ? y : x;
-}
-
-template <typename T>
SI T mix(T x, T y, bvec3_scalar a) {
return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z};
}
template <typename T>
-SI T mix(T x, T y, bvec3_scalar1 a) {
- return a.x ? y : x;
-}
-
-template <typename T>
SI T mix(T x, T y, bvec2_scalar a) {
return T{a.x ? y.x : x.x, a.y ? y.y : x.y};
}
-template <typename T>
-SI T mix(T x, T y, bvec2_scalar1 a) {
- return a.x ? y : x;
-}
-
float dot(vec3_scalar a, vec3_scalar b) {
return a.x * b.x + a.y * b.y + a.z * b.z;
}
@@ -2736,28 +2979,7 @@ Float atan(Float v) { return {atanf(v.x), atanf(v.y), atanf(v.z), atanf(v.w)}; }
float atan(float a, float b) { return atan2f(a, b); }
Float atan(Float a, Float b) {
- return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z),
- atan2f(a.w, b.w)};
-}
-
-bvec4 equal(vec4 x, vec4 y) {
- return bvec4(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z),
- equal(x.w, y.w));
-}
-
-bvec4_scalar equal(vec4_scalar x, vec4_scalar y) {
- return bvec4_scalar(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z),
- equal(x.w, y.w));
-}
-
-bvec4 notEqual(vec4 x, vec4 y) {
- return bvec4(notEqual(x.x, y.x), notEqual(x.y, y.y), notEqual(x.z, y.z),
- notEqual(x.w, y.w));
-}
-
-bvec4_scalar notEqual(vec4_scalar x, vec4_scalar y) {
- return bvec4_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y),
- notEqual(x.z, y.z), notEqual(x.w, y.w));
+ return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z), atan2f(a.w, b.w)};
}
bvec4 notEqual(ivec4 a, ivec4 b) {
@@ -2783,18 +3005,12 @@ vec2 abs(vec2 v) { return vec2(abs(v.x), abs(v.y)); }
vec2_scalar abs(vec2_scalar v) { return vec2_scalar{fabsf(v.x), fabsf(v.y)}; }
-vec2 sign(vec2 v) { return vec2(sign(v.x), sign(v.y)); }
-
-vec2_scalar sign(vec2_scalar v) { return vec2_scalar{sign(v.x), sign(v.y)}; }
-
Float mod(Float a, Float b) { return a - b * floor(a / b); }
vec2 mod(vec2 a, vec2 b) { return vec2(mod(a.x, b.x), mod(a.y, b.y)); }
vec3 abs(vec3 v) { return vec3(abs(v.x), abs(v.y), abs(v.z)); }
-vec3 sign(vec3 v) { return vec3(sign(v.x), sign(v.y), sign(v.z)); }
-
mat2 inverse(mat2 v) {
Float det = v[0].x * v[1].y - v[0].y * v[1].x;
return mat2(vec2(v[1].y, -v[0].y), vec2(-v[1].x, v[0].x)) * (1. / det);
diff --git a/third_party/webrender/swgl/src/lib.rs b/third_party/webrender/swgl/src/lib.rs
index e8fc030e0c9..e19e85fd512 100644
--- a/third_party/webrender/swgl/src/lib.rs
+++ b/third_party/webrender/swgl/src/lib.rs
@@ -5,7 +5,7 @@
#![crate_name = "swgl"]
#![crate_type = "lib"]
-extern crate gleam;
+use gleam;
mod swgl_fns;
diff --git a/third_party/webrender/swgl/src/program.h b/third_party/webrender/swgl/src/program.h
index 9ea7c6dd6eb..80e5a5b68f7 100644
--- a/third_party/webrender/swgl/src/program.h
+++ b/third_party/webrender/swgl/src/program.h
@@ -12,12 +12,6 @@ namespace glsl {
// to operate in Float-sized chunks.
typedef vec3 Interpolants;
-// Clip distances, if enabled, are always stored in the first SIMD chunk of the
-// interpolants.
-static ALWAYS_INLINE Float get_clip_distances(const Interpolants& interp) {
- return interp.x;
-}
-
struct VertexShaderImpl;
struct FragmentShaderImpl;
@@ -29,14 +23,10 @@ struct ProgramImpl {
virtual size_t interpolants_size() const = 0;
virtual VertexShaderImpl* get_vertex_shader() = 0;
virtual FragmentShaderImpl* get_fragment_shader() = 0;
- virtual const char* get_name() const = 0;
};
typedef ProgramImpl* (*ProgramLoader)();
-// The maximum size of the gl_ClipDistance array.
-constexpr int32_t gl_MaxClipDistances = 4;
-
struct VertexShaderImpl {
typedef void (*SetUniform1iFunc)(VertexShaderImpl*, int index, int value);
typedef void (*SetUniform4fvFunc)(VertexShaderImpl*, int index,
@@ -56,17 +46,7 @@ struct VertexShaderImpl {
LoadAttribsFunc load_attribs_func = nullptr;
RunPrimitiveFunc run_primitive_func = nullptr;
- enum FLAGS {
- CLIP_DISTANCE = 1 << 0,
- };
- int flags = 0;
- void enable_clip_distance() { flags |= CLIP_DISTANCE; }
- ALWAYS_INLINE bool use_clip_distance() const {
- return (flags & CLIP_DISTANCE) != 0;
- }
-
vec4 gl_Position;
- Float gl_ClipDistance[gl_MaxClipDistances];
void set_uniform_1i(int index, int value) {
(*set_uniform_1i_func)(this, index, value);
@@ -92,20 +72,18 @@ struct VertexShaderImpl {
}
};
-// The number of pixels in a step.
-constexpr int32_t swgl_StepSize = 4;
-
struct FragmentShaderImpl {
typedef void (*InitSpanFunc)(FragmentShaderImpl*, const void* interps,
- const void* step);
+ const void* step, float step_width);
typedef void (*RunFunc)(FragmentShaderImpl*);
- typedef void (*SkipFunc)(FragmentShaderImpl*, int steps);
+ typedef void (*SkipFunc)(FragmentShaderImpl*, int chunks);
typedef void (*InitSpanWFunc)(FragmentShaderImpl*, const void* interps,
- const void* step);
+ const void* step, float step_width);
typedef void (*RunWFunc)(FragmentShaderImpl*);
- typedef void (*SkipWFunc)(FragmentShaderImpl*, int steps);
- typedef int (*DrawSpanRGBA8Func)(FragmentShaderImpl*);
- typedef int (*DrawSpanR8Func)(FragmentShaderImpl*);
+ typedef void (*SkipWFunc)(FragmentShaderImpl*, int chunks);
+ typedef void (*DrawSpanRGBA8Func)(FragmentShaderImpl*, uint32_t* buf,
+ int len);
+ typedef void (*DrawSpanR8Func)(FragmentShaderImpl*, uint8_t* buf, int len);
InitSpanFunc init_span_func = nullptr;
RunFunc run_func = nullptr;
@@ -129,27 +107,31 @@ struct FragmentShaderImpl {
}
vec4 gl_FragCoord;
+ vec2_scalar stepZW;
+ Bool isPixelDiscarded = false;
vec4 gl_FragColor;
vec4 gl_SecondaryFragColor;
- vec2_scalar swgl_StepZW;
- Bool swgl_IsPixelDiscarded = false;
- // The current buffer position for committing span output.
- uint32_t* swgl_OutRGBA8 = nullptr;
- uint8_t* swgl_OutR8 = nullptr;
- // The remaining number of pixels in the span.
- int32_t swgl_SpanLength = 0;
+ ALWAYS_INLINE void step_fragcoord() { gl_FragCoord.x += 4; }
- ALWAYS_INLINE void step_fragcoord(int steps = 4) { gl_FragCoord.x += steps; }
+ ALWAYS_INLINE void step_fragcoord(int chunks) {
+ gl_FragCoord.x += 4 * chunks;
+ }
+
+ ALWAYS_INLINE void step_perspective() {
+ gl_FragCoord.z += stepZW.x;
+ gl_FragCoord.w += stepZW.y;
+ }
- ALWAYS_INLINE void step_perspective(int steps = 4) {
- gl_FragCoord.z += swgl_StepZW.x * steps;
- gl_FragCoord.w += swgl_StepZW.y * steps;
+ ALWAYS_INLINE void step_perspective(int chunks) {
+ gl_FragCoord.z += stepZW.x * chunks;
+ gl_FragCoord.w += stepZW.y * chunks;
}
template <bool W = false>
- ALWAYS_INLINE void init_span(const void* interps, const void* step) {
- (*(W ? init_span_w_func : init_span_func))(this, interps, step);
+ ALWAYS_INLINE void init_span(const void* interps, const void* step,
+ float step_width) {
+ (*(W ? init_span_w_func : init_span_func))(this, interps, step, step_width);
}
template <bool W = false>
@@ -158,24 +140,20 @@ struct FragmentShaderImpl {
}
template <bool W = false>
- ALWAYS_INLINE void skip(int steps = 4) {
- (*(W ? skip_w_func : skip_func))(this, steps);
+ ALWAYS_INLINE void skip(int chunks = 1) {
+ (*(W ? skip_w_func : skip_func))(this, chunks);
}
- ALWAYS_INLINE int draw_span(uint32_t* buf, int len) {
- swgl_OutRGBA8 = buf;
- swgl_SpanLength = len;
- return (*draw_span_RGBA8_func)(this);
+ ALWAYS_INLINE void draw_span(uint32_t* buf, int len) {
+ (*draw_span_RGBA8_func)(this, buf, len);
}
ALWAYS_INLINE bool has_draw_span(uint32_t*) {
return draw_span_RGBA8_func != nullptr;
}
- ALWAYS_INLINE int draw_span(uint8_t* buf, int len) {
- swgl_OutR8 = buf;
- swgl_SpanLength = len;
- return (*draw_span_R8_func)(this);
+ ALWAYS_INLINE void draw_span(uint8_t* buf, int len) {
+ (*draw_span_R8_func)(this, buf, len);
}
ALWAYS_INLINE bool has_draw_span(uint8_t*) {
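Editor's note: the draw_span change above replaces the implicit swgl_OutRGBA8/swgl_SpanLength state with an explicit buffer pointer and length. A sketch of how a rasterizer loop would drive the new entry points (hypothetical caller, assuming init_span has already been called for this span):

static void output_span(FragmentShaderImpl* shader, uint32_t* row, int x0,
                        int x1) {
  uint32_t* buf = row + x0;
  int len = x1 - x0;
  if (shader->has_draw_span(buf)) {
    // Fast path: the shader writes all `len` pixels directly into the row.
    shader->draw_span(buf, len);
  } else {
    // Otherwise the caller runs the shader one 4-pixel chunk at a time and
    // packs gl_FragColor itself.
  }
}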
diff --git a/third_party/webrender/swgl/src/rasterize.h b/third_party/webrender/swgl/src/rasterize.h
deleted file mode 100644
index 48f3b9e5898..00000000000
--- a/third_party/webrender/swgl/src/rasterize.h
+++ /dev/null
@@ -1,1670 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-// The SWGL depth buffer is roughly organized as a span buffer where each row
-// of the depth buffer is a list of spans, and each span has a constant depth
-// and a run length (represented by DepthRun). The span from start..start+count
-// is placed directly at that start index in the row's array of runs, so that
-// there is no need to explicitly record the start index at all. This also
-// avoids the need to move items around in the run array to manage insertions
-// since space is implicitly always available for a run between any two
-// pre-existing runs. Linkage from one run to the next is implicitly defined by
-// the count, so if a run exists from start..start+count, the next run will
-// implicitly pick up right at index start+count where that preceding run left
-// off. All of the DepthRun items that are after the head of the run can remain
-// uninitialized until the run needs to be split and a new run needs to start
-// somewhere in between.
-// For uses like perspective-correct rasterization or with a discard mask, a
-// run is not an efficient representation, and it is more beneficial to have
-// a flattened array of individual depth samples that can be masked off easily.
-// To support this case, the first run in a given row's run array may have a
-// zero count, signaling that this entire row is flattened. Critically, the
-// depth and count fields in DepthRun are ordered (endian-dependently) so that
-// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It
-// is then possible to just treat the entire row as an array of int32_t depth
-// samples that can be processed with SIMD comparisons, since the count field
-// behaves as just the sign-extension of the depth field. The count field is
-// limited to 8 bits so that we can support depth values up to 24 bits.
-// When a depth buffer is cleared, each row is initialized to maximal runs
-// spanning the entire row. In the normal case, the depth buffer will continue
-// to manage itself as a list of runs. If perspective or discard is used for
-// a given row, the row will be converted to the flattened representation to
-// support it, after which it will only ever revert back to runs if the depth
-// buffer is cleared.
-
-// The largest 24-bit depth value supported.
-constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF;
-// The longest 8-bit depth run that is supported, aligned to SIMD chunk size.
-constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3;
-
-struct DepthRun {
- // Ensure that depth always occupies the LSB and count the MSB so that we
- // can sign-extend depth just by setting count to zero, marking it flat.
- // When count is non-zero, then this is interpreted as an actual run and
- // depth is read in isolation.
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- uint32_t depth : 24;
- uint32_t count : 8;
-#else
- uint32_t count : 8;
- uint32_t depth : 24;
-#endif
-
- DepthRun() = default;
- DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {}
-
- // If count is zero, this is actually a flat depth sample rather than a run.
- bool is_flat() const { return !count; }
-
- // Compare a source depth from rasterization with a stored depth value.
- template <int FUNC>
- ALWAYS_INLINE bool compare(uint32_t src) const {
- switch (FUNC) {
- case GL_LEQUAL:
- return src <= depth;
- case GL_LESS:
- return src < depth;
- case GL_ALWAYS:
- return true;
- default:
- assert(false);
- return false;
- }
- }
-};
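Editor's note: to make the run encoding described in the header comment concrete, here is a small walker sketch (not part of the original file) for one unflattened row; each run lives at its own start index and its count links it to the next run.

// Returns the number of distinct runs in one unflattened row of `width`
// samples.
static int count_depth_runs(const DepthRun* runs, int width) {
  int num = 0;
  for (int x = 0; x < width;) {
    assert(!runs[x].is_flat());
    // runs[x] covers samples [x, x + count) at depth runs[x].depth.
    x += runs[x].count;
    num++;
  }
  return num;
}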
-
-// Fills runs at the given position with the given depth up to the span width.
-static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth,
- uint32_t width) {
- // If the width exceeds the maximum run size, then we need to output clamped
- // runs first.
- for (; width >= MAX_DEPTH_RUN;
- runs += MAX_DEPTH_RUN, width -= MAX_DEPTH_RUN) {
- *runs = DepthRun(depth, MAX_DEPTH_RUN);
- }
- // If there are still any left over samples to fill under the maximum run
- // size, then output one last run for them.
- if (width > 0) {
- *runs = DepthRun(depth, width);
- }
-}
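Editor's note: for example (straight from the constants above), MAX_DEPTH_RUN is 255 & ~3 = 252, so filling a 600-pixel row decomposes into:

// set_depth_runs(runs, z, 600) writes three runs:
//   runs[0]   = DepthRun(z, 252)
//   runs[252] = DepthRun(z, 252)
//   runs[504] = DepthRun(z, 96)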
-
-// A cursor for reading and modifying a row's depth run array. It locates
-// and iterates through a desired span within all the runs, testing if
-// the depth of this span passes or fails the depth test against existing
-// runs. If desired, new runs may be inserted to represent depth occlusion
-// from this span in the run array.
-struct DepthCursor {
- // Current position of run the cursor has advanced to.
- DepthRun* cur = nullptr;
- // The start of the remaining potential samples in the desired span.
- DepthRun* start = nullptr;
- // The end of the potential samples in the desired span.
- DepthRun* end = nullptr;
-
- DepthCursor() = default;
-
- // Construct a cursor with runs for a given row's run array and the bounds
- // of the span we wish to iterate within it.
- DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count)
- : cur(runs), start(&runs[span_offset]), end(start + span_count) {
- // This cursor should never iterate over flat runs
- assert(!runs->is_flat());
- DepthRun* end_runs = &runs[num_runs];
- // Clamp end of span to end of row
- if (end > end_runs) {
- end = end_runs;
- }
- // If the span starts past the end of the row, just advance immediately
- // to it to signal that we're done.
- if (start >= end_runs) {
- cur = end_runs;
- start = end_runs;
- return;
- }
- // Otherwise, find the first depth run that contains the start of the span.
- // If the span starts after the given run, then we need to keep searching
- // through the row to find an appropriate run. The check above already
- // guaranteed that the span starts within the row's runs, and the search
- // won't fall off the end.
- for (;;) {
- assert(cur < end);
- DepthRun* next = cur + cur->count;
- if (start < next) {
- break;
- }
- cur = next;
- }
- }
-
- // The cursor is valid if the current position is at the end or if the run
- // contains the start position.
- bool valid() const {
- return cur >= end || (cur <= start && start < cur + cur->count);
- }
-
- // Skip past any initial runs that fail the depth test. If we find a run that
- // would pass, then return the accumulated length between where we started
- // and that position. Otherwise, if we fall off the end, return -1 to signal
- // that there are no more passed runs at the end of this failed region and
- // so it is safe for the caller to stop processing any more regions in this
- // row.
- template <int FUNC>
- int skip_failed(uint32_t val) {
- assert(valid());
- DepthRun* prev = start;
- while (cur < end) {
- if (cur->compare<FUNC>(val)) {
- return start - prev;
- }
- cur += cur->count;
- start = cur;
- }
- return -1;
- }
-
- // Helper to convert function parameters into template parameters to hoist
- // some checks out of inner loops.
- ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) {
- switch (func) {
- case GL_LEQUAL:
- return skip_failed<GL_LEQUAL>(val);
- case GL_LESS:
- return skip_failed<GL_LESS>(val);
- default:
- assert(false);
- return -1;
- }
- }
-
- // Find a region of runs that passes the depth test. It is assumed the caller
- // has called skip_failed first to skip past any runs that failed the depth
- // test. This stops when it finds a run that fails the depth test or we fall
- // off the end of the row. If the write mask is enabled, this will insert runs
- // to represent this new region that passed the depth test. The length of the
- // region is returned.
- template <int FUNC, bool MASK>
- int check_passed(uint32_t val) {
- assert(valid());
- DepthRun* prev = cur;
- while (cur < end) {
- if (!cur->compare<FUNC>(val)) {
- break;
- }
- DepthRun* next = cur + cur->count;
- if (next > end) {
- if (MASK) {
- // Chop the current run where the end of the span falls, making a new
- // run from the end of the span till the next run. The beginning of
- // the current run will be folded into the run from the start of the
- // passed region before returning below.
- *end = DepthRun(cur->depth, next - end);
- }
- // If the next run starts past the end, then just advance the current
- // run to the end to signal that we're now at the end of the row.
- next = end;
- }
- cur = next;
- }
- // If we haven't advanced past the start of the span region, then we found
- // nothing that passed.
- if (cur <= start) {
- return 0;
- }
- // If 'end' fell within the middle of a passing run, then 'cur' will end up
- // pointing at the new partial run created at 'end' where the passing run
- // was split to accommodate starting in the middle. The preceding runs will
- // be fixed below to properly join with this new split.
- int passed = cur - start;
- if (MASK) {
- // If the search started from a run before the start of the span, then
- // edit that run to meet up with the start.
- if (prev < start) {
- prev->count = start - prev;
- }
- // Create a new run for the entirety of the passed samples.
- set_depth_runs(start, val, passed);
- }
- start = cur;
- return passed;
- }
-
- // Helper to convert function parameters into template parameters to hoist
- // some checks out of inner loops.
- template <bool MASK>
- ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) {
- switch (func) {
- case GL_LEQUAL:
- return check_passed<GL_LEQUAL, MASK>(val);
- case GL_LESS:
- return check_passed<GL_LESS, MASK>(val);
- default:
- assert(false);
- return 0;
- }
- }
-
- ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) {
- return mask ? check_passed<true>(val, func)
- : check_passed<false>(val, func);
- }
-
- // Fill a region of runs with a given depth value, bypassing any depth test.
- ALWAYS_INLINE void fill(uint32_t depth) {
- check_passed<GL_ALWAYS, true>(depth);
- }
-};
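Editor's note: putting the cursor methods together, the intended calling pattern (as described in the comments above; this loop is a sketch, not the actual rasterizer code) alternates between skipping failed regions and committing passed ones:

static void depth_test_span(DepthRun* runs, int num_runs, int offset,
                            int count, uint32_t z, GLenum func, bool mask) {
  DepthCursor cursor(runs, num_runs, offset, count);
  for (;;) {
    // Skip pixels that fail the depth test; -1 means nothing further in this
    // row can pass, so the caller may stop early.
    int skipped = cursor.skip_failed(z, func);
    if (skipped < 0) {
      break;
    }
    // ... advance the color output pointer by `skipped` pixels ...
    // Measure (and, if the depth mask is on, record) the passing region.
    int passed = cursor.check_passed(z, func, mask);
    if (passed <= 0) {
      break;
    }
    // ... shade `passed` pixels here ...
  }
}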
-
-// Initialize a depth texture by setting the first run in each row to encompass
-// the entire row.
-void Texture::init_depth_runs(uint32_t depth) {
- if (!buf) return;
- DepthRun* runs = (DepthRun*)buf;
- for (int y = 0; y < height; y++) {
- set_depth_runs(runs, depth, width);
- runs += stride() / sizeof(DepthRun);
- }
- set_cleared(true);
-}
-
-// Fill a portion of the run array with flattened depth samples.
-static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n,
- uint32_t depth) {
- fill_n((uint32_t*)dst, n, depth);
-}
-
-// Fills a scissored region of a depth texture with a given depth.
-void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) {
- if (!buf) return;
- assert(cleared());
- IntRect bb = bounds().intersection(scissor - offset);
- DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0);
- for (int rows = bb.height(); rows > 0; rows--) {
- if (bb.width() >= width) {
- // If the scissor region encompasses the entire row, reset the row to a
- // single run encompassing the entire row.
- set_depth_runs(runs, depth, width);
- } else if (runs->is_flat()) {
- // If the row is flattened, just directly fill the portion of the row.
- fill_flat_depth(&runs[bb.x0], bb.width(), depth);
- } else {
- // Otherwise, if we are still using runs, then set up a cursor to fill
- // it with depth runs.
- DepthCursor(runs, width, bb.x0, bb.width()).fill(depth);
- }
- runs += stride() / sizeof(DepthRun);
- }
-}
-
-using ZMask = I32;
-
-#if USE_SSE2
-# define ZMASK_NONE_PASSED 0xFFFF
-# define ZMASK_ALL_PASSED 0
-static inline uint32_t zmask_code(ZMask mask) {
- return _mm_movemask_epi8(mask);
-}
-#else
-# define ZMASK_NONE_PASSED 0xFFFFFFFFU
-# define ZMASK_ALL_PASSED 0
-static inline uint32_t zmask_code(ZMask mask) {
- return bit_cast<uint32_t>(CONVERT(mask, U8));
-}
-#endif
-
-// Interprets items in the depth buffer as sign-extended 32-bit depth values
-// instead of as runs. Returns a mask that signals which samples in the given
-// chunk passed or failed the depth test with given Z value.
-template <bool DISCARD>
-static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask,
- int span = 4) {
- // SSE2 does not support unsigned comparison. So ensure Z value is
- // sign-extended to int32_t.
- I32 dest = unaligned_load<I32>(zbuf);
- // Invert the depth test to check which pixels failed and should be discarded.
- ZMask mask = ctx->depthfunc == GL_LEQUAL
- ?
- // GL_LEQUAL: Not(LessEqual) = Greater
- ZMask(src > dest)
- :
- // GL_LESS: Not(Less) = GreaterEqual
- ZMask(src >= dest);
- // Mask off any unused lanes in the span.
- mask |= ZMask(span) < ZMask{1, 2, 3, 4};
- if (zmask_code(mask) == ZMASK_NONE_PASSED) {
- return false;
- }
- if (!DISCARD && ctx->depthmask) {
- unaligned_store(zbuf, (mask & dest) | (~mask & src));
- }
- outmask = mask;
- return true;
-}
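-
-// A scalar sketch of the partial-span lane masking used above: comparing the
-// span length against the constants {1, 2, 3, 4} flags exactly the trailing
-// lanes of a 4-pixel chunk that lie past the end of the span, so they fold
-// into the "failed" mask.
-#include <cstdio>
-
-int main() {
-  const int lane_cutoff[4] = {1, 2, 3, 4};  // same constants as the ZMask above
-  for (int span = 1; span <= 4; span++) {
-    std::printf("span=%d masked lanes:", span);
-    for (int lane = 0; lane < 4; lane++) {
-      if (span < lane_cutoff[lane]) std::printf(" %d", lane);
-    }
-    std::printf("\n");
-  }
-  return 0;
-}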
-
-static ALWAYS_INLINE I32 packDepth() {
- return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE);
-}
-
-static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) {
- if (ctx->depthmask) {
- I32 dest = unaligned_load<I32>(zbuf);
- mask |= fragment_shader->swgl_IsPixelDiscarded;
- unaligned_store(zbuf, (mask & dest) | (~mask & src));
- }
-}
-
-static ALWAYS_INLINE void mask_output(uint32_t* buf, ZMask zmask,
- int span = 4) {
- WideRGBA8 r = pack_pixels_RGBA8();
- PackedRGBA8 dst = load_span<PackedRGBA8>(buf, span);
- if (blend_key) r = blend_pixels(buf, dst, r, span);
- PackedRGBA8 mask = bit_cast<PackedRGBA8>(zmask);
- store_span(buf, (mask & dst) | (~mask & pack(r)), span);
-}
-
-template <bool DISCARD>
-static ALWAYS_INLINE void discard_output(uint32_t* buf, int span = 4) {
- mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span);
-}
-
-template <>
-ALWAYS_INLINE void discard_output<false>(uint32_t* buf, int span) {
- WideRGBA8 r = pack_pixels_RGBA8();
- if (blend_key)
- r = blend_pixels(buf, load_span<PackedRGBA8>(buf, span), r, span);
- store_span(buf, pack(r), span);
-}
-
-static ALWAYS_INLINE void mask_output(uint8_t* buf, ZMask zmask, int span = 4) {
- WideR8 r = pack_pixels_R8();
- WideR8 dst = unpack(load_span<PackedR8>(buf, span));
- if (blend_key) r = blend_pixels(buf, dst, r, span);
- WideR8 mask = packR8(zmask);
- store_span(buf, pack((mask & dst) | (~mask & r)), span);
-}
-
-template <bool DISCARD>
-static ALWAYS_INLINE void discard_output(uint8_t* buf, int span = 4) {
- mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span);
-}
-
-template <>
-ALWAYS_INLINE void discard_output<false>(uint8_t* buf, int span) {
- WideR8 r = pack_pixels_R8();
- if (blend_key)
- r = blend_pixels(buf, unpack(load_span<PackedR8>(buf, span)), r, span);
- store_span(buf, pack(r), span);
-}
-
-struct ClipRect {
- float x0;
- float y0;
- float x1;
- float y1;
-
- explicit ClipRect(const IntRect& i)
- : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
- explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) {
- // If blending is enabled, set blend_key to reflect the resolved blend
- // state for the currently drawn primitive.
- if (ctx->blend) {
- blend_key = ctx->blend_key;
- if (swgl_ClipFlags) {
- // If there is a blend override set, replace the blend key with it.
- if (swgl_ClipFlags & SWGL_CLIP_FLAG_BLEND_OVERRIDE) {
- blend_key = swgl_BlendOverride;
- }
- // If a clip mask is available, set up blending state to use the clip
- // mask.
- if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {
- assert(swgl_ClipMask->format == TextureFormat::R8);
- // Constrain the clip mask bounds to always fall within the clip mask.
- swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width),
- int(swgl_ClipMask->height)});
- // The clip mask offset is relative to the viewport.
- swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset;
- // The clip mask bounds are relative to the clip mask offset.
- swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset);
- // Finally, constrain the clip rectangle by the clip mask bounds.
- intersect(swgl_ClipMaskBounds);
- // Modify the blend key so that it will use the clip mask while
- // blending.
- restore_clip_mask();
- }
- if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) {
- // Modify the blend key so that it will use AA while blending.
- restore_aa();
- }
- }
- } else {
- blend_key = BLEND_KEY_NONE;
- swgl_ClipFlags = 0;
- }
- }
-
- FloatRange x_range() const { return {x0, x1}; }
-
- void intersect(const IntRect& c) {
- x0 = max(x0, float(c.x0));
- y0 = max(y0, float(c.y0));
- x1 = min(x1, float(c.x1));
- y1 = min(y1, float(c.y1));
- }
-
- template <typename P>
- void set_clip_mask(int x, int y, P* buf) const {
- if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) {
- swgl_SpanBuf = buf;
- swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf +
- (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride +
- (x - swgl_ClipMaskOffset.x);
- }
- }
-
- template <typename P>
- bool overlaps(int nump, const P* p) const {
- // Generate a mask of which side of the clip rect all of a polygon's points
- // fall inside of. This is a cheap conservative estimate of whether the
- // bounding box of the polygon might overlap the clip rect, rather than an
- // exact test that would require multiple slower line intersections.
- int sides = 0;
- for (int i = 0; i < nump; i++) {
- sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
- sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
- }
- return sides == 0xF;
- }
-};
-
-// Given a current X position at the center Y position of a row, return the X
-// position of the left and right intercepts of the row top and bottom.
-template <typename E>
-static ALWAYS_INLINE FloatRange x_intercepts(const E& e) {
- float rad = 0.5f * abs(e.x_slope());
- return {e.cur_x() - rad, e.cur_x() + rad};
-}
-
-// Return the AA sub-span corresponding to a given edge. If AA is requested,
-// then this finds the X intercepts with the row clipped into range of the
-// edge and finally conservatively rounds them out. If there is no AA, then
-// it just returns the current rounded X position clipped within bounds.
-template <typename E>
-static ALWAYS_INLINE IntRange aa_edge(const E& e, const FloatRange& bounds) {
- return e.edgeMask ? bounds.clip(x_intercepts(e)).round_out()
- : bounds.clip({e.cur_x(), e.cur_x()}).round();
-}
-
-// Calculate the initial AA coverage as an approximation of the distance from
-// the center of the pixel in the direction of the edge slope. Given an edge
-// (x,y)..(x+dx,y+dy), then the normalized tangent vector along the edge is
-// (dx,dy)/sqrt(dx^2+dy^2). We know that for dy=1 then dx=e.x_slope. We rotate
-// the tangent vector either -90 or +90 degrees to get the edge normal vector,
-// where dx' = -dy and dy' = dx. Once normalized by 1/sqrt(dx^2+dy^2), scale into
-// the range of 0..256 so that we can cheaply convert to a fixed-point scale
-// factor. It is assumed that at exactly the pixel center the opacity is half
-// (128) and linearly decreases along the normal vector at 1:1 scale with the
-// slope. While not entirely accurate, this gives a reasonably agreeable looking
-// approximation of AA. For edges on which there is no AA, just force the
-// opacity to maximum (256) with no slope, relying on the span clipping to trim
-// pixels outside the span.
-template <typename E>
-static ALWAYS_INLINE FloatRange aa_dist(const E& e, float dir) {
- if (e.edgeMask) {
- float dx = (dir * 256.0f) * inversesqrt(1.0f + e.x_slope() * e.x_slope());
- return {128.0f + dx * (e.cur_x() - 0.5f), -dx};
- } else {
- return {256.0f, 0.0f};
- }
-}
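-
-// A tiny standalone check of the scale factor described above: the per-pixel
-// coverage step is 256 / sqrt(1 + slope^2), so a vertical edge (slope 0) steps
-// by a full 256 per pixel while a 45-degree edge (slope 1) steps by about 181.
-#include <cmath>
-#include <cstdio>
-
-static float coverage_step(float x_slope) {
-  return 256.0f / std::sqrt(1.0f + x_slope * x_slope);
-}
-
-int main() {
-  std::printf("slope 0: %.1f per pixel\n", coverage_step(0.0f));  // 256.0
-  std::printf("slope 1: %.1f per pixel\n", coverage_step(1.0f));  // ~181.0
-  return 0;
-}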
-
-template <typename P, typename E>
-static ALWAYS_INLINE IntRange aa_span(P* buf, const E& left, const E& right,
- const FloatRange& bounds) {
- // If there is no AA, just return the span from the rounded left edge X
- // position to the rounded right edge X position. Clip the span to be within
- // the valid bounds.
- if (!(swgl_ClipFlags & SWGL_CLIP_FLAG_AA)) {
- return bounds.clip({left.cur_x(), right.cur_x()}).round();
- }
-
- // Calculate the left and right AA spans along with the coverage distances
- // and slopes necessary to do blending.
- IntRange leftAA = aa_edge(left, bounds);
- FloatRange leftDist = aa_dist(left, -1.0f);
- IntRange rightAA = aa_edge(right, bounds);
- FloatRange rightDist = aa_dist(right, 1.0f);
-
- // Use the pointer into the destination buffer as a status indicator of the
- // coverage offset. The pointer is chosen so that subtracting it from the
- // current destination pointer yields a negative value if the pixel lies
- // before the opaque area, a value of at least the opaque size if it lies
- // past the opaque area, and a value in between while inside it. The pointer
- // is stored as a uint8 pointer so that the subtraction involves no hidden
- // multiplication and yields a 1:1 linear byte offset. Thus the size of the
- // opaque region must also be scaled by the pixel size in bytes.
- swgl_OpaqueStart = (const uint8_t*)(buf + leftAA.end);
- swgl_OpaqueSize = max(rightAA.start - leftAA.end - 3, 0) * sizeof(P);
-
- // Offset the coverage distances by the end of the left AA span, which
- // corresponds to the opaque start pointer, so that pixels become opaque
- // immediately after. The distances are also offset for each lane in the
- // chunk.
- Float offset = cast(leftAA.end + (I32){0, 1, 2, 3});
- swgl_LeftAADist = leftDist.start + offset * leftDist.end;
- swgl_RightAADist = rightDist.start + offset * rightDist.end;
- swgl_AASlope =
- (Float){leftDist.end, rightDist.end, 0.0f, 0.0f} / float(sizeof(P));
-
- // Return the full span width from the start of the left span to the end of
- // the right span.
- return {leftAA.start, rightAA.end};
-}
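-
-// A simplified scalar sketch of the classification that the opaque-start
-// pointer encodes above, ignoring the byte scaling and the small trim applied
-// to the opaque size: offsets before the opaque start still need left-edge AA,
-// offsets at or past the opaque size need right-edge AA, and everything in
-// between can skip AA blending.
-#include <cstdio>
-
-int main() {
-  const int opaque_start = 4;  // pixels covered by the left AA fringe
-  const int opaque_size = 8;   // width of the fully opaque middle
-  for (int x = 0; x < 14; x++) {
-    int off = x - opaque_start;  // analogue of buf - swgl_OpaqueStart
-    const char* kind =
-        off < 0 ? "left AA" : off >= opaque_size ? "right AA" : "opaque";
-    std::printf("pixel %2d: %s\n", x, kind);
-  }
-  return 0;
-}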
-
-// Calculate the span the user clip distances occupy from the left and right
-// edges at the current row.
-template <typename E>
-static ALWAYS_INLINE IntRange clip_distance_range(const E& left,
- const E& right) {
- Float leftClip = get_clip_distances(left.interp);
- Float rightClip = get_clip_distances(right.interp);
- // Get the change in clip dist per X step.
- Float clipStep = (rightClip - leftClip) / (right.cur_x() - left.cur_x());
- // Find the zero intercepts starting from the left edge.
- Float clipDist = left.cur_x() - leftClip * recip(clipStep);
- // Find the distance to the start of the span for any clip distances that
- // are increasing in value. If the clip distance is constant or decreasing
- // in value, then check if it starts outside the clip volume.
- Float start = if_then_else(clipStep > 0.0f, clipDist,
- if_then_else(leftClip < 0.0f, 1.0e6f, 0.0f));
- // Find the distance to the end of the span for any clip distances that are
- // decreasing in value. If the clip distance is constant or increasing in
- // value, then check if it ends inside the clip volume.
- Float end = if_then_else(clipStep < 0.0f, clipDist,
- if_then_else(rightClip >= 0.0f, 1.0e6f, 0.0f));
- // Find the furthest start offset.
- start = max(start, start.zwxy);
- // Find the closest end offset.
- end = min(end, end.zwxy);
- // Finally, round the offsets to an integer span that can be used to bound
- // the current span.
- return FloatRange{max(start.x, start.y), min(end.x, end.y)}.round();
-}
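-
-// A scalar sketch of the zero-intercept computation above for a single clip
-// plane: with clip-distance samples at the left and right ends of the span,
-// the distance crosses zero at left_x - left_clip / step, which bounds where
-// the span may start (or end).
-#include <cstdio>
-
-int main() {
-  float left_x = 10.0f, right_x = 20.0f;
-  float left_clip = -2.0f, right_clip = 3.0f;  // increasing => clipped at start
-  float step = (right_clip - left_clip) / (right_x - left_x);  // 0.5 per pixel
-  float zero_x = left_x - left_clip / step;                    // 14.0
-  std::printf("span becomes unclipped at x = %.1f\n", zero_x);
-  return 0;
-}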
-
-// Converts a run array into a flattened array of depth samples. This just
-// walks through every run and fills the samples with the depth value from
-// the run.
-static void flatten_depth_runs(DepthRun* runs, size_t width) {
- if (runs->is_flat()) {
- return;
- }
- while (width > 0) {
- size_t n = runs->count;
- fill_flat_depth(runs, n, runs->depth);
- runs += n;
- width -= n;
- }
-}
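-
-// A minimal standalone sketch of the same expansion, using a simplified run
-// type rather than the real DepthRun layout: each {depth, count} run is
-// replaced by count copies of its depth value.
-#include <cstdint>
-#include <cstdio>
-#include <vector>
-
-struct SimpleRun {  // hypothetical stand-in for DepthRun
-  uint32_t depth;
-  uint32_t count;
-};
-
-static std::vector<uint32_t> flatten(const std::vector<SimpleRun>& runs) {
-  std::vector<uint32_t> samples;
-  for (const SimpleRun& r : runs) {
-    samples.insert(samples.end(), r.count, r.depth);  // fill_flat_depth analogue
-  }
-  return samples;
-}
-
-int main() {
-  for (uint32_t d : flatten({{5, 3}, {9, 2}})) std::printf("%u ", d);
-  std::printf("\n");  // prints: 5 5 5 9 9
-  return 0;
-}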
-
-// Helper function for drawing passed depth runs within the depth buffer.
-// Flattened depth (perspective or discard) is not supported.
-template <typename P>
-static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf,
- DepthCursor& cursor) {
- for (;;) {
- // Get the span that passes the depth test. Assume on entry that
- // any failed runs have already been skipped.
- int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask);
- // If nothing passed, since we already skipped past any failed runs
- // previously, we must have hit the end of the row. Bail out.
- if (span <= 0) {
- break;
- }
- if (span >= 4) {
- // If we have a draw specialization, try to process as many 4-pixel
- // chunks as possible using it.
- if (fragment_shader->has_draw_span(buf)) {
- int drawn = fragment_shader->draw_span(buf, span & ~3);
- buf += drawn;
- span -= drawn;
- }
- // Otherwise, just process each chunk individually.
- while (span >= 4) {
- fragment_shader->run();
- discard_output<false>(buf);
- buf += 4;
- span -= 4;
- }
- }
- // If we have a partial chunk left over, we still have to process it as if
- // it were a full chunk. Mask off only the part of the chunk we want to
- // use.
- if (span > 0) {
- fragment_shader->run();
- discard_output<false>(buf, span);
- buf += span;
- }
- // Skip past any runs that fail the depth test.
- int skip = cursor.skip_failed(z, ctx->depthfunc);
- // If there aren't any, that means we won't encounter any more passing runs
- // and so it's safe to bail out.
- if (skip <= 0) {
- break;
- }
- // Advance interpolants for the fragment shader past the skipped region.
- // If we processed a partial chunk above, we actually advanced the
- // interpolants a full chunk in the fragment shader's run function. Thus,
- // we need to first subtract off that 4-pixel chunk and only partially
- // advance them to that partial chunk before we can add on the rest of the
- // skips. This is combined with the skip here for efficiency's sake.
- fragment_shader->skip(skip - (span > 0 ? 4 - span : 0));
- buf += skip;
- }
-}
-
-// Draw a simple span in 4-pixel wide chunks, optionally using depth.
-template <bool DISCARD, bool W, typename P, typename Z>
-static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) {
- if (depth) {
- // Depth testing is enabled. If perspective is used, Z values will vary
- // across the span, so we use packDepth to generate packed Z values suitable
- // for depth testing based on current values from gl_FragCoord.z.
- // Otherwise, for the no-perspective case, we just use the provided Z.
- // Process 4-pixel chunks first.
- for (; span >= 4; span -= 4, buf += 4, depth += 4) {
- I32 zsrc = z();
- ZMask zmask;
- if (check_depth<DISCARD>(zsrc, depth, zmask)) {
- fragment_shader->run<W>();
- mask_output(buf, zmask);
- if (DISCARD) discard_depth(zsrc, depth, zmask);
- } else {
- fragment_shader->skip<W>();
- }
- }
- // If there are any remaining pixels, do a partial chunk.
- if (span > 0) {
- I32 zsrc = z();
- ZMask zmask;
- if (check_depth<DISCARD>(zsrc, depth, zmask, span)) {
- fragment_shader->run<W>();
- mask_output(buf, zmask, span);
- if (DISCARD) discard_depth(zsrc, depth, zmask);
- }
- }
- } else {
- // Process 4-pixel chunks first.
- for (; span >= 4; span -= 4, buf += 4) {
- fragment_shader->run<W>();
- discard_output<DISCARD>(buf);
- }
- // If there are any remaining pixels, do a partial chunk.
- if (span > 0) {
- fragment_shader->run<W>();
- discard_output<DISCARD>(buf, span);
- }
- }
-}
-
-// Called during rasterization to forcefully clear a row on which delayed clear
-// has been enabled. If we know that we are going to completely overwrite a part
-// of the row, then we only need to clear the row outside of that part. However,
-// if blending or discard is enabled, the values of that underlying part of the
-// row may be used regardless to produce the final rasterization result, so we
-// have to then clear the entire underlying row to prepare it.
-template <typename P>
-static inline void prepare_row(Texture& colortex, int y, int startx, int endx,
- bool use_discard, DepthRun* depth,
- uint32_t z = 0, DepthCursor* cursor = nullptr) {
- assert(colortex.delay_clear > 0);
- // Delayed clear is enabled for the color buffer. Check if needs clear.
- uint32_t& mask = colortex.cleared_rows[y / 32];
- if ((mask & (1 << (y & 31))) == 0) {
- mask |= 1 << (y & 31);
- colortex.delay_clear--;
- if (blend_key || use_discard) {
- // If depth test, blending, or discard is used, old color values
- // might be sampled, so we need to clear the entire row to fill it.
- force_clear_row<P>(colortex, y);
- } else if (depth) {
- if (depth->is_flat() || !cursor) {
- // If flat depth is used, we can't cheaply predict which samples will
- // pass.
- force_clear_row<P>(colortex, y);
- } else {
- // Otherwise if depth runs are used, see how many samples initially pass
- // the depth test and only fill the row outside those. The fragment
- // shader will fill the row within the passed samples.
- int passed =
- DepthCursor(*cursor).check_passed<false>(z, ctx->depthfunc);
- if (startx > 0 || startx + passed < colortex.width) {
- force_clear_row<P>(colortex, y, startx, startx + passed);
- }
- }
- } else if (startx > 0 || endx < colortex.width) {
- // Otherwise, we only need to clear the row outside of the span.
- // The fragment shader will fill the row within the span itself.
- force_clear_row<P>(colortex, y, startx, endx);
- }
- }
-}
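-
-// A standalone sketch of the delayed-clear bookkeeping used above, with a
-// hypothetical tracker type standing in for Texture: one bit per row, 32 rows
-// per word, so row y maps to word y / 32 and bit y & 31, and the first touch
-// of a row is what triggers its deferred clear.
-#include <cstdint>
-#include <cstdio>
-#include <vector>
-
-struct RowClearTracker {  // hypothetical, not the SWGL Texture type
-  std::vector<uint32_t> cleared_rows;
-  int delay_clear;
-
-  explicit RowClearTracker(int height)
-      : cleared_rows((height + 31) / 32, 0), delay_clear(height) {}
-
-  // Returns true only the first time a row is touched, i.e. when it still
-  // needs its deferred clear.
-  bool needs_clear(int y) {
-    uint32_t& mask = cleared_rows[y / 32];
-    uint32_t bit = 1u << (y & 31);
-    if (mask & bit) return false;
-    mask |= bit;
-    delay_clear--;
-    return true;
-  }
-};
-
-int main() {
-  RowClearTracker tracker(64);
-  std::printf("%d %d\n", tracker.needs_clear(40), tracker.needs_clear(40));  // 1 0
-  return 0;
-}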
-
-// Perpendicular dot-product is the dot-product of a vector with the
-// perpendicular vector of the other, i.e. dot(a, {b.y, -b.x}), which equals
-// the 2D scalar cross product of a and b.
-template <typename T>
-static ALWAYS_INLINE auto perpDot(T a, T b) {
- return a.x * b.y - a.y * b.x;
-}
-
-// Check if the winding of the initial edges is flipped, requiring us to swap
-// the edges to avoid spans having negative lengths. Assume that l0.y == r0.y
-// due to the initial edge scan in draw_quad/perspective_spans.
-template <typename T>
-static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) {
- // If the starting point of the left edge is to the right of the starting
- // point of the right edge, then just assume the edges are flipped. If the
- // left and right starting points are the same, then check the sign of the
- // cross-product of the edges to see if the edges are flipped. Otherwise,
- // if the left starting point is actually just to the left of the right
- // starting point, then assume no edge flip.
- return l0.x > r0.x || (l0.x == r0.x && perpDot(l1 - l0, r1 - r0) > 0.0f);
-}
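-
-// A tiny numeric check of the winding test above: when both edges start at the
-// same point, a positive perp-dot of the edge vectors means the nominal "left"
-// edge actually leans to the right of the "right" edge, so the two must swap.
-#include <cstdio>
-
-struct P2 {
-  float x, y;
-};
-
-static float perp_dot(P2 a, P2 b) { return a.x * b.y - a.y * b.x; }
-
-int main() {
-  P2 l0{0, 0}, l1{2, 2};   // "left" edge leaning right
-  P2 r0{0, 0}, r1{-1, 2};  // "right" edge leaning left
-  P2 dl{l1.x - l0.x, l1.y - l0.y};
-  P2 dr{r1.x - r0.x, r1.y - r0.y};
-  bool flipped = l0.x > r0.x || (l0.x == r0.x && perp_dot(dl, dr) > 0.0f);
-  std::printf("flipped = %d\n", flipped);  // prints 1
-  return 0;
-}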
-
-// Draw spans for each row of a given quad (or triangle) with a constant Z
-// value. The quad is assumed convex. It is clipped to fall within the given
-// clip rect. In short, this function rasterizes a quad by first finding a
-// top-most starting point and then from there tracing down the left and right
-// sides of this quad until it hits the bottom, outputting a span between the
-// current left and right positions at each row along the way. Points are
-// assumed to be ordered in either CW or CCW to support this, but currently
-// both orders (CW and CCW) are supported and equivalent.
-template <typename P>
-static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z,
- Interpolants interp_outs[4],
- Texture& colortex, Texture& depthtex,
- const ClipRect& clipRect) {
- // Only triangles and convex quads supported.
- assert(nump == 3 || nump == 4);
-
- Point2D l0, r0, l1, r1;
- int l0i, r0i, l1i, r1i;
- {
- // Find the index of the top-most (smallest Y) point from which
- // rasterization can start.
- int top = nump > 3 && p[3].y < p[2].y
- ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3)
- : (p[1].y < p[3].y ? 1 : 3))
- : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2)
- : (p[1].y < p[2].y ? 1 : 2));
- // Helper to find next index in the points array, walking forward.
-#define NEXT_POINT(idx) \
- ({ \
- int cur = (idx) + 1; \
- cur < nump ? cur : 0; \
- })
- // Helper to find the previous index in the points array, walking backward.
-#define PREV_POINT(idx) \
- ({ \
- int cur = (idx)-1; \
- cur >= 0 ? cur : nump - 1; \
- })
- // Start looking for "left"-side and "right"-side descending edges starting
- // from the determined top point.
- int next = NEXT_POINT(top);
- int prev = PREV_POINT(top);
- if (p[top].y == p[next].y) {
- // If the next point is on the same row as the top, then advance one more
- // time to the next point and use that as the "left" descending edge.
- l0i = next;
- l1i = NEXT_POINT(next);
- // Assume top and prev form a descending "right" edge, as otherwise this
- // will be a collapsed polygon and harmlessly bail out down below.
- r0i = top;
- r1i = prev;
- } else if (p[top].y == p[prev].y) {
- // If the prev point is on the same row as the top, then advance to the
- // prev again and use that as the "right" descending edge.
- // Assume top and next form a non-empty descending "left" edge.
- l0i = top;
- l1i = next;
- r0i = prev;
- r1i = PREV_POINT(prev);
- } else {
- // Both next and prev are on distinct rows from top, so both "left" and
- // "right" edges are non-empty/descending.
- l0i = r0i = top;
- l1i = next;
- r1i = prev;
- }
- // Load the points from the indices.
- l0 = p[l0i]; // Start of left edge
- r0 = p[r0i]; // Start of right edge
- l1 = p[l1i]; // End of left edge
- r1 = p[r1i]; // End of right edge
- // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1:
- // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i,
- // r1.x, r1.y);
- }
-
- struct Edge {
- float yScale;
- float xSlope;
- float x;
- Interpolants interpSlope;
- Interpolants interp;
- bool edgeMask;
-
- Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0,
- const Interpolants& i1, int edgeIndex)
- : // Inverse Y scale for slope calculations. Avoid divide on 0-length
- // edge. Later checks below ensure that Y <= p1.y, or otherwise we
- // don't use this edge. We just need to guard against Y == p1.y ==
- // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes
- // below, except if yScale is Inf for some reason (or worse, NaN),
- // which 1/(p1.y-p0.y) might produce if we don't bound it.
- yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
- // Calculate dX/dY slope
- xSlope((p1.x - p0.x) * yScale),
- // Initialize current X based on Y and slope
- x(p0.x + (y - p0.y) * xSlope),
- // Calculate change in interpolants per change in Y
- interpSlope((i1 - i0) * yScale),
- // Initialize current interpolants based on Y and slope
- interp(i0 + (y - p0.y) * interpSlope),
- // Extract the edge mask status for this edge
- edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {}
-
- void nextRow() {
- // step current X and interpolants to next row from slope
- x += xSlope;
- interp += interpSlope;
- }
-
- float cur_x() const { return x; }
- float x_slope() const { return xSlope; }
- };
-
- // Vertex selection above should result in equal left and right start rows
- assert(l0.y == r0.y);
- // Find the start y, clip to within the clip rect, and round to row center.
- // If AA is enabled, round out conservatively rather than round to nearest.
- float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f;
- float y = floor(max(l0.y, clipRect.y0) + aaRound) + 0.5f;
- // Initialize left and right edges from end points and start Y
- Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
- Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
- // WR does not use backface culling, so check if edges are flipped.
- bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1);
- if (flipped) swap(left, right);
- // Get pointer to color buffer and depth buffer at current Y
- P* fbuf = (P*)colortex.sample_ptr(0, int(y));
- DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y));
- // Loop along advancing Ys, rasterizing spans at each row
- float checkY = min(min(l1.y, r1.y), clipRect.y1);
- // Ensure we don't rasterize outside the edge bounds
- FloatRange clipSpan =
- clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
- for (;;) {
- // Check if we maybe passed edge ends or outside clip rect...
- if (y > checkY) {
- // If we're outside the clip rect, we're done.
- if (y > clipRect.y1) break;
- // Helper to find the next non-duplicate vertex that doesn't loop back.
-#define STEP_EDGE(y, e0i, e0, e1i, e1, STEP_POINT, end) \
- do { \
- /* Set new start of edge to be end of old edge */ \
- e0i = e1i; \
- e0 = e1; \
- /* Set new end of edge to next point */ \
- e1i = STEP_POINT(e1i); \
- e1 = p[e1i]; \
- /* If the edge crossed the end, we're done. */ \
- if (e0i == end) return; \
- /* Otherwise, it doesn't advance, so keep searching. */ \
- } while (y > e1.y)
- // Check if Y advanced past the end of the left edge
- if (y > l1.y) {
- // Step to next left edge past Y and reset edge interpolants.
- STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i);
- (flipped ? right : left) =
- Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
- }
- // Check if Y advanced past the end of the right edge
- if (y > r1.y) {
- // Step to next right edge past Y and reset edge interpolants.
- STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i);
- (flipped ? left : right) =
- Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
- }
- // Reset the clip bounds for the new edges
- clipSpan =
- clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
- // Reset check condition for next time around.
- checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1);
- }
-
- // Calculate a potentially AA'd span and check if it is non-empty.
- IntRange span = aa_span(fbuf, left, right, clipSpan);
- if (span.len() > 0) {
- // If user clip planes are enabled, use them to bound the current span.
- if (vertex_shader->use_clip_distance()) {
- span = span.intersect(clip_distance_range(left, right));
- if (span.len() <= 0) goto next_span;
- }
- ctx->shaded_rows++;
- ctx->shaded_pixels += span.len();
- // Advance color/depth buffer pointers to the start of the span.
- P* buf = fbuf + span.start;
- // Check if we will need to use depth-buffer or discard on this span.
- DepthRun* depth =
- depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
- DepthCursor cursor;
- bool use_discard = fragment_shader->use_discard();
- if (use_discard) {
- if (depth) {
- // If we're using discard, we may have to unpredictably drop out some
- // samples. Flatten the depth run array here to allow this.
- if (!depth->is_flat()) {
- flatten_depth_runs(depth, depthtex.width);
- }
- // Advance to the depth sample at the start of the span.
- depth += span.start;
- }
- } else if (depth) {
- if (!depth->is_flat()) {
- // We're not using discard and the depth row is still organized into
- // runs. Skip past any runs that would fail the depth test so we
- // don't have to do any extra work to process them with the rest of
- // the span.
- cursor = DepthCursor(depth, depthtex.width, span.start, span.len());
- int skipped = cursor.skip_failed(z, ctx->depthfunc);
- // If we fell off the row, that means we couldn't find any passing
- // runs. We can just skip the entire span.
- if (skipped < 0) {
- goto next_span;
- }
- buf += skipped;
- span.start += skipped;
- } else {
- // The row is already flattened, so just advance to the span start.
- depth += span.start;
- }
- }
-
- if (colortex.delay_clear) {
- // Delayed clear is enabled for the color buffer. Check if needs clear.
- prepare_row<P>(colortex, int(y), span.start, span.end, use_discard,
- depth, z, &cursor);
- }
-
- // Initialize fragment shader interpolants to current span position.
- fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1);
- fragment_shader->gl_FragCoord.y = y;
- {
- // Change in interpolants is difference between current right and left
- // edges per the change in right and left X.
- Interpolants step =
- (right.interp - left.interp) * (1.0f / (right.x - left.x));
- // Advance current interpolants to X at start of span.
- Interpolants o = left.interp + step * (span.start + 0.5f - left.x);
- fragment_shader->init_span(&o, &step);
- }
- clipRect.set_clip_mask(span.start, y, buf);
- if (!use_discard) {
- // Fast paths for the case where fragment discard is not used.
- if (depth) {
- // If depth is used, we want to process entire depth runs if depth is
- // not flattened.
- if (!depth->is_flat()) {
- draw_depth_span(z, buf, cursor);
- goto next_span;
- }
- // Otherwise, flattened depth must fall back to the slightly slower
- // per-chunk depth test path in draw_span below.
- } else {
- // Check if the fragment shader has an optimized draw specialization.
- if (span.len() >= 4 && fragment_shader->has_draw_span(buf)) {
- // Draw specialization expects 4-pixel chunks.
- int drawn = fragment_shader->draw_span(buf, span.len() & ~3);
- buf += drawn;
- span.start += drawn;
- }
- }
- draw_span<false, false>(buf, depth, span.len(), [=] { return z; });
- } else {
- // If discard is used, then use slower fallbacks. This should be rare.
- // Just needs to work, doesn't need to be too fast yet...
- draw_span<true, false>(buf, depth, span.len(), [=] { return z; });
- }
- }
- next_span:
- // Advance Y and edge interpolants to next row.
- y++;
- left.nextRow();
- right.nextRow();
- // Advance buffers to next row.
- fbuf += colortex.stride() / sizeof(P);
- fdepth += depthtex.stride() / sizeof(DepthRun);
- }
-}
-
-// Draw perspective-correct spans for a convex quad that has been clipped to
-// the near and far Z planes, possibly producing a clipped convex polygon with
-// more than 4 sides. This assumes the Z value will vary across the spans and
-// requires interpolants to factor in W values. This tends to be slower than
-// the simpler 2D draw_quad_spans above, especially since we can't optimize the
-// depth test easily when Z values vary, and should be used only rarely if possible.
-template <typename P>
-static inline void draw_perspective_spans(int nump, Point3D* p,
- Interpolants* interp_outs,
- Texture& colortex, Texture& depthtex,
- const ClipRect& clipRect) {
- Point3D l0, r0, l1, r1;
- int l0i, r0i, l1i, r1i;
- {
- // Find the index of the top-most point (smallest Y) from which
- // rasterization can start.
- int top = 0;
- for (int i = 1; i < nump; i++) {
- if (p[i].y < p[top].y) {
- top = i;
- }
- }
- // Find left-most top point, the start of the left descending edge.
- // Advance forward in the points array, searching at most nump points
- // in case the polygon is flat.
- l0i = top;
- for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
- l0i = i;
- }
- if (l0i == nump - 1) {
- for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
- l0i = i;
- }
- }
- // Find right-most top point, the start of the right descending edge.
- // Advance backward in the points array, searching at most nump points.
- r0i = top;
- for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
- r0i = i;
- }
- if (r0i == 0) {
- for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
- r0i = i;
- }
- }
- // End of left edge is next point after left edge start.
- l1i = NEXT_POINT(l0i);
- // End of right edge is prev point after right edge start.
- r1i = PREV_POINT(r0i);
- l0 = p[l0i]; // Start of left edge
- r0 = p[r0i]; // Start of right edge
- l1 = p[l1i]; // End of left edge
- r1 = p[r1i]; // End of right edge
- }
-
- struct Edge {
- float yScale;
- // Current coordinates for edge. Whereas in the 2D case of draw_quad_spans,
- // it is enough to just track the X coordinate as we advance along the rows,
- // for the perspective case we also need to keep track of Z and W. For
- // simplicity, we just use the full 3D point to track all these coordinates.
- Point3D pSlope;
- Point3D p;
- Interpolants interpSlope;
- Interpolants interp;
- bool edgeMask;
-
- Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0,
- const Interpolants& i1, int edgeIndex)
- : // Inverse Y scale for slope calculations. Avoid divide on 0-length
- // edge.
- yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
- // Calculate dX/dY slope
- pSlope((p1 - p0) * yScale),
- // Initialize current coords based on Y and slope
- p(p0 + (y - p0.y) * pSlope),
- // Crucially, these interpolants must be scaled by the point's 1/w
- // value, which allows linear interpolation in a perspective-correct
- // manner. This will be canceled out inside the fragment shader later.
- // Calculate change in interpolants per change in Y
- interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
- // Initialize current interpolants based on Y and slope
- interp(i0 * p0.w + (y - p0.y) * interpSlope),
- // Extract the edge mask status for this edge
- edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {}
-
- float x() const { return p.x; }
- vec2_scalar zw() const { return {p.z, p.w}; }
-
- void nextRow() {
- // step current coords and interpolants to next row from slope
- p += pSlope;
- interp += interpSlope;
- }
-
- float cur_x() const { return p.x; }
- float x_slope() const { return pSlope.x; }
- };
-
- // Vertex selection above should result in equal left and right start rows
- assert(l0.y == r0.y);
- // Find the start y, clip to within the clip rect, and round to row center.
- // If AA is enabled, round out conservatively rather than round to nearest.
- float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f;
- float y = floor(max(l0.y, clipRect.y0) + aaRound) + 0.5f;
- // Initialize left and right edges from end points and start Y
- Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
- Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
- // WR does not use backface culling, so check if edges are flipped.
- bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1);
- if (flipped) swap(left, right);
- // Get pointer to color buffer and depth buffer at current Y
- P* fbuf = (P*)colortex.sample_ptr(0, int(y));
- DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y));
- // Loop along advancing Ys, rasterizing spans at each row
- float checkY = min(min(l1.y, r1.y), clipRect.y1);
- // Ensure we don't rasterize outside the edge bounds
- FloatRange clipSpan =
- clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
- for (;;) {
- // Check if we maybe passed edge ends or outside clip rect...
- if (y > checkY) {
- // If we're outside the clip rect, we're done.
- if (y > clipRect.y1) break;
- // Check if Y advanced past the end of the left edge
- if (y > l1.y) {
- // Step to next left edge past Y and reset edge interpolants.
- STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i);
- (flipped ? right : left) =
- Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i);
- }
- // Check if Y advanced past the end of the right edge
- if (y > r1.y) {
- // Step to next right edge past Y and reset edge interpolants.
- STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i);
- (flipped ? left : right) =
- Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i);
- }
- // Reset the clip bounds for the new edges
- clipSpan =
- clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1)));
- // Reset check condition for next time around.
- checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1);
- }
-
- // Calculate a potentially AA'd span and check if it is non-empty.
- IntRange span = aa_span(fbuf, left, right, clipSpan);
- if (span.len() > 0) {
- // If user clip planes are enabled, use them to bound the current span.
- if (vertex_shader->use_clip_distance()) {
- span = span.intersect(clip_distance_range(left, right));
- if (span.len() <= 0) goto next_span;
- }
- ctx->shaded_rows++;
- ctx->shaded_pixels += span.len();
- // Advance color/depth buffer pointers to the start of the span.
- P* buf = fbuf + span.start;
- // Check if we will need to use depth-buffer or discard on this span.
- DepthRun* depth =
- depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr;
- bool use_discard = fragment_shader->use_discard();
- if (depth) {
- // Perspective may cause the depth value to vary on a per sample basis.
- // Ensure the depth row is flattened to allow testing of individual
- // samples.
- if (!depth->is_flat()) {
- flatten_depth_runs(depth, depthtex.width);
- }
- // Advance to the depth sample at the start of the span.
- depth += span.start;
- }
- if (colortex.delay_clear) {
- // Delayed clear is enabled for the color buffer. Check if needs clear.
- prepare_row<P>(colortex, int(y), span.start, span.end, use_discard,
- depth);
- }
- // Initialize fragment shader interpolants to current span position.
- fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1);
- fragment_shader->gl_FragCoord.y = y;
- {
- // Calculate the fragment Z and W change per change in fragment X step.
- vec2_scalar stepZW =
- (right.zw() - left.zw()) * (1.0f / (right.x() - left.x()));
- // Calculate initial Z and W values for span start.
- vec2_scalar zw = left.zw() + stepZW * (span.start + 0.5f - left.x());
- // Set fragment shader's Z and W values so that it can use them to
- // cancel out the 1/w baked into the interpolants.
- fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
- fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
- fragment_shader->swgl_StepZW = stepZW;
- // Change in interpolants is difference between current right and left
- // edges per the change in right and left X. The left and right
- // interpolant values were previously multiplied by 1/w, so the step and
- // initial span values take this into account.
- Interpolants step =
- (right.interp - left.interp) * (1.0f / (right.x() - left.x()));
- // Advance current interpolants to X at start of span.
- Interpolants o = left.interp + step * (span.start + 0.5f - left.x());
- fragment_shader->init_span<true>(&o, &step);
- }
- clipRect.set_clip_mask(span.start, y, buf);
- if (!use_discard) {
- // No discard is used. Common case.
- draw_span<false, true>(buf, depth, span.len(), packDepth);
- } else {
- // Discard is used. Rare.
- draw_span<true, true>(buf, depth, span.len(), packDepth);
- }
- }
- next_span:
- // Advance Y and edge interpolants to next row.
- y++;
- left.nextRow();
- right.nextRow();
- // Advance buffers to next row.
- fbuf += colortex.stride() / sizeof(P);
- fdepth += depthtex.stride() / sizeof(DepthRun);
- }
-}
-
-// Clip a primitive against both sides of a view-frustum axis, producing
-// intermediate vertexes with interpolated attributes that will no longer
-// intersect the selected axis planes. This assumes the primitive is convex
-// and should produce at most N+2 vertexes for each invocation (only in the
-// worst case where one point falls outside on each of the opposite sides
-// with the rest of the points inside). The supplied AA edge mask will be
-// modified such that it corresponds to the clipped polygon edges.
-template <XYZW AXIS>
-static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
- Interpolants* outInterp, int& outEdgeMask) {
- // Potential mask bits of which side of a plane a coordinate falls on.
- enum SIDE { POSITIVE = 1, NEGATIVE = 2 };
- int numClip = 0;
- int edgeMask = outEdgeMask;
- Point3D prev = p[nump - 1];
- Interpolants prevInterp = interp[nump - 1];
- float prevCoord = prev.select(AXIS);
- // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
- // if so, remember which side it is outside of. In the special case that W is
- // negative and |C| < |W|, both -W <= C and C <= W will be false, such that
- // we must consider the coordinate as falling outside of both plane sides
- // simultaneously. We test each condition separately and combine them to form
- // a mask of which plane sides we exceeded. If we neglect to consider both
- // sides simultaneously, points can erroneously oscillate from one plane side
- // to the other and exceed the supported maximum number of clip outputs.
- int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) |
- (prevCoord > prev.w ? POSITIVE : 0);
- // Loop through points, finding edges that cross the planes by evaluating
- // the side at each point.
- outEdgeMask = 0;
- for (int i = 0; i < nump; i++, edgeMask >>= 1) {
- Point3D cur = p[i];
- Interpolants curInterp = interp[i];
- float curCoord = cur.select(AXIS);
- int curMask =
- (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0);
- // Check if the previous and current end points are on different sides. If
- // the masks of sides intersect, then we consider them to be on the same
- // side. So in the case the masks do not intersect, we then consider them
- // to fall on different sides.
- if (!(curMask & prevMask)) {
- // One of the edge's end points is outside the plane with the other
- // inside the plane. Find the offset where it crosses the plane and
- // adjust the point and interpolants to there.
- if (prevMask) {
- // Edge that was previously outside crosses inside.
- // Evaluate plane equation for previous and current end-point
- // based on previous side and calculate relative offset.
- if (numClip >= nump + 2) {
- // If for some reason we produced more vertexes than we support, just
- // bail out.
- assert(false);
- return 0;
- }
- // The positive plane is assigned the sign 1, and the negative plane is
- // assigned -1. If the point falls outside both planes, that means W is
- // negative. To compensate for this, we must interpolate the coordinate
- // till W=0, at which point we can choose a single plane side for the
- // coordinate to fall on since W will no longer be negative. To compute
- // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and
- // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be
- // the side of the plane we need to consider. Substituting K into the
- // comparison C < 0, we can then avoid the division in K with a
- // cross-multiplication.
- float prevSide =
- (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) ||
- prevCoord * (cur.w - prev.w) <
- prev.w * (curCoord - prevCoord))
- ? -1
- : 1;
- float prevDist = prevCoord - prevSide * prev.w;
- float curDist = curCoord - prevSide * cur.w;
- // It may happen that, after we interpolate by the weight k, due to
- // floating point rounding we've underestimated the value necessary to
- // push it over the clipping boundary. Just in case, nudge the mantissa
- // by a single increment so that we essentially round it up and move it
- // further inside the clipping boundary. We use nextafter to do this in
- // a portable fashion.
- float k = prevDist / (prevDist - curDist);
- Point3D clipped = prev + (cur - prev) * k;
- if (prevSide * clipped.select(AXIS) > clipped.w) {
- k = nextafterf(k, 1.0f);
- clipped = prev + (cur - prev) * k;
- }
- outP[numClip] = clipped;
- outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
- // Don't output the current edge mask since start point was outside.
- numClip++;
- }
- if (curMask) {
- // Edge that was previously inside crosses outside.
- // Evaluate plane equation for previous and current end-point
- // based on current side and calculate relative offset.
- if (numClip >= nump + 2) {
- assert(false);
- return 0;
- }
- // In the case the coordinate falls on both plane sides, the computation
- // here is much the same as for prevSide, but since we are going from a
- // previous W that is positive to current W that is negative, then the
- // sign of cur.w - prev.w will flip in the equation. The resulting sign
- // is negated to compensate for this.
- float curSide =
- (curMask & POSITIVE) && (!(curMask & NEGATIVE) ||
- prevCoord * (cur.w - prev.w) <
- prev.w * (curCoord - prevCoord))
- ? 1
- : -1;
- float prevDist = prevCoord - curSide * prev.w;
- float curDist = curCoord - curSide * cur.w;
- // Calculate the interpolation weight k and then nudge it inside the clipping
- // boundary with nextafter. Note that since we were previously inside
- // and now crossing outside, we have to flip the nudge direction for
- // the weight towards 0 instead of 1.
- float k = prevDist / (prevDist - curDist);
- Point3D clipped = prev + (cur - prev) * k;
- if (curSide * clipped.select(AXIS) > clipped.w) {
- k = nextafterf(k, 0.0f);
- clipped = prev + (cur - prev) * k;
- }
- outP[numClip] = clipped;
- outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
- // Output the current edge mask since the end point is inside.
- outEdgeMask |= (edgeMask & 1) << numClip;
- numClip++;
- }
- }
- if (!curMask) {
- // The current end point is inside the plane, so output point unmodified.
- if (numClip >= nump + 2) {
- assert(false);
- return 0;
- }
- outP[numClip] = cur;
- outInterp[numClip] = curInterp;
- // Output the current edge mask since the end point is inside.
- outEdgeMask |= (edgeMask & 1) << numClip;
- numClip++;
- }
- prev = cur;
- prevInterp = curInterp;
- prevCoord = curCoord;
- prevMask = curMask;
- }
- return numClip;
-}
-
-// Helper function to dispatch to perspective span drawing with points that
-// have already been transformed and clipped.
-static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
- Interpolants* interp_clip,
- Texture& colortex,
- Texture& depthtex) {
- // If the polygon is outside the clip rect, there is nothing to draw.
- ClipRect clipRect(colortex);
- if (!clipRect.overlaps(nump, p_clip)) {
- return;
- }
-
- // Finally draw perspective-correct spans for the polygon.
- if (colortex.internal_format == GL_RGBA8) {
- draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex,
- depthtex, clipRect);
- } else if (colortex.internal_format == GL_R8) {
- draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex,
- depthtex, clipRect);
- } else {
- assert(false);
- }
-}
-
-// Draws a perspective-correct 3D primitive with varying Z value, as opposed
-// to a simple 2D planar primitive with a constant Z value that could be
-// trivially Z rejected. This requires clipping the primitive against the near
-// and far planes to ensure it stays within the valid Z-buffer range. The Z
-// and W of each fragment of the primitives are interpolated across the
-// generated spans and then depth-tested as appropriate.
-// Additionally, vertex attributes must be interpolated with perspective-
-// correction by dividing by W before interpolation, and then later multiplied
-// by W again to produce the final correct attribute value for each fragment.
-// This process is expensive and should be avoided if possible for primitive
-// batches that are known ahead of time to not need perspective-correction.
-static void draw_perspective(int nump, Interpolants interp_outs[4],
- Texture& colortex, Texture& depthtex) {
- // Lines are not supported with perspective.
- assert(nump >= 3);
- // Convert output of vertex shader to screen space.
- vec4 pos = vertex_shader->gl_Position;
- vec3_scalar scale =
- vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f;
- vec3_scalar offset =
- make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) +
- scale;
- // Verify if point is between near and far planes, rejecting NaN.
- if (test_all(pos.z > -pos.w && pos.z < pos.w)) {
- // No points cross the near or far planes, so no clipping required.
- // Just divide coords by W and convert to viewport. We assume the W
- // coordinate is non-zero and the reciprocal is finite since it would
- // otherwise fail the test_all condition above.
- Float w = 1.0f / pos.w;
- vec3 screen = pos.sel(X, Y, Z) * w * scale + offset;
- Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x},
- {screen.x.y, screen.y.y, screen.z.y, w.y},
- {screen.x.z, screen.y.z, screen.z.z, w.z},
- {screen.x.w, screen.y.w, screen.z.w, w.w}};
- draw_perspective_clipped(nump, p, interp_outs, colortex, depthtex);
- } else {
- // Points cross the near or far planes, so we need to clip.
- // Start with the original 3 or 4 points...
- Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x},
- {pos.x.y, pos.y.y, pos.z.y, pos.w.y},
- {pos.x.z, pos.y.z, pos.z.z, pos.w.z},
- {pos.x.w, pos.y.w, pos.z.w, pos.w.w}};
- // Clipping can expand the points by 1 for each of 6 view frustum planes.
- Point3D p_clip[4 + 6];
- Interpolants interp_clip[4 + 6];
- // Clip against near and far Z planes.
- nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip,
- swgl_AAEdgeMask);
- // If no points are left inside the view frustum, there's nothing to draw.
- if (nump < 3) {
- return;
- }
- // After clipping against only the near and far planes, we might still
- // produce points where W = 0, exactly at the camera plane. OpenGL specifies
- // that for clip coordinates, points must satisfy:
- // -W <= X <= W
- // -W <= Y <= W
- // -W <= Z <= W
- // When Z = W = 0, this is trivially satisfied, but when we transform and
- // divide by W below it will produce a divide by 0. Usually we want to only
- // clip Z to avoid the extra work of clipping X and Y. We can still project
- // points that fall outside the view frustum X and Y so long as Z is valid.
- // The span drawing code will then ensure X and Y are clamped to viewport
- // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y
- // will push W further inside the view frustum so that it is no longer 0,
- // allowing us to finally proceed to projecting the points to the screen.
- for (int i = 0; i < nump; i++) {
- // Found an invalid W, so need to clip against X and Y...
- if (p_clip[i].w <= 0.0f) {
- // Ping-pong p_clip -> p_tmp -> p_clip.
- Point3D p_tmp[4 + 6];
- Interpolants interp_tmp[4 + 6];
- nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp,
- swgl_AAEdgeMask);
- if (nump < 3) return;
- nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip,
- swgl_AAEdgeMask);
- if (nump < 3) return;
- // After clipping against X and Y planes, there's still points left
- // to draw, so proceed to trying projection now...
- break;
- }
- }
- // Divide coords by W and convert to viewport.
- for (int i = 0; i < nump; i++) {
- float w = 1.0f / p_clip[i].w;
- // If the W coord is essentially zero, small enough that division would
- // result in Inf/NaN, then just set the reciprocal itself to zero so that
- // the coordinates become zeroed out, as the only valid point that
- // satisfies -W <= X/Y/Z <= W is all zeroes.
- if (!isfinite(w)) w = 0.0f;
- p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w);
- }
- draw_perspective_clipped(nump, p_clip, interp_clip, colortex, depthtex);
- }
-}
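-
-// A standalone sketch of the perspective-correct interpolation described
-// above: an attribute is pre-multiplied by 1/w at the endpoints, both the
-// scaled attribute and 1/w are interpolated linearly in screen space, and the
-// fragment divides them back out; the naive linear result is shown for
-// contrast.
-#include <cstdio>
-
-int main() {
-  // Two span endpoints with attribute a and clip-space w.
-  float a0 = 0.0f, w0 = 1.0f;
-  float a1 = 1.0f, w1 = 4.0f;
-  float t = 0.5f;  // halfway across the span in screen space
-  float inv_w0 = 1.0f / w0, inv_w1 = 1.0f / w1;
-  float num = (1 - t) * (a0 * inv_w0) + t * (a1 * inv_w1);  // interpolated a/w
-  float den = (1 - t) * inv_w0 + t * inv_w1;                // interpolated 1/w
-  std::printf("naive lerp = %.2f, perspective-correct = %.2f\n",
-              (1 - t) * a0 + t * a1, num / den);  // 0.50 vs 0.20
-  return 0;
-}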
-
-static void draw_quad(int nump, Texture& colortex, Texture& depthtex) {
- // Run vertex shader once for the primitive's vertices.
- // One set of interpolants is needed per primitive vertex; clipping against
- // the near and far planes in the perspective case uses its own larger buffers.
- Interpolants interp_outs[4];
- swgl_ClipFlags = 0;
- vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants));
- vec4 pos = vertex_shader->gl_Position;
- // Check if any vertex W is different from another. If so, use perspective.
- if (test_any(pos.w != pos.w.x)) {
- draw_perspective(nump, interp_outs, colortex, depthtex);
- return;
- }
-
- // Convert output of vertex shader to screen space.
- // Divide coords by W and convert to viewport.
- float w = 1.0f / pos.w.x;
- // If the W coord is essentially zero, small enough that division would
- // result in Inf/NaN, then just set the reciprocal itself to zero so that
- // the coordinates become zeroed out, as the only valid point that
- // satisfies -W <= X/Y/Z <= W is all zeroes.
- if (!isfinite(w)) w = 0.0f;
- vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f *
- vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
- make_vec2(ctx->viewport.origin() - colortex.offset);
- Point2D p[4] = {{screen.x.x, screen.y.x},
- {screen.x.y, screen.y.y},
- {screen.x.z, screen.y.z},
- {screen.x.w, screen.y.w}};
-
- // If the quad is outside the clip rect, there is nothing to draw.
- ClipRect clipRect(colortex);
- if (!clipRect.overlaps(nump, p)) {
- return;
- }
-
- // Since the quad is assumed 2D, Z is constant across the quad.
- float screenZ = (pos.z.x * w + 1) * 0.5f;
- if (screenZ < 0 || screenZ > 1) {
- // Z values would cross the near or far plane, so just bail.
- return;
- }
- // Since Z doesn't need to be interpolated, just set the fragment shader's
- // Z and W values here, once and for all fragment shader invocations.
- uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ);
- fragment_shader->gl_FragCoord.z = screenZ;
- fragment_shader->gl_FragCoord.w = w;
-
- // If supplied a line, adjust it so that it is a quad at least 1 pixel thick.
- // Assume that, for a line, all 4 SIMD lanes were actually filled with
- // vertexes 0, 1, 1, 0.
- if (nump == 2) {
- // Nudge Y height to span at least 1 pixel by advancing to next pixel
- // boundary so that we step at least 1 row when drawing spans.
- if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) {
- p[2].y = 1 + int(p[1].y + 0.5f);
- p[3].y = p[2].y;
- // Nudge X width to span at least 1 pixel so that rounded coords fall on
- // separate pixels.
- if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) {
- p[1].x += 1.0f;
- p[2].x += 1.0f;
- }
- } else {
- // If the line already spans at least 1 row, then assume line is vertical
- // or diagonal and just needs to be dilated horizontally.
- p[2].x += 1.0f;
- p[3].x += 1.0f;
- }
- // Pretend that it's a quad now...
- nump = 4;
- }
-
- // Finally draw 2D spans for the quad. Currently only supports drawing to
- // RGBA8 and R8 color buffers.
- if (colortex.internal_format == GL_RGBA8) {
- draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, depthtex,
- clipRect);
- } else if (colortex.internal_format == GL_R8) {
- draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, depthtex,
- clipRect);
- } else {
- assert(false);
- }
-}
-
-template <typename INDEX>
-static inline void draw_elements(GLsizei count, GLsizei instancecount,
- size_t offset, VertexArray& v,
- Texture& colortex, Texture& depthtex) {
- Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding];
- if (!indices_buf.buf || offset >= indices_buf.size) {
- return;
- }
- assert((offset & (sizeof(INDEX) - 1)) == 0);
- INDEX* indices = (INDEX*)(indices_buf.buf + offset);
- count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
- // Triangles must be indexed at offsets 0, 1, 2.
- // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
- if (count == 6 && indices[1] == indices[0] + 1 &&
- indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
- assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
- // Fast path - since there is only a single quad, we only load per-vertex
- // attribs once for all instances, as they won't change across instances
- // or within an instance.
- vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
- draw_quad(4, colortex, depthtex);
- for (GLsizei instance = 1; instance < instancecount; instance++) {
- vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
- draw_quad(4, colortex, depthtex);
- }
- } else {
- for (GLsizei instance = 0; instance < instancecount; instance++) {
- for (GLsizei i = 0; i + 3 <= count; i += 3) {
- if (indices[i + 1] != indices[i] + 1 ||
- indices[i + 2] != indices[i] + 2) {
- continue;
- }
- if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
- assert(indices[i + 3] == indices[i] + 2 &&
- indices[i + 4] == indices[i] + 1);
- vertex_shader->load_attribs(v.attribs, indices[i], instance, 4);
- draw_quad(4, colortex, depthtex);
- i += 3;
- } else {
- vertex_shader->load_attribs(v.attribs, indices[i], instance, 3);
- draw_quad(3, colortex, depthtex);
- }
- }
- }
- }
-}
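-
-// A quick standalone check of the index pattern the quad fast path above
-// recognizes: two triangles sharing an edge, emitted as base + {0,1,2, 2,1,3}.
-#include <cstdio>
-
-static bool is_quad_pattern(const unsigned idx[6]) {
-  return idx[1] == idx[0] + 1 && idx[2] == idx[0] + 2 &&
-         idx[3] == idx[0] + 2 && idx[4] == idx[0] + 1 && idx[5] == idx[0] + 3;
-}
-
-int main() {
-  unsigned quad[6] = {8, 9, 10, 10, 9, 11};
-  unsigned tris[6] = {0, 1, 2, 3, 4, 5};
-  std::printf("%d %d\n", is_quad_pattern(quad), is_quad_pattern(tris));  // 1 0
-  return 0;
-}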
diff --git a/third_party/webrender/swgl/src/swgl_ext.h b/third_party/webrender/swgl/src/swgl_ext.h
deleted file mode 100644
index 52d240e0818..00000000000
--- a/third_party/webrender/swgl/src/swgl_ext.h
+++ /dev/null
@@ -1,1826 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-// When using a solid color with clip masking, the cost of loading the clip mask
-// in the blend stage exceeds the cost of processing the color. Here we handle
-// the entire span of the clip mask texture before the blend stage to more
-// efficiently process it and modulate it with color without incurring blend
-// stage overheads.
-template <typename P, typename C>
-static void commit_masked_solid_span(P* buf, C color, int len) {
- override_clip_mask();
- uint8_t* mask = get_clip_mask(buf);
- for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) {
- commit_span(
- buf,
- blend_span(
- buf,
- applyColor(expand_mask(buf, unpack(unaligned_load<PackedR8>(mask))),
- color)));
- }
- restore_clip_mask();
-}
-
-// When using a solid color with anti-aliasing, most of the solid span will not
-// benefit from anti-aliasing in the opaque region. We only want to apply the AA
-// blend stage in the non-opaque start and end of the span where AA is needed.
-template <typename P, typename R>
-static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) {
- if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) {
- commit_solid_span<true>(buf, r, start);
- buf += start;
- len -= start;
- }
- if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) {
- override_aa();
- commit_solid_span<true>(buf, r, opaque);
- restore_aa();
- buf += opaque;
- len -= opaque;
- }
- if (len > 0) {
- commit_solid_span<true>(buf, r, len);
- }
-}
-
-// Forces a value with vector run-class to have scalar run-class.
-template <typename T>
-static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) {
- return force_scalar(v);
-}
-
-// Advance all varying interpolants by a single chunk

-#define swgl_stepInterp() step_interp_inputs()
-
-// Pseudo-intrinsic that accesses the interpolation step for a given varying
-#define swgl_interpStep(v) (interp_step.v)
-
-// Commit an entire span of a solid color. This dispatches to clip-masked and
-// anti-aliased fast-paths as appropriate.
-#define swgl_commitSolid(format, v, n) \
- do { \
- int len = (n); \
- if (blend_key) { \
- if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \
- commit_masked_solid_span(swgl_Out##format, \
- packColor(swgl_Out##format, (v)), len); \
- } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \
- commit_aa_solid_span(swgl_Out##format, \
- pack_span(swgl_Out##format, (v)), len); \
- } else { \
- commit_solid_span<true>(swgl_Out##format, \
- pack_span(swgl_Out##format, (v)), len); \
- } \
- } else { \
- commit_solid_span<false>(swgl_Out##format, \
- pack_span(swgl_Out##format, (v)), len); \
- } \
- swgl_Out##format += len; \
- swgl_SpanLength -= len; \
- } while (0)
-#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength)
-#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength)
-#define swgl_commitPartialSolidRGBA8(len, v) \
- swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength))
-#define swgl_commitPartialSolidR8(len, v) \
- swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength))
-
-#define swgl_commitChunk(format, chunk) \
- do { \
- auto r = chunk; \
- if (blend_key) r = blend_span(swgl_Out##format, r); \
- commit_span(swgl_Out##format, r); \
- swgl_Out##format += swgl_StepSize; \
- swgl_SpanLength -= swgl_StepSize; \
- } while (0)
-
-// Commit a single chunk of a color
-#define swgl_commitColor(format, color) \
- swgl_commitChunk(format, pack_pixels_##format(color))
-#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color)
-#define swgl_commitColorR8(color) swgl_commitColor(R8, color)
-
-template <typename S>
-static ALWAYS_INLINE bool swgl_isTextureLinear(S s) {
- return s->filter == TextureFilter::LINEAR;
-}
-
-template <typename S>
-static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) {
- return s->format == TextureFormat::RGBA8;
-}
-
-template <typename S>
-static ALWAYS_INLINE bool swgl_isTextureR8(S s) {
- return s->format == TextureFormat::R8;
-}
-
-// Use the default linear quantization scale of 128. This gives 7 bits of
-// fractional precision, which when multiplied with a signed 9 bit value
-// still fits in a 16 bit integer.
-const int swgl_LinearQuantizeScale = 128;
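For illustration, this is the 9.7 fixed-point split that the quantized UVs use throughout the linear filters below; the names here are hypothetical, not SWGL APIs:

#include <cstdio>

int main() {
  const int kScale = 128;       // 7 fractional bits, as in swgl_LinearQuantizeScale
  float uv = 5.75f;             // hypothetical texel-space coordinate
  int q = int(uv * kScale);     // 736
  int texel = q >> 7;           // 5: integer texel index
  int frac = q & (kScale - 1);  // 96: fraction of a texel in 0..127 units
  // A signed 9-bit color difference (-255..255) times a 7-bit fraction (0..127)
  // is at most +/-32385, which still fits in an int16_t, so the filters can do
  // the per-channel multiplies in 16-bit SIMD lanes.
  printf("texel=%d frac=%d\n", texel, frac);
  return 0;
}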
-
-// Quantizes UVs for access into a linear texture.
-template <typename S, typename T>
-static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) {
- return linearQuantize(p, swgl_LinearQuantizeScale, s);
-}
-
-// Quantizes an interpolation step for UVs for access into a linear texture.
-template <typename S, typename T>
-static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) {
- return samplerScale(s, p) * swgl_LinearQuantizeScale;
-}
-
-template <typename S>
-static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf,
- S sampler, ivec2 i) {
- return textureLinearUnpackedRGBA8(sampler, i);
-}
-
-template <typename S>
-static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf,
- S sampler, ivec2 i) {
- return textureLinearUnpackedR8(sampler, i);
-}
-
-template <typename S>
-static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) {
- return swgl_isTextureRGBA8(s);
-}
-
-template <typename S>
-static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) {
- return swgl_isTextureR8(s);
-}
-
-// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets
-// for linear sampling.
-#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \
- uv = swgl_linearQuantize(sampler, uv); \
- vec2_scalar uv_step = \
- float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \
- vec2_scalar min_uv = max( \
- swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \
- vec2_scalar max_uv = \
- max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \
- min_uv);
-
-// Implements the fallback linear filter that can deal with clamping and
-// arbitrary scales.
-template <bool BLEND, typename S, typename C, typename P>
-static P* blendTextureLinearFallback(S sampler, vec2 uv, int span,
- vec2_scalar uv_step, vec2_scalar min_uv,
- vec2_scalar max_uv, C color, P* buf) {
- for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
- commit_blend_span<BLEND>(
- buf, applyColor(textureLinearUnpacked(buf, sampler,
- ivec2(clamp(uv, min_uv, max_uv))),
- color));
- }
- return buf;
-}
-
-static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) {
- return bit_cast<U64>(r);
-}
-static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) {
- return bit_cast<U16>(r);
-}
-
-static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) {
- return r * fracx.xxxxyyyyzzzzwwww;
-}
-static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) {
- return r * fracx;
-}
-
-// Implements a faster linear filter that works with axis-aligned constant Y but
-// scales less than 1, i.e. upscaling. In this case we can optimize for the
-// constant Y fraction as well as load all chunks from memory in a single tap
-// for each row.
-template <bool BLEND, typename S, typename C, typename P>
-static void blendTextureLinearUpscale(S sampler, vec2 uv, int span,
- vec2_scalar uv_step, vec2_scalar min_uv,
- vec2_scalar max_uv, C color, P* buf) {
- typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
- typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
- typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
-
- ivec2 i(clamp(uv, min_uv, max_uv));
- ivec2 frac = i;
- i >>= 7;
- P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x));
- P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x));
- I16 fracx = computeFracX(sampler, i, frac);
- int16_t fracy = computeFracY(frac).x;
- auto src0 =
- CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type);
- auto src1 =
- CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type);
- auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
-
-  // We attempt to sample ahead by one chunk and interpolate it with the current
-  // one. However, due to the complication of upscaling, we may not necessarily
-  // shift in the entire next set of samples.
- for (P* end = buf + span; buf < end; buf += 4) {
- uv.x += uv_step.x;
- I32 ixn = cast(uv.x);
- I16 fracn = computeFracNoClamp(ixn);
- ixn >>= 7;
- auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]),
- signed_unpacked_type);
- auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]),
- signed_unpacked_type);
- auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
-
- // Since we're upscaling, we know that a source pixel has a larger footprint
- // than the destination pixel, and thus all the source pixels needed for
- // this chunk will fall within a single chunk of texture data. However,
- // since the source pixels don't map 1:1 with destination pixels, we need to
- // shift the source pixels over based on their offset from the start of the
-    // chunk. This could conceivably be optimized better with PSHUFB or VTBL
-    // instructions. However, since PSHUFB requires SSSE3, we instead resort
-    // to masking in the correct pixels to avoid having to index into memory.
- // For the last sample to interpolate with, we need to potentially shift in
- // a sample from the next chunk over in the case the samples fill out an
- // entire chunk.
- auto shuf = src;
- auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4);
- if (i.x.y == i.x.x) {
- shuf = shuf.xxyz;
- shufn = shufn.xxyz;
- }
- if (i.x.z == i.x.y) {
- shuf = shuf.xyyz;
- shufn = shufn.xyyz;
- }
- if (i.x.w == i.x.z) {
- shuf = shuf.xyzz;
- shufn = shufn.xyzz;
- }
-
- // Convert back to a signed unpacked type so that we can interpolate the
- // final result.
- auto interp = bit_cast<signed_unpacked_type>(shuf);
- auto interpn = bit_cast<signed_unpacked_type>(shufn);
- interp += applyFracX(interpn - interp, fracx) >> 7;
-
- commit_blend_span<BLEND>(
- buf, applyColor(bit_cast<unpacked_type>(interp), color));
-
- i.x = ixn;
- fracx = fracn;
- src = srcn;
- }
-}
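For reference, a scalar sketch of the 7-bit fixed-point blend that the vectorized filters above and below perform per channel; the real code blends the two rows with the Y fraction first and then blends horizontally, which is equivalent up to rounding (standalone, hypothetical helper name):

#include <cstdio>

// Blend four neighboring 8-bit texels with 7-bit fractions fracx, fracy
// (0..127), mirroring the src0 + (((src1 - src0) * frac) >> 7) pattern above.
static int bilerp7(int t00, int t10, int t01, int t11, int fracx, int fracy) {
  int row0 = t00 + (((t10 - t00) * fracx) >> 7);
  int row1 = t01 + (((t11 - t01) * fracx) >> 7);
  return row0 + (((row1 - row0) * fracy) >> 7);
}

int main() {
  // Hypothetical samples: halfway between black and white in both directions.
  printf("%d\n", bilerp7(0, 255, 0, 255, 64, 64));  // prints 127
  return 0;
}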
-
-// This is the fastest variant of the linear filter that still provides
-// filtering. In cases where there is no scaling required, but we have a
-// subpixel offset that forces us to blend in neighboring pixels, we can
-// optimize away most of the memory loads and shuffling that is required by the
-// fallback filter.
-template <bool BLEND, typename S, typename C, typename P>
-static void blendTextureLinearFast(S sampler, vec2 uv, int span,
- vec2_scalar min_uv, vec2_scalar max_uv,
- C color, P* buf) {
- typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
- typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
- typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
-
- ivec2 i(clamp(uv, min_uv, max_uv));
- ivec2 frac = i;
- i >>= 7;
- P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
- P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
- int16_t fracx = computeFracX(sampler, i, frac).x;
- int16_t fracy = computeFracY(frac).x;
- auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
- auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
- auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
-
- // Since there is no scaling, we sample ahead by one chunk and interpolate it
- // with the current one. We can then reuse this value on the next iteration.
- for (P* end = buf + span; buf < end; buf += 4) {
- row0 += 4;
- row1 += 4;
- auto src0n =
- CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
- auto src1n =
- CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
- auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
-
- // For the last sample to interpolate with, we need to potentially shift in
- // a sample from the next chunk over since the samples fill out an entire
- // chunk.
- auto interp = bit_cast<signed_unpacked_type>(src);
- auto interpn =
- bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4));
- interp += ((interpn - interp) * fracx) >> 7;
-
- commit_blend_span<BLEND>(
- buf, applyColor(bit_cast<unpacked_type>(interp), color));
-
- src = srcn;
- }
-}
-
-// Implements a faster linear filter that works with axis-aligned constant Y
-// while downscaling the texture by half. In this case we can optimize for the
-// constant X/Y fractions and reduction factor while minimizing shuffling.
-template <bool BLEND, typename S, typename C, typename P>
-static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span,
- vec2_scalar min_uv,
- vec2_scalar max_uv, C color,
- P* buf) {
- typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
- typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
- typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type;
-
- ivec2 i(clamp(uv, min_uv, max_uv));
- ivec2 frac = i;
- i >>= 7;
- P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i));
- P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i));
- int16_t fracx = computeFracX(sampler, i, frac).x;
- int16_t fracy = computeFracY(frac).x;
-
- for (P* end = buf + span; buf < end; buf += 4) {
- auto src0 =
- CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
- auto src1 =
- CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
- auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7));
- row0 += 4;
- row1 += 4;
- auto src0n =
- CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type);
- auto src1n =
- CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type);
- auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7));
- row0 += 4;
- row1 += 4;
-
- auto interp =
- bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6));
- auto interpn =
- bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7));
- interp += ((interpn - interp) * fracx) >> 7;
-
- commit_blend_span<BLEND>(
- buf, applyColor(bit_cast<unpacked_type>(interp), color));
- }
-}
-
-enum LinearFilter {
- // No linear filter is needed.
- LINEAR_FILTER_NEAREST = 0,
- // The most general linear filter that handles clamping and varying scales.
- LINEAR_FILTER_FALLBACK,
- // A linear filter optimized for axis-aligned upscaling.
- LINEAR_FILTER_UPSCALE,
- // A linear filter with no scaling but with subpixel offset.
- LINEAR_FILTER_FAST,
- // A linear filter optimized for 2x axis-aligned downscaling.
- LINEAR_FILTER_DOWNSCALE
-};
-
-// Dispatches to an appropriate linear filter depending on the selected filter.
-template <bool BLEND, typename S, typename C, typename P>
-static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span,
- vec2_scalar uv_step, vec2_scalar min_uv,
- vec2_scalar max_uv, C color, P* buf,
- LinearFilter filter) {
- P* end = buf + span;
- if (filter != LINEAR_FILTER_FALLBACK) {
- // If we're not using the fallback, then Y is constant across the entire
- // row. We just need to ensure that we handle any samples that might pull
- // data from before the start of the row and require clamping.
- float beforeDist = max(0.0f, min_uv.x) - uv.x.x;
- if (beforeDist > 0) {
- int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0,
- int(end - buf));
- buf = blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step,
- min_uv, max_uv, color, buf);
- uv.x += (before / swgl_StepSize) * uv_step.x;
- }
- // We need to check how many samples we can take from inside the row without
- // requiring clamping. In case the filter oversamples the row by a step, we
- // subtract off a step from the width to leave some room.
- float insideDist =
- min(max_uv.x, float((int(sampler->width) - swgl_StepSize) *
- swgl_LinearQuantizeScale)) -
- uv.x.x;
- if (uv_step.x > 0.0f && insideDist >= uv_step.x) {
- int inside = int(end - buf);
- if (filter == LINEAR_FILTER_DOWNSCALE) {
- inside = clamp(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) &
- ~(swgl_StepSize - 1),
- 0, inside);
- blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv, max_uv,
- color, buf);
- } else if (filter == LINEAR_FILTER_UPSCALE) {
- inside = clamp(int(insideDist / uv_step.x) * swgl_StepSize, 0, inside);
- blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv,
- max_uv, color, buf);
- } else {
- inside = clamp(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) &
- ~(swgl_StepSize - 1),
- 0, inside);
- blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv,
- color, buf);
- }
- buf += inside;
- uv.x += (inside / swgl_StepSize) * uv_step.x;
- }
- }
- // If the fallback filter was requested, or if there are any samples left that
-  // may be outside the row and require clamping, then handle them here.
- if (buf < end) {
- buf = blendTextureLinearFallback<BLEND>(
- sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf);
- }
- return buf;
-}
-
-// Helper function to quantize UVs for linear filtering before dispatch
-template <bool BLEND, typename S, typename C, typename P>
-static inline int blendTextureLinear(S sampler, vec2 uv, int span,
- const vec4_scalar& uv_rect, C color,
- P* buf, LinearFilter filter) {
- if (!matchTextureFormat(sampler, buf)) {
- return 0;
- }
- LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
- blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv, max_uv,
- color, buf, filter);
- return span;
-}
-
-// Samples an axis-aligned span on a single row of a texture using 1:1
-// nearest filtering. Sampling is constrained to only fall within the given UV
-// bounds. This requires a pointer to the destination buffer. An optional color
-// modulus can be supplied.
-template <bool BLEND, typename S, typename C, typename P>
-static int blendTextureNearestFast(S sampler, vec2 uv, int span,
- const vec4_scalar& uv_rect, C color,
- P* buf) {
- if (!matchTextureFormat(sampler, buf)) {
- return 0;
- }
-
- typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
-
- ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv)));
- ivec2_scalar minUV =
- make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}));
- ivec2_scalar maxUV =
- make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}));
-
- // Calculate the row pointer within the buffer, clamping to within valid row
- // bounds.
- P* row =
- &((P*)sampler
- ->buf)[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) *
- sampler->stride];
- // Find clamped X bounds within the row.
- int minX = clamp(minUV.x, 0, sampler->width - 1);
- int maxX = clamp(maxUV.x, minX, sampler->width - 1);
- int curX = i.x;
- int endX = i.x + span;
- // If we need to start sampling below the valid sample bounds, then we need to
- // fill this section with a constant clamped sample.
- if (curX < minX) {
- int n = min(minX, endX) - curX;
- auto src =
- applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color);
- commit_solid_span<BLEND>(buf, src, n);
- buf += n;
- curX += n;
- }
- // Here we only deal with valid samples within the sample bounds. No clamping
- // should occur here within these inner loops.
- int n = max(min(maxX + 1, endX) - curX, 0);
- // Try to process as many chunks as possible with full loads and stores.
- for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) {
- auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color);
- commit_blend_span<BLEND>(buf, src);
- }
- n &= 3;
- // If we have any leftover samples after processing chunks, use partial loads
- // and stores.
- if (n > 0) {
- auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color);
- commit_blend_span<BLEND>(buf, src, n);
- buf += n;
- curX += n;
- }
- // If we still have samples left above the valid sample bounds, then we again
- // need to fill this section with a constant clamped sample.
- if (curX < endX) {
- auto src =
- applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color);
- commit_solid_span<BLEND>(buf, src, endX - curX);
- }
- return span;
-}
-
-// We need to verify that the pixel step reasonably approximates stepping by a
-// single texel for every pixel we need to reproduce. Try to ensure that the
-// margin of error is no more than approximately 2^-7. Also, we check here if
-// the scaling can be quantized for acceleration.
-template <typename T>
-static ALWAYS_INLINE int spanNeedsScale(int span, T P) {
- span &= ~(128 - 1);
- span += 128;
- int scaled = round((P.x.y - P.x.x) * span);
- return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0;
-}
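A standalone scalar restating of that check (hypothetical helper, for illustration only): the span is padded up to the next multiple of 128 so that a per-pixel step error below roughly 2^-7 still rounds back to the unscaled span, while an exact 2x step is reported separately for the downscale path.

#include <cmath>
#include <cstdio>

// Returns 0 if the step is ~1 texel per pixel, 2 if ~2 texels per pixel
// (half-size downscale), and 1 for anything else needing the fallback.
static int spanNeedsScaleScalar(int span, float stepX) {
  span &= ~(128 - 1);
  span += 128;
  int scaled = int(roundf(stepX * span));
  return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0;
}

int main() {
  printf("%d %d %d\n",
         spanNeedsScaleScalar(200, 1.001f),  // 0: close enough to 1:1
         spanNeedsScaleScalar(200, 2.0f),    // 2: 2x downscale fast path
         spanNeedsScaleScalar(200, 1.37f));  // 1: general fallback
  return 0;
}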
-
-// Helper function to decide whether we can safely apply 1:1 nearest filtering
-// without diverging too much from the linear filter.
-template <typename S, typename T>
-static inline LinearFilter needsTextureLinear(S sampler, T P, int span) {
- // First verify if the row Y doesn't change across samples
- if (P.y.x != P.y.y) {
- return LINEAR_FILTER_FALLBACK;
- }
- P = samplerScale(sampler, P);
- if (int scale = spanNeedsScale(span, P)) {
- // If the source region is not flipped and smaller than the destination,
- // then we can use the upscaling filter since row Y is constant.
- return P.x.x < P.x.y && P.x.y - P.x.x <= 1
- ? LINEAR_FILTER_UPSCALE
- : (scale == 2 ? LINEAR_FILTER_DOWNSCALE
- : LINEAR_FILTER_FALLBACK);
- }
-  // Also verify that we're reasonably close to the center of a texel
-  // so that the result doesn't look much different than if a linear filter
-  // had been used.
- if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 ||
- (int(P.y.x * 4.0f + 0.5f) & 3) != 2) {
- // The source and destination regions are the same, but there is a
- // significant subpixel offset. We can use a faster linear filter to deal
- // with the offset in this case.
- return LINEAR_FILTER_FAST;
- }
- // Otherwise, we have a constant 1:1 step and we're stepping reasonably close
- // to the center of each pixel, so it's safe to disable the linear filter and
- // use nearest.
- return LINEAR_FILTER_NEAREST;
-}
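The texel-center test above can be read as: quantize the coordinate to quarter texels and require it to land in the second quarter, i.e. the fractional part is within 1/8 of 0.5. A standalone sketch (hypothetical helper name):

#include <cstdio>

// True if x is close enough to a texel center (fraction near 0.5) that
// nearest sampling is visually indistinguishable from linear at 1:1 scale.
static bool nearTexelCenter(float x) {
  return (int(x * 4.0f + 0.5f) & 3) == 2;
}

int main() {
  printf("%d %d %d\n",
         nearTexelCenter(7.5f),   // 1: exactly on the center
         nearTexelCenter(7.55f),  // 1: within an eighth of the center
         nearTexelCenter(7.8f));  // 0: offset enough to need filtering
  return 0;
}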
-
-// Commit an entire span with linear filtering
-#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \
- do { \
- auto packed_color = packColor(swgl_Out##format, color); \
- int len = (n); \
- int drawn = 0; \
- if (LinearFilter filter = needsTextureLinear(s, p, len)) { \
- if (blend_key) { \
- drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color, \
- swgl_Out##format, filter); \
- } else { \
- drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color, \
- swgl_Out##format, filter); \
- } \
- } else if (blend_key) { \
- drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color, \
- swgl_Out##format); \
- } else { \
- drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \
- swgl_Out##format); \
- } \
- swgl_Out##format += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \
- swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength)
-#define swgl_commitTextureLinearR8(s, p, uv_rect) \
- swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength)
-
-// Commit a partial span with linear filtering, optionally inverting the color
-#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \
- swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \
- min(int(len), swgl_SpanLength))
-#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \
- swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \
- min(int(len), swgl_SpanLength))
-
-// Commit an entire span with linear filtering that is scaled by a color
-#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \
- swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength)
-#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \
- swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength)
-
-// Helper function that samples from an R8 texture while expanding it to support
-// a differing framebuffer format.
-template <bool BLEND, typename S, typename C, typename P>
-static inline int blendTextureLinearR8(S sampler, vec2 uv, int span,
- const vec4_scalar& uv_rect, C color,
- P* buf) {
- if (!swgl_isTextureR8(sampler)) {
- return 0;
- }
- LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv);
- for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
- commit_blend_span<BLEND>(
- buf, applyColor(expand_mask(buf, textureLinearUnpackedR8(
- sampler,
- ivec2(clamp(uv, min_uv, max_uv)))),
- color));
- }
- return span;
-}
-
-// Commit an entire span with linear filtering while expanding from R8 to RGBA8
-#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \
- do { \
- auto packed_color = packColor(swgl_OutRGBA8, color); \
- int drawn = 0; \
- if (blend_key) { \
- drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect, \
- packed_color, swgl_OutRGBA8); \
- } else { \
- drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \
- packed_color, swgl_OutRGBA8); \
- } \
- swgl_OutRGBA8 += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \
- swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor())
-
-// Compute repeating UVs, possibly constrained by tile repeat limits
-static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) {
- if (tile_repeat.x > 0.0f) {
- // Clamp to a number slightly less than the tile repeat limit so that
- // it results in a number close to but not equal to 1 after fract().
-    // This avoids fract() yielding 0 if the limit was left as a whole integer.
- uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f);
- }
- return fract(uv);
-}
-
-// Compute the number of non-repeating steps before we need to potentially
-// repeat the UVs.
-static inline int computeNoRepeatSteps(Float uv, float uv_step,
- float tile_repeat, int steps) {
- if (uv.w < uv.x) {
- // Ensure the UV taps are ordered low to high.
- uv = uv.wzyx;
- }
- // Check if the samples cross the boundary of the next whole integer or the
- // tile repeat limit, whichever is lower.
- float limit = floor(uv.x) + 1.0f;
- if (tile_repeat > 0.0f) {
- limit = min(limit, tile_repeat);
- }
- return uv.x >= 0.0f && uv.w < limit
- ? (uv_step != 0.0f
- ? int(min(float(steps), (limit - uv.x) / uv_step))
- : steps)
- : 0;
-}
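A scalar sketch of the same idea (hypothetical names, for illustration): given the low and high UV taps of a chunk and the per-chunk step, count how many chunks fit before the UV crosses the next whole integer or the tile repeat limit, so the repeat math can be skipped for that sub-span.

#include <algorithm>
#include <cmath>
#include <cstdio>

static int noRepeatStepsScalar(float uvLo, float uvHi, float uvStep,
                               float tileRepeat, int steps) {
  if (uvHi < uvLo) std::swap(uvLo, uvHi);  // order the taps low to high
  float limit = floorf(uvLo) + 1.0f;       // next whole-integer boundary
  if (tileRepeat > 0.0f) limit = std::min(limit, tileRepeat);
  if (uvLo < 0.0f || uvHi >= limit) return 0;
  return uvStep != 0.0f
             ? int(std::min(float(steps), (limit - uvLo) / uvStep))
             : steps;
}

int main() {
  // Starting at u = 0.25 with taps spanning to 0.3 and stepping 0.125 per
  // chunk, 6 chunks fit before u would cross 1.0 and have to wrap.
  printf("%d\n", noRepeatStepsScalar(0.25f, 0.3f, 0.125f, 0.0f, 32));
  return 0;
}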
-
-// Blends an entire span of texture with linear filtering and repeating UVs.
-template <bool BLEND, typename S, typename C, typename P>
-static int blendTextureLinearRepeat(S sampler, vec2 uv, int span,
- const vec2_scalar& tile_repeat,
- const vec4_scalar& uv_repeat,
- const vec4_scalar& uv_rect, C color,
- P* buf) {
- if (!matchTextureFormat(sampler, buf)) {
- return 0;
- }
- vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y};
- vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y};
- // Choose a linear filter to use for no-repeat sub-spans
- LinearFilter filter =
- needsTextureLinear(sampler, uv * uv_scale + uv_offset, span);
- // We need to step UVs unscaled and unquantized so that we can modulo them
- // with fract. We use uv_scale and uv_offset to map them into the correct
- // range.
- vec2_scalar uv_step =
- float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
- uv_scale = swgl_linearQuantizeStep(sampler, uv_scale);
- uv_offset = swgl_linearQuantize(sampler, uv_offset);
- vec2_scalar min_uv = max(
- swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f);
- vec2_scalar max_uv = max(
- swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv);
- for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
- int steps = int(end - buf) / swgl_StepSize;
- // Find the sub-span before UVs repeat to avoid expensive repeat math
- steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
- if (steps > 0) {
- steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
- if (steps > 0) {
- buf = blendTextureLinearDispatch<BLEND>(
- sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize,
- uv_step * uv_scale, min_uv, max_uv, color, buf, filter);
- if (buf >= end) {
- break;
- }
- uv += steps * uv_step;
- }
- }
- // UVs might repeat within this step, so explicitly compute repeated UVs
- vec2 repeated_uv = clamp(
- tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv);
- commit_blend_span<BLEND>(
- buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)),
- color));
- }
- return span;
-}
-
-// Commit an entire span with linear filtering and repeating UVs
-#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \
- uv_rect, color) \
- do { \
- auto packed_color = packColor(swgl_Out##format, color); \
- int drawn = 0; \
- if (blend_key) { \
- drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength, \
- tile_repeat, uv_repeat, uv_rect, \
- packed_color, swgl_Out##format); \
- } else { \
- drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength, \
- tile_repeat, uv_repeat, uv_rect, \
- packed_color, swgl_Out##format); \
- } \
- swgl_Out##format += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \
- uv_rect) \
- swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
- NoColor())
-#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \
- uv_rect, color) \
- swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \
- color)
-
-template <typename S>
-static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf,
- S sampler, ivec2 i) {
- return textureNearestPackedRGBA8(sampler, i);
-}
-
-// Blends an entire span of texture with nearest filtering and either
-// repeated or clamped UVs.
-template <bool BLEND, bool REPEAT, typename S, typename C, typename P>
-static int blendTextureNearestRepeat(S sampler, vec2 uv, int span,
- const vec2_scalar& tile_repeat,
- const vec4_scalar& uv_rect, C color,
- P* buf) {
- if (!matchTextureFormat(sampler, buf)) {
- return 0;
- }
- if (!REPEAT) {
- // If clamping, then we step pre-scaled to the sampler. For repeat modes,
- // this will be accomplished via uv_scale instead.
- uv = samplerScale(sampler, uv);
- }
- vec2_scalar uv_step =
- float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x};
- vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y});
- vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w});
- vec2_scalar uv_scale = max_uv - min_uv;
- // If the effective sampling area of this texture is only a single pixel, then
- // treat it as a solid span. For repeat modes, the bounds are specified on
- // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so
- // the test varies depending on which. If the sample range on an axis is
- // greater than one pixel, we can still check if we don't move far enough from
- // the pixel center on that axis to hit the next pixel.
- if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) ||
- (uv_step.x * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) &&
- (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) ||
- (uv_step.y * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) {
- vec2 repeated_uv = REPEAT
- ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
- : clamp(uv, min_uv, max_uv);
- commit_solid_span<BLEND>(buf,
- applyColor(unpack(textureNearestPacked(
- buf, sampler, ivec2(repeated_uv))),
- color),
- span);
- } else {
- for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) {
- if (REPEAT) {
- int steps = int(end - buf) / swgl_StepSize;
- // Find the sub-span before UVs repeat to avoid expensive repeat math
- steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps);
- if (steps > 0) {
- steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps);
- if (steps > 0) {
- vec2 inside_uv = fract(uv) * uv_scale + min_uv;
- vec2 inside_step = uv_step * uv_scale;
- for (P* outside = &buf[steps * swgl_StepSize]; buf < outside;
- buf += swgl_StepSize, inside_uv += inside_step) {
- commit_blend_span<BLEND>(
- buf, applyColor(
- textureNearestPacked(buf, sampler, ivec2(inside_uv)),
- color));
- }
- if (buf >= end) {
- break;
- }
- uv += steps * uv_step;
- }
- }
- }
-
- // UVs might repeat within this step, so explicitly compute repeated UVs
- vec2 repeated_uv = REPEAT
- ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv
- : clamp(uv, min_uv, max_uv);
- commit_blend_span<BLEND>(
- buf,
- applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)),
- color));
- }
- }
- return span;
-}
-
-// Determine if we can use the fast nearest filter for the given nearest mode.
-// If the Y coordinate varies more than half a pixel over
-// the span (which might cause the texel to alias to the next one), or the span
-// needs X scaling, then we have to use the fallback.
-template <typename S, typename T>
-static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) {
- P = samplerScale(sampler, P);
- return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P);
-}
-
-// Commit an entire span with nearest filtering and either clamped or repeating
-// UVs
-#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \
- do { \
- auto packed_color = packColor(swgl_Out##format, color); \
- int drawn = 0; \
- if (needsNearestFallback(s, p, swgl_SpanLength)) { \
- if (blend_key) { \
- drawn = blendTextureNearestRepeat<true, false>( \
- s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \
- swgl_Out##format); \
- } else { \
- drawn = blendTextureNearestRepeat<false, false>( \
- s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \
- swgl_Out##format); \
- } \
- } else if (blend_key) { \
- drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect, \
- packed_color, swgl_Out##format); \
- } else { \
- drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect, \
- packed_color, swgl_Out##format); \
- } \
- swgl_Out##format += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \
- swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor())
-#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \
- swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color)
-
-#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \
- color) \
- do { \
- auto packed_color = packColor(swgl_Out##format, color); \
- int drawn = 0; \
- if (blend_key) { \
- drawn = blendTextureNearestRepeat<true, true>( \
- s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \
- swgl_Out##format); \
- } else { \
- drawn = blendTextureNearestRepeat<false, true>( \
- s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \
- swgl_Out##format); \
- } \
- swgl_Out##format += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \
- uv_rect) \
- swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \
- NoColor())
-#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \
- uv_repeat, uv_rect, color) \
- swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color)
-
-// Commit an entire span of texture with filtering determined by sampler state.
-#define swgl_commitTexture(format, s, ...) \
- do { \
- if (s->filter == TextureFilter::LINEAR) { \
- swgl_commitTextureLinear##format(s, __VA_ARGS__); \
- } else { \
- swgl_commitTextureNearest##format(s, __VA_ARGS__); \
- } \
- } while (0)
-#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__)
-#define swgl_commitTextureColorRGBA8(...) \
- swgl_commitTexture(ColorRGBA8, __VA_ARGS__)
-#define swgl_commitTextureRepeatRGBA8(...) \
- swgl_commitTexture(RepeatRGBA8, __VA_ARGS__)
-#define swgl_commitTextureRepeatColorRGBA8(...) \
- swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__)
-
-// Commit an entire span of a separable pass of a Gaussian blur that falls
-// within the given radius scaled by supplied coefficients, clamped to uv_rect
-// bounds.
-template <bool BLEND, typename S, typename P>
-static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect,
- P* buf, int span, bool hori, int radius,
- vec2_scalar coeffs) {
- if (!matchTextureFormat(sampler, buf)) {
- return 0;
- }
- vec2_scalar size = {float(sampler->width), float(sampler->height)};
- ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size);
- ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size));
- int startX = curUV.x;
- int endX = min(bounds.z, curUV.x + span);
- if (hori) {
- for (; curUV.x + swgl_StepSize <= endX;
- buf += swgl_StepSize, curUV.x += swgl_StepSize) {
- commit_blend_span<BLEND>(
- buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z,
- radius, coeffs.x, coeffs.y));
- }
- } else {
- for (; curUV.x + swgl_StepSize <= endX;
- buf += swgl_StepSize, curUV.x += swgl_StepSize) {
- commit_blend_span<BLEND>(
- buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w,
- radius, coeffs.x, coeffs.y));
- }
- }
- return curUV.x - startX;
-}
-
-#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \
- do { \
- int drawn = 0; \
- if (blend_key) { \
- drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format, \
- swgl_SpanLength, hori, radius, coeffs); \
- } else { \
- drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format, \
- swgl_SpanLength, hori, radius, coeffs); \
- } \
- swgl_Out##format += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \
- swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs)
-#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \
- swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs)
-
-// Convert and pack planar YUV samples to RGB output using a color space
-static ALWAYS_INLINE PackedRGBA8 convertYUV(int colorSpace, U16 y, U16 u,
- U16 v) {
- auto yy = V8<int16_t>(zip(y, y));
- auto uv = V8<int16_t>(zip(u, v));
- return yuvMatrix[colorSpace].convert(yy, uv);
-}
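As an illustration of what the matrix ultimately computes, here is the textbook scalar BT.601 limited-range conversion; this is an assumption for clarity, not SWGL's fixed-point yuvMatrix implementation:

#include <algorithm>
#include <cstdio>

// Illustrative scalar BT.601 limited-range YUV -> RGB (assumed textbook
// coefficients, not SWGL's actual fixed-point matrix).
static void yuvToRgb601(int y, int u, int v, int* r, int* g, int* b) {
  float yf = 1.164f * (y - 16);
  float uf = u - 128.0f;
  float vf = v - 128.0f;
  auto clamp8 = [](float x) { return std::min(std::max(int(x + 0.5f), 0), 255); };
  *r = clamp8(yf + 1.596f * vf);
  *g = clamp8(yf - 0.392f * uf - 0.813f * vf);
  *b = clamp8(yf + 2.017f * uf);
}

int main() {
  int r, g, b;
  yuvToRgb601(81, 90, 240, &r, &g, &b);  // roughly pure red in BT.601
  printf("r=%d g=%d b=%d\n", r, g, b);
  return 0;
}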
-
-// Helper functions to sample from planar YUV textures before converting to RGB
-template <typename S0>
-static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0,
- int colorSpace,
- UNUSED int rescaleFactor) {
- switch (sampler0->format) {
- case TextureFormat::RGBA8: {
- auto planar = textureLinearPlanarRGBA8(sampler0, uv0);
- return convertYUV(colorSpace, highHalf(planar.rg), lowHalf(planar.rg),
- lowHalf(planar.ba));
- }
- case TextureFormat::YUV422: {
- auto planar = textureLinearPlanarYUV422(sampler0, uv0);
- return convertYUV(colorSpace, planar.y, planar.u, planar.v);
- }
- default:
- assert(false);
- return PackedRGBA8(0);
- }
-}
-
-template <bool BLEND, typename S0, typename P, typename C = NoColor>
-static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
- const vec4_scalar& uv_rect0, int colorSpace,
- int rescaleFactor, C color = C()) {
- if (!swgl_isTextureLinear(sampler0)) {
- return 0;
- }
- LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
- auto c = packColor(buf, color);
- auto* end = buf + span;
- for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) {
- commit_blend_span<BLEND>(
- buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
- colorSpace, rescaleFactor),
- c));
- }
- return span;
-}
-
-template <typename S0, typename S1>
-static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
- ivec2 uv1, int colorSpace,
- UNUSED int rescaleFactor) {
- switch (sampler1->format) {
- case TextureFormat::RG8: {
- assert(sampler0->format == TextureFormat::R8);
- auto y = textureLinearUnpackedR8(sampler0, uv0);
- auto planar = textureLinearPlanarRG8(sampler1, uv1);
- return convertYUV(colorSpace, y, lowHalf(planar.rg), highHalf(planar.rg));
- }
- case TextureFormat::RGBA8: {
- assert(sampler0->format == TextureFormat::R8);
- auto y = textureLinearUnpackedR8(sampler0, uv0);
- auto planar = textureLinearPlanarRGBA8(sampler1, uv1);
- return convertYUV(colorSpace, y, lowHalf(planar.ba), highHalf(planar.rg));
- }
- default:
- assert(false);
- return PackedRGBA8(0);
- }
-}
-
-template <bool BLEND, typename S0, typename S1, typename P,
- typename C = NoColor>
-static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
- const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
- const vec4_scalar& uv_rect1, int colorSpace,
- int rescaleFactor, C color = C()) {
- if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) {
- return 0;
- }
- LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
- LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
- auto c = packColor(buf, color);
- auto* end = buf + span;
- for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) {
- commit_blend_span<BLEND>(
- buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
- sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
- colorSpace, rescaleFactor),
- c));
- }
- return span;
-}
-
-template <typename S0, typename S1, typename S2>
-static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1,
- ivec2 uv1, S2 sampler2, ivec2 uv2,
- int colorSpace, int rescaleFactor) {
- assert(sampler0->format == sampler1->format &&
- sampler0->format == sampler2->format);
- switch (sampler0->format) {
- case TextureFormat::R8: {
- auto y = textureLinearUnpackedR8(sampler0, uv0);
- auto u = textureLinearUnpackedR8(sampler1, uv1);
- auto v = textureLinearUnpackedR8(sampler2, uv2);
- return convertYUV(colorSpace, y, u, v);
- }
- case TextureFormat::R16: {
- // The rescaling factor represents how many bits to add to renormalize the
- // texture to 16 bits, and so the color depth is actually 16 minus the
- // rescaling factor.
-      // We need to right-shift the sample by the number of bits beyond 8 that
-      // it occupies. On output from textureLinearUnpackedR16, we have already
-      // lost 1 bit of precision at the low end, hence 1 is subtracted from the
-      // color depth.
- int colorDepth = 16 - rescaleFactor;
- int rescaleBits = (colorDepth - 1) - 8;
- auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits;
- auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits;
- auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits;
- return convertYUV(colorSpace, U16(y), U16(u), U16(v));
- }
- default:
- assert(false);
- return PackedRGBA8(0);
- }
-}
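To make the rescaling comment above concrete with a hypothetical 10-bit HDR plane stored in an R16 texture:

#include <cstdio>

int main() {
  // Hypothetical 10-bit video: 6 bits were added to renormalize to 16 bits,
  // so the effective color depth is 10. Since the R16 path has already lost
  // one low bit, the sample occupies 9 significant bits and a shift of 1
  // brings it down to the 8-bit range used by the YUV matrix.
  int rescaleFactor = 6;
  int colorDepth = 16 - rescaleFactor;     // 10
  int rescaleBits = (colorDepth - 1) - 8;  // 1
  printf("colorDepth=%d rescaleBits=%d\n", colorDepth, rescaleBits);
  return 0;
}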
-
-// Fallback helper for when we can't specifically accelerate YUV with
-// composition.
-template <bool BLEND, typename S0, typename S1, typename S2, typename P,
- typename C>
-static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0,
- vec2_scalar uv_step0, vec2_scalar min_uv0,
- vec2_scalar max_uv0, S1 sampler1, vec2 uv1,
- vec2_scalar uv_step1, vec2_scalar min_uv1,
- vec2_scalar max_uv1, S2 sampler2, vec2 uv2,
- vec2_scalar uv_step2, vec2_scalar min_uv2,
- vec2_scalar max_uv2, int colorSpace,
- int rescaleFactor, C color) {
- for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0,
- uv1 += uv_step1, uv2 += uv_step2) {
- commit_blend_span<BLEND>(
- buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)),
- sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)),
- sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)),
- colorSpace, rescaleFactor),
- color));
- }
-}
-
-template <bool BLEND, typename S0, typename S1, typename S2, typename P,
- typename C = NoColor>
-static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0,
- const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1,
- const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2,
- const vec4_scalar& uv_rect2, int colorSpace,
- int rescaleFactor, C color = C()) {
- if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
- !swgl_isTextureLinear(sampler2)) {
- return 0;
- }
- LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
- LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
- LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
- auto c = packColor(buf, color);
- blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0,
- sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2,
- uv2, uv_step2, min_uv2, max_uv2, colorSpace,
- rescaleFactor, c);
- return span;
-}
-
-// A variant of the blendYUV that attempts to reuse the inner loops from the
-// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on
-// the source data, which in turn allows it to be much faster than blendYUV.
-// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer
-// and that no color scaling is applied, which we can accomplish via template
-// specialization. We need to further validate inside that texture formats
-// and dimensions are sane for video and that the video is axis-aligned before
-// acceleration can proceed.
-template <bool BLEND>
-static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0,
- const vec4_scalar& uv_rect0, sampler2DRect sampler1,
- vec2 uv1, const vec4_scalar& uv_rect1,
- sampler2DRect sampler2, vec2 uv2,
- const vec4_scalar& uv_rect2, int colorSpace,
- int rescaleFactor, NoColor noColor = NoColor()) {
- if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) ||
- !swgl_isTextureLinear(sampler2)) {
- return 0;
- }
- LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0);
- LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1);
- LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2);
- auto* end = buf + span;
- // CompositeYUV imposes further restrictions on the source textures, such that
-  // the Y/U/V samplers must all have a matching format, the U/V samplers
- // must have matching sizes and sample coordinates, and there must be no
- // change in row across the entire span.
- if (sampler0->format == sampler1->format &&
- sampler1->format == sampler2->format &&
- sampler1->width == sampler2->width &&
- sampler1->height == sampler2->height && uv_step0.y == 0 &&
- uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 &&
- uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) {
- // CompositeYUV does not support a clamp rect, so we must take care to
- // advance till we're inside the bounds of the clamp rect.
- int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x,
- (min_uv1.x - uv1.x.x) / uv_step1.x))),
- (end - buf) / swgl_StepSize);
- if (outside > 0) {
- blendYUVFallback<BLEND>(
- buf, outside * swgl_StepSize, sampler0, uv0, uv_step0, min_uv0,
- max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, uv2,
- uv_step2, min_uv2, max_uv2, colorSpace, rescaleFactor, noColor);
- buf += outside * swgl_StepSize;
- uv0.x += outside * uv_step0.x;
- uv1.x += outside * uv_step1.x;
- uv2.x += outside * uv_step2.x;
- }
- // Find the amount of chunks inside the clamp rect before we hit the
- // maximum. If there are any chunks inside, we can finally dispatch to
- // CompositeYUV.
- int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x,
- (max_uv1.x - uv1.x.x) / uv_step1.x)),
- (end - buf) / swgl_StepSize);
- if (inside > 0) {
- // We need the color depth, which is relative to the texture format and
- // rescale factor.
- int colorDepth =
- (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor;
- // Finally, call the inner loop of CompositeYUV.
- linear_row_yuv<BLEND>(
- buf, inside * swgl_StepSize, sampler0, force_scalar(uv0),
- uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1),
- uv_step1.x / swgl_StepSize, colorDepth, yuvMatrix[colorSpace]);
- // Now that we're done, advance past the processed inside portion.
- buf += inside * swgl_StepSize;
- uv0.x += inside * uv_step0.x;
- uv1.x += inside * uv_step1.x;
- uv2.x += inside * uv_step2.x;
- }
- }
- // We either got here because we have some samples outside the clamp rect, or
- // because some of the preconditions were not satisfied. Process whatever is
- // left of the span.
- blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0,
- max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1,
- sampler2, uv2, uv_step2, min_uv2, max_uv2, colorSpace,
- rescaleFactor, noColor);
- return span;
-}
-
-// Commit a single chunk of a YUV surface represented by multiple planar
-// textures. This requires a color space specifier selecting how to convert
-// from YUV to RGB output. In the case of HDR formats, a rescaling factor
-// selects how many bits of precision must be utilized on conversion. See the
-// sampleYUV dispatcher functions for the various supported plane
-// configurations this intrinsic accepts.
-#define swgl_commitTextureLinearYUV(...) \
- do { \
- int drawn = 0; \
- if (blend_key) { \
- drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
- } else { \
- drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \
- } \
- swgl_OutRGBA8 += drawn; \
- swgl_SpanLength -= drawn; \
- } while (0)
-
-// Commit a single chunk of a YUV surface scaled by a color.
-#define swgl_commitTextureLinearColorYUV(...) \
- swgl_commitTextureLinearYUV(__VA_ARGS__)
-
-// Each gradient stops entry is a pair of RGBA32F start color and end step.
-struct GradientStops {
- Float startColor;
- union {
- Float stepColor;
- vec4_scalar stepData;
- };
-
- // Whether this gradient entry can be merged with an adjacent entry. The
-  // step will be equal to the adjacent step if and only if they can be
-  // merged, i.e. the stops are actually part of a single larger gradient.
- bool can_merge(const GradientStops& next) const {
- return stepData == next.stepData;
- }
-
- // Get the interpolated color within the entry based on the offset from its
- // start.
- Float interpolate(float offset) const {
- return startColor + stepColor * offset;
- }
-
- // Get the end color of the entry where interpolation stops.
- Float end_color() const { return startColor + stepColor; }
-};
-
-// Checks if a gradient table of the specified size exists at the UV coords of
-// the address within an RGBA32F texture. If so, a linear address within the
-// texture is returned that may be used to sample the gradient table later. If
-// the address doesn't describe a valid gradient, then a negative value is
-// returned.
-static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address,
- int entries) {
- return sampler->format == TextureFormat::RGBA32F && address.y >= 0 &&
- address.y < int(sampler->height) && address.x >= 0 &&
- address.x < int(sampler->width) && entries > 0 &&
- address.x +
- int(sizeof(GradientStops) / sizeof(Float)) * entries <=
- int(sampler->width)
- ? address.y * sampler->stride + address.x * 4
- : -1;
-}
-
-static inline WideRGBA8 sampleGradient(sampler2D sampler, int address,
- Float entry) {
- assert(sampler->format == TextureFormat::RGBA32F);
- assert(address >= 0 && address < int(sampler->height * sampler->stride));
- // Get the integer portion of the entry index to find the entry colors.
- I32 index = cast(entry);
- // Use the fractional portion of the entry index to control blending between
- // entry colors.
- Float offset = entry - cast(index);
- // Every entry is a pair of colors blended by the fractional offset.
- assert(test_all(index >= 0 &&
- index * int(sizeof(GradientStops) / sizeof(Float)) <
- int(sampler->width)));
- GradientStops* stops = (GradientStops*)&sampler->buf[address];
- // Blend between the colors for each SIMD lane, then pack them to RGBA8
- // result. Since the layout of the RGBA8 framebuffer is actually BGRA while
- // the gradient table has RGBA colors, swizzling is required.
- return combine(
- packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw),
- round_pixel(stops[index.y].interpolate(offset.y).zyxw)),
- packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw),
- round_pixel(stops[index.w].interpolate(offset.w).zyxw)));
-}
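A scalar restating of that lookup (hypothetical struct with plain floats in place of SIMD lanes): the integer part of `entry` selects a stop pair and the fractional part blends from its start color along its step.

#include <cstdio>

struct StopScalar {
  float startR, startG, startB, startA;  // start color of the entry
  float stepR, stepG, stepB, stepA;      // delta to the entry's end color
};

static void sampleEntry(const StopScalar* stops, float entry, float rgba[4]) {
  int index = int(entry);               // which table entry
  float offset = entry - float(index);  // blend factor within the entry
  const StopScalar& s = stops[index];
  rgba[0] = s.startR + s.stepR * offset;
  rgba[1] = s.startG + s.stepG * offset;
  rgba[2] = s.startB + s.stepB * offset;
  rgba[3] = s.startA + s.stepA * offset;
}

int main() {
  // Two hypothetical entries: black fading to white, then solid white.
  StopScalar stops[2] = {{0, 0, 0, 1, 1, 1, 1, 0}, {1, 1, 1, 1, 0, 0, 0, 0}};
  float rgba[4];
  sampleEntry(stops, 0.25f, rgba);
  printf("%.2f %.2f %.2f %.2f\n", rgba[0], rgba[1], rgba[2], rgba[3]);
  return 0;
}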
-
-// Samples a gradient entry from the gradient at the provided linearized
-// address. The integer portion of the entry index is used to find the entry
-// within the table whereas the fractional portion is used to blend between
-// adjacent table entries.
-#define swgl_commitGradientRGBA8(sampler, address, entry) \
- swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry))
-
-// Variant that allows specifying a color multiplier of the gradient result.
-#define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \
- swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \
- packColor(swgl_OutRGBA, color)))
-
-// Samples an entire span of a linear gradient by crawling the gradient table
-// and looking for consecutive stops that can be merged into a single larger
-// gradient, then interpolating between those larger gradients within the span.
-template <bool BLEND>
-static bool commitLinearGradient(sampler2D sampler, int address, float size,
- bool repeat, Float offset, uint32_t* buf,
- int span) {
- assert(sampler->format == TextureFormat::RGBA32F);
- assert(address >= 0 && address < int(sampler->height * sampler->stride));
- GradientStops* stops = (GradientStops*)&sampler->buf[address];
- // Get the chunk delta from the difference in offset steps. This represents
- // how far within the gradient table we advance for every step in output,
- // normalized to gradient table size.
- float delta = (offset.y - offset.x) * 4.0f;
- if (!isfinite(delta)) {
- return false;
- }
- for (; span > 0;) {
- // If repeat is desired, we need to limit the offset to a fractional value.
- if (repeat) {
- offset = fract(offset);
- }
- // Try to process as many chunks as are within the span if possible.
- float chunks = 0.25f * span;
- // To properly handle both clamping and repeating of the table offset, we
- // need to ensure we don't run past the 0 and 1 points. Here we compute the
- // intercept points depending on whether advancing forwards or backwards in
- // the gradient table to ensure the chunk count is limited by the amount
- // before intersection. If there is no delta, then we compute no intercept.
- float startEntry;
- int minIndex, maxIndex;
- if (offset.x < 0) {
- // If we're below the gradient table, use the first color stop. We can
- // only intercept the table if walking forward.
- startEntry = 0;
- minIndex = int(startEntry);
- maxIndex = minIndex;
- if (delta > 0) {
- chunks = min(chunks, -offset.x / delta);
- }
- } else if (offset.x < 1) {
- // Otherwise, we're inside the gradient table. Depending on the direction
-      // we're walking the table, we may intersect either the 0 or 1 offset.
- // Compute the start entry based on our initial offset, and compute the
- // end entry based on the available chunks limited by intercepts. Clamp
- // them into the valid range of the table.
- startEntry = 1.0f + offset.x * size;
- if (delta < 0) {
- chunks = min(chunks, -offset.x / delta);
- } else if (delta > 0) {
- chunks = min(chunks, (1 - offset.x) / delta);
- }
- float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size,
- 0.0f, 1.0f + size);
- // Now that we know the range of entries we need to sample, we want to
- // find the largest possible merged gradient within that range. Depending
- // on which direction we are advancing in the table, we either walk up or
- // down the table trying to merge the current entry with the adjacent
- // entry. We finally limit the chunks to only sample from this merged
- // gradient.
- minIndex = int(startEntry);
- maxIndex = minIndex;
- if (delta > 0) {
- while (maxIndex + 1 < endEntry &&
- stops[maxIndex].can_merge(stops[maxIndex + 1])) {
- maxIndex++;
- }
- chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size));
- } else if (delta < 0) {
- while (minIndex - 1 > endEntry &&
- stops[minIndex - 1].can_merge(stops[minIndex])) {
- minIndex--;
- }
- chunks = min(chunks, (minIndex - startEntry) / (delta * size));
- }
- } else {
- // If we're above the gradient table, use the last color stop. We can
- // only intercept the table if walking backward.
- startEntry = 1.0f + size;
- minIndex = int(startEntry);
- maxIndex = minIndex;
- if (delta < 0) {
- chunks = min(chunks, (1 - offset.x) / delta);
- }
- }
-    // If any whole chunks of a merged gradient were found, then we want to
-    // process them as a single gradient span with the start and end colors
-    // from the min and max entries.
- if (chunks >= 1.0f) {
- int inside = int(chunks);
- // Sample the start color from the min entry and the end color from the
- // max entry of the merged gradient. These are scaled to a range of
- // 0..0xFF00, as that is the largest shifted value that can fit in a U16.
- // Since we are only doing addition with the step value, we can still
- // represent negative step values without having to use an explicit sign
- // bit, as the result will still come out the same, allowing us to gain an
- // extra bit of precision. We will later shift these into 8 bit output
- // range while committing the span, but stepping with higher precision to
- // avoid banding. We convert from RGBA to BGRA here to avoid doing this in
- // the inner loop.
- auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00);
- auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00);
- // Get the color range of the merged gradient, normalized to its size.
- auto colorRangeF =
- (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex));
- // Compute the actual starting color of the current start offset within
- // the merged gradient. The value 0.5 is added to the low bits (0x80) so
-      // that the color will effectively round to the nearest increment below.
- auto colorF =
- minColorF + colorRangeF * (startEntry - minIndex) + float(0x80);
- // Compute the portion of the color range that we advance on each chunk.
- Float deltaColorF = colorRangeF * (delta * size);
- // Quantize the color delta and current color. These have already been
- // scaled to the 0..0xFF00 range, so we just need to round them to U16.
- auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16));
- auto color =
- combine(CONVERT(round_pixel(colorF, 1), U16),
- CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16),
- CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16),
- CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16));
- // Finally, step the current color through the output chunks, shifting
- // it into 8 bit range and outputting as we go.
- for (auto* end = buf + inside * 4; buf < end; buf += 4) {
- commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8));
- color += deltaColor;
- }
- // Deduct the number of chunks inside the gradient from the remaining
- // overall span. If we exhausted the span, bail out.
- span -= inside * 4;
- if (span <= 0) {
- break;
- }
- // Otherwise, assume we're in a transitional section of the gradient that
- // will probably require per-sample table lookups, so fall through below.
- offset += inside * delta;
- if (repeat) {
- offset = fract(offset);
- }
- }
- // If we get here, there were no whole chunks of a merged gradient found
- // that we could process, but we still have a non-zero amount of span left.
- // That means we have segments of gradient that begin or end at the current
- // entry we're on. For this case, we just fall back to sampleGradient which
- // will calculate a table entry for each sample, assuming the samples may
- // have different table entries.
- Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
- commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
- span -= 4;
- buf += 4;
- offset += delta;
- }
- return true;
-}
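The fixed-point color stepping in the merged-gradient path above (colors scaled to 0..0xFF00, a 0x80 rounding bias, a plain u16 add per chunk, then a >>8 shift on output) can be modelled per channel with a small standalone sketch. The helper below is illustrative only and uses scalar math rather than SWGL's vector types; the wrapping u16 add is what lets negative steps work without an explicit sign bit.

#include <cstdint>
#include <cstdio>

// Scalar model of the merged-gradient inner loop: one channel stepped in
// 0..0xFF00 fixed point and shifted down to 8 bits on output.
static void stepGradientChannel(uint16_t start, int16_t step, int chunks) {
  uint16_t color = uint16_t(start + 0x80);     // 0.5 bias in low-bit units
  for (int i = 0; i < chunks; i++) {
    printf("%u\n", unsigned(color >> 8));      // shift into 8-bit output range
    color = uint16_t(color + uint16_t(step));  // wrapping add handles negative steps
  }
}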
-
-// Commits an entire span of a linear gradient, given the address of a table
-// previously resolved with swgl_validateGradient. The size of the inner portion
- // of the table is given, assuming the table starts and ends with a single entry
-// each to deal with clamping. Repeating will be handled if necessary. The
-// initial offset within the table is used to designate where to start the span
-// and how to step through the gradient table.
-#define swgl_commitLinearGradientRGBA8(sampler, address, size, repeat, offset) \
- do { \
- bool drawn = false; \
- if (blend_key) { \
- drawn = \
- commitLinearGradient<true>(sampler, address, size, repeat, offset, \
- swgl_OutRGBA8, swgl_SpanLength); \
- } else { \
- drawn = \
- commitLinearGradient<false>(sampler, address, size, repeat, offset, \
- swgl_OutRGBA8, swgl_SpanLength); \
- } \
- if (drawn) { \
- swgl_OutRGBA8 += swgl_SpanLength; \
- swgl_SpanLength = 0; \
- } \
- } while (0)
-
-template <bool CLAMP, typename V>
-static ALWAYS_INLINE V fastSqrt(V v) {
-#if USE_SSE2 || USE_NEON
- // Clamp to avoid zero in inversesqrt.
- return v * inversesqrt(CLAMP ? max(v, V(1.0e-10f)) : v);
-#else
- return sqrt(v);
-#endif
-}
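On SIMD targets the template above computes sqrt(v) as v * inversesqrt(v). A minimal SSE sketch of that identity is below; it uses the raw _mm_rsqrt_ps approximation (the real inversesqrt helper may add a Newton-Raphson refinement step), and the clamp mirrors the CLAMP path that keeps the reciprocal square root away from zero.

#include <xmmintrin.h>

// Approximate sqrt of four floats via v * rsqrt(v). Clamping the rsqrt input
// avoids inf * 0 = NaN when a lane is exactly zero.
static __m128 fastSqrtSSE(__m128 v) {
  __m128 clamped = _mm_max_ps(v, _mm_set1_ps(1.0e-10f));
  return _mm_mul_ps(v, _mm_rsqrt_ps(clamped));
}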
-
-template <bool CLAMP, typename V>
-static ALWAYS_INLINE auto fastLength(V v) {
- return fastSqrt<CLAMP>(dot(v, v));
-}
-
-// Samples an entire span of a radial gradient by crawling the gradient table
-// and looking for consecutive stops that can be merged into a single larger
-// gradient, then interpolating between those larger gradients within the span
-// based on the computed position relative to a radius.
-template <bool BLEND>
-static bool commitRadialGradient(sampler2D sampler, int address, float size,
- bool repeat, vec2 pos, float radius,
- uint32_t* buf, int span) {
- assert(sampler->format == TextureFormat::RGBA32F);
- assert(address >= 0 && address < int(sampler->height * sampler->stride));
- GradientStops* stops = (GradientStops*)&sampler->buf[address];
- // clang-format off
- // Given position p, delta d, and radius r, we need to repeatedly solve the
- // following quadratic for the pixel offset t:
- // length(p + t*d) = r
- // (px + t*dx)^2 + (py + t*dy)^2 = r^2
- // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is:
- // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0
- // t^2*d.d + t*2*d.p + (p.p-r^2) = 0
- // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to:
- // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d)
- // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so
- // we cache them below for faster computation.
- //
- // The quadratic has two solutions, representing the span intersecting the
- // given radius of gradient, which can occur at two offsets. If there is only
- // one solution (where b^2-4ac = 0), this represents the point at which the
- // span runs tangent to the radius. This middle point is significant in that
- // before it, we walk down the gradient ramp, and after it, we walk up the
- // ramp.
- // clang-format on
- vec2_scalar pos0 = {pos.x.x, pos.y.x};
- vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x};
- float deltaDelta = dot(delta, delta);
- if (!isfinite(deltaDelta) || !isfinite(radius)) {
- return false;
- }
- float invDelta, middleT, middleB;
- if (deltaDelta > 0) {
- invDelta = 1.0f / deltaDelta;
- middleT = -dot(delta, pos0) * invDelta;
- middleB = middleT * middleT - dot(pos0, pos0) * invDelta;
- } else {
- // If position is invariant, just set the coefficients so the quadratic
- // always reduces to the end of the span.
- invDelta = 0.0f;
- middleT = float(span);
- middleB = 0.0f;
- }
- // We only want to search for merged gradients up to the minimum of either the
- // mid-point or the span length. Cache those offsets here as they don't vary
- // in the inner loop.
- Float middleEndRadius = fastLength<true>(
- pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f});
- float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x;
- float endRadius = middleEndRadius.y;
- // Convert delta to change in position per chunk.
- delta *= 4;
- deltaDelta *= 4 * 4;
- // clang-format off
- // Given current position p and delta d, we reduce:
- // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p))
- // where dot(p+d,p+d) can be accumulated as:
- // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2)
- // = p.p + 2p.d + d.d
- // Since p increases by d every loop iteration, p.d increases by d.d, and thus
- // we can accumulate d.d to calculate 2p.d, then allowing us to get the next
- // dot-product by adding it to dot-product p.p of the prior iteration. This
- // saves us some multiplications and an expensive sqrt inside the inner loop.
- // clang-format on
- Float dotPos = dot(pos, pos);
- Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta;
- float deltaDelta2 = 2.0f * deltaDelta;
- for (int t = 0; t < span;) {
- // Compute the gradient table offset from the current position.
- Float offset = fastSqrt<true>(dotPos) - radius;
- float startRadius = radius;
- // If repeat is desired, we need to limit the offset to a fractional value.
- if (repeat) {
- // The non-repeating radius at which the gradient table actually starts,
- // radius + floor(offset) = radius + (offset - fract(offset)).
- startRadius += offset.x;
- offset = fract(offset);
- startRadius -= offset.x;
- }
- // We need to find the min/max index in the table of the gradient we want to
- // use as well as the intercept point where we leave this gradient.
- float intercept = -1;
- int minIndex = 0;
- int maxIndex = int(1.0f + size);
- if (offset.x < 0) {
- // If inside the inner radius of the gradient table, then use the first
- // stop. Set the intercept to advance forward to the start of the gradient
- // table.
- maxIndex = minIndex;
- if (t >= middleT) {
- intercept = radius;
- }
- } else if (offset.x < 1) {
- // Otherwise, we're inside the valid part of the gradient table.
- minIndex = int(1.0f + offset.x * size);
- maxIndex = minIndex;
- // Find the offset in the gradient that corresponds to the search limit.
- // We only search up to the minimum of either the mid-point or the span
- // length. Get the table index that corresponds to this offset, clamped so
- // that we avoid hitting the beginning (0) or end (1 + size) of the table.
- float searchOffset =
- (t >= middleT ? endRadius : middleRadius) - startRadius;
- int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size));
- // If we are past the mid-point, walk up the gradient table trying to
- // merge stops. If we're below the mid-point, we need to walk down the
- // table. We note the table index at which we need to look for an
- // intercept to determine a valid span.
- if (t >= middleT) {
- while (maxIndex + 1 <= searchIndex &&
- stops[maxIndex].can_merge(stops[maxIndex + 1])) {
- maxIndex++;
- }
- intercept = maxIndex + 1;
- } else {
- while (minIndex - 1 >= searchIndex &&
- stops[minIndex - 1].can_merge(stops[minIndex])) {
- minIndex--;
- }
- intercept = minIndex;
- }
- // Convert from a table index into units of radius from the center of the
- // gradient.
- intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius;
- } else {
- // If outside the outer radius of the gradient table, then use the last
- // stop. Set the intercept to advance toward the valid part of the
- // gradient table if going in, or just run to the end of the span if going
- // away from the gradient.
- minIndex = maxIndex;
- if (t < middleT) {
- intercept = radius + 1;
- }
- }
- // Solve the quadratic for t to find where the merged gradient ends. If no
- // intercept is found, just go to the middle or end of the span.
- float endT = t >= middleT ? span : min(span, int(middleT));
- if (intercept >= 0) {
- float b = middleB + intercept * intercept * invDelta;
- if (b > 0) {
- b = fastSqrt<false>(b);
- endT = min(endT, t >= middleT ? middleT + b : middleT - b);
- }
- }
- // Figure out how many chunks are actually inside the merged gradient.
- if (t + 4.0f <= endT) {
- int inside = int(endT - t) & ~3;
- // Convert start and end colors to BGRA and scale to 0..255 range later.
- auto minColorF = stops[minIndex].startColor.zyxw * 255.0f;
- auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f;
- // Compute the change in color per change in gradient offset.
- auto deltaColorF =
- (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex));
- // Subtract off the color difference of the beginning of the current span
- // from the beginning of the gradient.
- Float colorF =
- minColorF - deltaColorF * (startRadius + (minIndex - 1) / size);
- // Finally, walk over the span accumulating the position dot product and
- // getting its sqrt as an offset into the color ramp. Since we're already
- // in BGRA format and scaled to 255, we just need to round to an integer
- // and pack down to pixel format.
- for (auto* end = buf + inside; buf < end; buf += 4) {
- Float offsetG = fastSqrt<false>(dotPos);
- commit_blend_span<BLEND>(
- buf,
- combine(
- packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1),
- round_pixel(colorF + deltaColorF * offsetG.y, 1)),
- packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1),
- round_pixel(colorF + deltaColorF * offsetG.w, 1))));
- dotPos += dotPosDelta;
- dotPosDelta += deltaDelta2;
- }
- // Advance past the portion of gradient we just processed.
- t += inside;
- // If we hit the end of the span, exit out now.
- if (t >= span) {
- break;
- }
- // Otherwise, we are most likely in a transitional section of the gradient
- // between stops that will probably require per-sample table lookups.
- // Rather than having to redo all the searching above to figure that out,
- // just assume that to be the case and fall through below to doing the
- // table lookups to hopefully avoid an iteration.
- offset = fastSqrt<true>(dotPos) - radius;
- if (repeat) {
- offset = fract(offset);
- }
- }
- // If we got here, that means we still have span left to process but did not
- // have any whole chunks that fell within a merged gradient. Just fall back
- // to doing a table lookup for each sample.
- Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size);
- commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry));
- buf += 4;
- t += 4;
- dotPos += dotPosDelta;
- dotPosDelta += deltaDelta2;
- }
- return true;
-}
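The quadratic worked through in the comment at the top of commitRadialGradient reduces to t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p - r^2)/d.d). A standalone scalar sketch of that solve (illustrative names, not the cached middleT/middleB form used above):

#include <cmath>
#include <optional>
#include <utility>

struct Vec2f { float x, y; };
static float dot2(Vec2f a, Vec2f b) { return a.x * b.x + a.y * b.y; }

// Returns the two span offsets t where length(p + t*d) == r, or nothing if
// the delta is degenerate or the span never reaches that radius.
static std::optional<std::pair<float, float>> intersectRadius(Vec2f p, Vec2f d,
                                                              float r) {
  float dd = dot2(d, d);
  if (dd <= 0.0f) return std::nullopt;
  float mid = -dot2(d, p) / dd;                        // tangent (closest) point
  float disc = mid * mid - (dot2(p, p) - r * r) / dd;  // discriminant term
  if (disc < 0.0f) return std::nullopt;
  float s = std::sqrt(disc);
  return std::make_pair(mid - s, mid + s);             // entering / leaving offsets
}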
-
-// Commits an entire span of a radial gradient similar to
- // swgl_commitLinearGradientRGBA8, but given a varying 2D position scaled to
-// gradient-space and a radius at which the distance from the origin maps to the
-// start of the gradient table.
-#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \
- radius) \
- do { \
- bool drawn = false; \
- if (blend_key) { \
- drawn = \
- commitRadialGradient<true>(sampler, address, size, repeat, pos, \
- radius, swgl_OutRGBA8, swgl_SpanLength); \
- } else { \
- drawn = \
- commitRadialGradient<false>(sampler, address, size, repeat, pos, \
- radius, swgl_OutRGBA8, swgl_SpanLength); \
- } \
- if (drawn) { \
- swgl_OutRGBA8 += swgl_SpanLength; \
- swgl_SpanLength = 0; \
- } \
- } while (0)
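The inner loop of commitRadialGradient avoids recomputing dot(p, p) per chunk by accumulating second differences, since dot(p + d, p + d) = p.p + 2*p.d + d.d. A scalar sketch of the same accumulation, checked against the direct computation (positions and deltas are illustrative):

#include <cassert>
#include <cmath>

// dotPos tracks dot(p, p); dotPosDelta tracks how much it grows on the next
// step, which itself grows by 2*d.d each iteration.
static void accumulateDotExample() {
  float px = 3.0f, py = -2.0f, dx = 0.5f, dy = 0.25f;
  float dd = dx * dx + dy * dy;
  float dotPos = px * px + py * py;
  float dotPosDelta = 2.0f * (px * dx + py * dy) + dd;
  for (int i = 0; i < 16; i++) {
    float x = px + i * dx, y = py + i * dy;
    assert(std::fabs(dotPos - (x * x + y * y)) < 1.0e-3f);
    dotPos += dotPosDelta;
    dotPosDelta += 2.0f * dd;
  }
}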
-
-// Extension to set a clip mask image to be sampled during blending. The offset
-// specifies the positioning of the clip mask image relative to the viewport
-// origin. The bounding box specifies the rectangle relative to the clip mask's
-// origin that constrains sampling within the clip mask. Blending must be
-// enabled for this to work.
-static sampler2D swgl_ClipMask = nullptr;
-static IntPoint swgl_ClipMaskOffset = {0, 0};
-static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0};
-#define swgl_clipMask(mask, offset, bb_origin, bb_size) \
- do { \
- if (bb_size != vec2_scalar(0.0f, 0.0f)) { \
- swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK; \
- swgl_ClipMask = mask; \
- swgl_ClipMaskOffset = make_ivec2(offset); \
- swgl_ClipMaskBounds = \
- IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \
- } \
- } while (0)
-
-// Extension to enable anti-aliasing for the given edges of a quad.
- // Blending must be enabled for this to work.
-static int swgl_AAEdgeMask = 0;
-
-static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; }
-static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; }
-static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) {
- return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) |
- (mask.w ? 8 : 0);
-}
-
-#define swgl_antiAlias(edges) \
- do { \
- swgl_AAEdgeMask = calcAAEdgeMask(edges); \
- if (swgl_AAEdgeMask) { \
- swgl_ClipFlags |= SWGL_CLIP_FLAG_AA; \
- } \
- } while (0)
-
-#define swgl_blendDropShadow(color) \
- do { \
- swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \
- swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \
- swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \
- } while (0)
-
-#define swgl_blendSubpixelText(color) \
- do { \
- swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \
- swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \
- swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \
- swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8); \
- } while (0)
-
-// Dispatch helper used by the GLSL translator to dispatch to swgl_drawSpan
-// functions.
-// The number of pixels committed is tracked by checking for the difference in
-// swgl_SpanLength. Any varying interpolants used will be advanced past the
-// committed part of the span in case the fragment shader must be executed for
-// any remaining pixels that were not committed by the span shader.
-#define DISPATCH_DRAW_SPAN(self, format) \
- do { \
- int total = self->swgl_SpanLength; \
- self->swgl_drawSpan##format(); \
- int drawn = total - self->swgl_SpanLength; \
- if (drawn) self->step_interp_inputs(drawn); \
- return drawn; \
- } while (0)
diff --git a/third_party/webrender/swgl/src/swgl_fns.rs b/third_party/webrender/swgl/src/swgl_fns.rs
index fdb55058afe..0cb60c6d4c8 100644
--- a/third_party/webrender/swgl/src/swgl_fns.rs
+++ b/third_party/webrender/swgl/src/swgl_fns.rs
@@ -14,12 +14,8 @@ macro_rules! debug {
($($x:tt)*) => {};
}
-#[repr(C)]
-struct LockedTexture {
- _private: [u8; 0],
-}
+extern "C" {}
-#[allow(dead_code)]
extern "C" {
fn ActiveTexture(texture: GLenum);
fn BindTexture(target: GLenum, texture: GLuint);
@@ -65,7 +61,19 @@ extern "C" {
level: GLint,
);
fn CheckFramebufferStatus(target: GLenum) -> GLenum;
- fn InvalidateFramebuffer(target: GLenum, num_attachments: GLsizei, attachments: *const GLenum);
+ fn InvalidateFramebuffer(
+ target: GLenum,
+ num_attachments: GLsizei,
+ attachments: *const GLenum,
+ );
+ fn TexStorage3D(
+ target: GLenum,
+ levels: GLint,
+ internal_format: GLenum,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ );
fn TexImage2D(
target: GLenum,
level: GLint,
@@ -77,6 +85,18 @@ extern "C" {
ty: GLenum,
data: *const c_void,
);
+ fn TexImage3D(
+ target: GLenum,
+ level: GLint,
+ internal_format: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ border: GLint,
+ format: GLenum,
+ ty: GLenum,
+ data: *const c_void,
+ );
fn TexSubImage2D(
target: GLenum,
level: GLint,
@@ -88,6 +108,19 @@ extern "C" {
ty: GLenum,
data: *const c_void,
);
+ fn TexSubImage3D(
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ depth: GLsizei,
+ format: GLenum,
+ ty: GLenum,
+ data: *const c_void,
+ );
fn GenerateMipmap(target: GLenum);
fn GetUniformLocation(program: GLuint, name: *const GLchar) -> GLint;
fn BindAttribLocation(program: GLuint, index: GLuint, name: *const GLchar);
@@ -119,19 +152,26 @@ extern "C" {
transpose: GLboolean,
value: *const GLfloat,
);
+
fn DrawElementsInstanced(
mode: GLenum,
count: GLsizei,
type_: GLenum,
- indices: GLintptr,
+ indices: *const c_void,
instancecount: GLsizei,
);
fn EnableVertexAttribArray(index: GLuint);
fn VertexAttribDivisor(index: GLuint, divisor: GLuint);
fn LinkProgram(program: GLuint);
- fn GetLinkStatus(program: GLuint) -> GLint;
fn UseProgram(program: GLuint);
fn SetViewport(x: GLint, y: GLint, width: GLsizei, height: GLsizei);
+ fn FramebufferTextureLayer(
+ target: GLenum,
+ attachment: GLenum,
+ texture: GLuint,
+ level: GLint,
+ layer: GLint,
+ );
fn FramebufferRenderbuffer(
target: GLenum,
attachment: GLenum,
@@ -145,31 +185,6 @@ extern "C" {
fn ClearColor(r: GLfloat, g: GLfloat, b: GLfloat, a: GLfloat);
fn ClearDepth(depth: GLdouble);
fn Clear(mask: GLbitfield);
- fn ClearTexSubImage(
- target: GLenum,
- level: GLint,
- xoffset: GLint,
- yoffset: GLint,
- zoffset: GLint,
- width: GLsizei,
- height: GLsizei,
- depth: GLsizei,
- format: GLenum,
- ty: GLenum,
- data: *const c_void,
- );
- fn ClearTexImage(target: GLenum, level: GLint, format: GLenum, ty: GLenum, data: *const c_void);
- fn ClearColorRect(
- fbo: GLuint,
- xoffset: GLint,
- yoffset: GLint,
- width: GLsizei,
- height: GLsizei,
- r: GLfloat,
- g: GLfloat,
- b: GLfloat,
- a: GLfloat,
- );
fn PixelStorei(name: GLenum, param: GLint);
fn ReadPixels(
x: GLint,
@@ -210,6 +225,17 @@ extern "C" {
width: GLsizei,
height: GLsizei,
);
+ fn CopyTexSubImage3D(
+ target: GLenum,
+ level: GLint,
+ xoffset: GLint,
+ yoffset: GLint,
+ zoffset: GLint,
+ x: GLint,
+ y: GLint,
+ width: GLsizei,
+ height: GLsizei,
+ );
fn BlitFramebuffer(
src_x0: GLint,
src_y0: GLint,
@@ -227,33 +253,22 @@ extern "C" {
fn GetString(name: GLenum) -> *const c_char;
fn GetStringi(name: GLenum, index: GLuint) -> *const c_char;
fn GetError() -> GLenum;
- fn InitDefaultFramebuffer(
- x: i32,
- y: i32,
- width: i32,
- height: i32,
- stride: i32,
- buf: *mut c_void,
- );
+ fn InitDefaultFramebuffer(width: i32, height: i32);
fn GetColorBuffer(
fbo: GLuint,
flush: GLboolean,
width: *mut i32,
height: *mut i32,
- stride: *mut i32,
) -> *mut c_void;
- fn ResolveFramebuffer(fbo: GLuint);
fn SetTextureBuffer(
tex: GLuint,
internal_format: GLenum,
width: GLsizei,
height: GLsizei,
- stride: GLsizei,
buf: *mut c_void,
min_width: GLsizei,
min_height: GLsizei,
);
- fn SetTextureParameter(tex: GLuint, pname: GLenum, param: GLint);
fn DeleteTexture(n: GLuint);
fn DeleteRenderbuffer(n: GLuint);
fn DeleteFramebuffer(n: GLuint);
@@ -262,64 +277,23 @@ extern "C" {
fn DeleteQuery(n: GLuint);
fn DeleteShader(shader: GLuint);
fn DeleteProgram(program: GLuint);
- fn LockFramebuffer(fbo: GLuint) -> *mut LockedTexture;
- fn LockTexture(tex: GLuint) -> *mut LockedTexture;
- fn LockResource(resource: *mut LockedTexture);
- fn UnlockResource(resource: *mut LockedTexture);
- fn GetResourceBuffer(
- resource: *mut LockedTexture,
- width: *mut i32,
- height: *mut i32,
- stride: *mut i32,
- ) -> *mut c_void;
fn Composite(
- locked_dst: *mut LockedTexture,
- locked_src: *mut LockedTexture,
+ src_id: GLuint,
src_x: GLint,
src_y: GLint,
src_width: GLsizei,
src_height: GLsizei,
dst_x: GLint,
dst_y: GLint,
- dst_width: GLsizei,
- dst_height: GLsizei,
opaque: GLboolean,
flip: GLboolean,
- filter: GLenum,
- clip_x: GLint,
- clip_y: GLint,
- clip_width: GLsizei,
- clip_height: GLsizei,
- );
- fn CompositeYUV(
- locked_dst: *mut LockedTexture,
- locked_y: *mut LockedTexture,
- locked_u: *mut LockedTexture,
- locked_v: *mut LockedTexture,
- color_space: YUVColorSpace,
- color_depth: GLuint,
- src_x: GLint,
- src_y: GLint,
- src_width: GLsizei,
- src_height: GLsizei,
- dst_x: GLint,
- dst_y: GLint,
- dst_width: GLsizei,
- dst_height: GLsizei,
- flip: GLboolean,
- clip_x: GLint,
- clip_y: GLint,
- clip_width: GLsizei,
- clip_height: GLsizei,
);
fn CreateContext() -> *mut c_void;
- fn ReferenceContext(ctx: *mut c_void);
fn DestroyContext(ctx: *mut c_void);
fn MakeCurrent(ctx: *mut c_void);
- fn ReportMemory(size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize) -> usize;
}
-#[derive(Clone, Copy)]
+#[derive(Clone)]
pub struct Context(*mut c_void);
impl Context {
@@ -327,12 +301,6 @@ impl Context {
Context(unsafe { CreateContext() })
}
- pub fn reference(&self) {
- unsafe {
- ReferenceContext(self.0);
- }
- }
-
pub fn destroy(&self) {
unsafe {
DestroyContext(self.0);
@@ -345,56 +313,18 @@ impl Context {
}
}
- pub fn init_default_framebuffer(
- &self,
- x: i32,
- y: i32,
- width: i32,
- height: i32,
- stride: i32,
- buf: *mut c_void,
- ) {
+ pub fn init_default_framebuffer(&self, width: i32, height: i32) {
unsafe {
- InitDefaultFramebuffer(x, y, width, height, stride, buf);
+ InitDefaultFramebuffer(width, height);
}
}
- pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32, i32) {
+ pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32) {
unsafe {
let mut width: i32 = 0;
let mut height: i32 = 0;
- let mut stride: i32 = 0;
- let data_ptr = GetColorBuffer(
- fbo,
- flush as GLboolean,
- &mut width,
- &mut height,
- &mut stride,
- );
- (data_ptr, width, height, stride)
- }
- }
-
- pub fn resolve_framebuffer(&self, fbo: GLuint) {
- unsafe {
- ResolveFramebuffer(fbo);
- }
- }
-
- pub fn clear_color_rect(
- &self,
- fbo: GLuint,
- xoffset: GLint,
- yoffset: GLint,
- width: GLsizei,
- height: GLsizei,
- r: f32,
- g: f32,
- b: f32,
- a: f32,
- ) {
- unsafe {
- ClearColorRect(fbo, xoffset, yoffset, width, height, r, g, b, a);
+ let data_ptr = GetColorBuffer(fbo, flush as GLboolean, &mut width, &mut height);
+ (data_ptr, width, height)
}
}
@@ -404,7 +334,6 @@ impl Context {
internal_format: GLenum,
width: GLsizei,
height: GLsizei,
- stride: GLsizei,
buf: *mut c_void,
min_width: GLsizei,
min_height: GLsizei,
@@ -415,7 +344,6 @@ impl Context {
internal_format,
width,
height,
- stride,
buf,
min_width,
min_height,
@@ -423,37 +351,32 @@ impl Context {
}
}
- pub fn set_texture_parameter(&self, tex: GLuint, pname: GLenum, param: GLint) {
- unsafe {
- SetTextureParameter(tex, pname, param);
- }
- }
-
- pub fn lock_framebuffer(&self, fbo: GLuint) -> Option<LockedResource> {
- unsafe {
- let resource = LockFramebuffer(fbo);
- if resource != ptr::null_mut() {
- Some(LockedResource(resource))
- } else {
- None
- }
- }
- }
-
- pub fn lock_texture(&self, tex: GLuint) -> Option<LockedResource> {
+ pub fn composite(
+ &self,
+ src_id: GLuint,
+ src_x: GLint,
+ src_y: GLint,
+ src_width: GLsizei,
+ src_height: GLint,
+ dst_x: GLint,
+ dst_y: GLint,
+ opaque: bool,
+ flip: bool,
+ ) {
unsafe {
- let resource = LockTexture(tex);
- if resource != ptr::null_mut() {
- Some(LockedResource(resource))
- } else {
- None
- }
+ Composite(
+ src_id,
+ src_x,
+ src_y,
+ src_width,
+ src_height,
+ dst_x,
+ dst_y,
+ opaque as GLboolean,
+ flip as GLboolean,
+ );
}
}
-
- pub fn report_memory(size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize) -> usize {
- unsafe { ReportMemory(size_of_op) }
- }
}
impl From<*mut c_void> for Context {
@@ -488,7 +411,6 @@ fn calculate_length(width: GLsizei, height: GLsizei, format: GLenum, pixel_type:
UNSIGNED_SHORT => 2,
SHORT => 2,
FLOAT => 4,
- UNSIGNED_INT_8_8_8_8_REV => 1,
_ => panic!("unsupported pixel_type for read_pixels: {:?}", pixel_type),
};
@@ -563,8 +485,8 @@ impl Gl for Context {
let u = str::from_utf8(s).unwrap();
const PREFIX: &'static str = "// shader: ";
if let Some(start) = u.find(PREFIX) {
- if let Some(end) = u[start..].find('\n') {
- let name = u[start + PREFIX.len()..start + end].trim();
+ if let Some(end) = u[start ..].find('\n') {
+ let name = u[start + PREFIX.len() .. start + end].trim();
debug!("shader name: {}", name);
unsafe {
let c_string = CString::new(name).unwrap();
@@ -1033,6 +955,7 @@ impl Gl for Context {
panic!();
}
+ // FIXME: Does not verify buffer size -- unsafe!
fn tex_image_3d(
&self,
target: GLenum,
@@ -1046,7 +969,24 @@ impl Gl for Context {
ty: GLenum,
opt_data: Option<&[u8]>,
) {
- panic!();
+ unsafe {
+ let pdata = match opt_data {
+ Some(data) => data.as_ptr() as *const GLvoid,
+ None => ptr::null(),
+ };
+ TexImage3D(
+ target,
+ level,
+ internal_format,
+ width,
+ height,
+ depth,
+ border,
+ format,
+ ty,
+ pdata,
+ );
+ }
}
fn copy_tex_image_2d(
@@ -1091,7 +1031,11 @@ impl Gl for Context {
width: GLsizei,
height: GLsizei,
) {
- panic!();
+ unsafe {
+ CopyTexSubImage3D(
+ target, level, xoffset, yoffset, zoffset, x, y, width, height,
+ );
+ }
}
fn tex_sub_image_2d(
@@ -1173,7 +1117,22 @@ impl Gl for Context {
data: &[u8],
) {
debug!("tex_sub_image_3d");
- panic!();
+ //panic!();
+ unsafe {
+ TexSubImage3D(
+ target,
+ level,
+ xoffset,
+ yoffset,
+ zoffset,
+ width,
+ height,
+ depth,
+ format,
+ ty,
+ data.as_ptr() as *const c_void,
+ );
+ }
}
fn tex_sub_image_3d_pbo(
@@ -1190,7 +1149,21 @@ impl Gl for Context {
ty: GLenum,
offset: usize,
) {
- panic!();
+ unsafe {
+ TexSubImage3D(
+ target,
+ level,
+ xoffset,
+ yoffset,
+ zoffset,
+ width,
+ height,
+ depth,
+ format,
+ ty,
+ offset as *const c_void,
+ );
+ }
}
fn tex_storage_2d(
@@ -1216,7 +1189,10 @@ impl Gl for Context {
height: GLsizei,
depth: GLsizei,
) {
- panic!();
+ //panic!();
+ unsafe {
+ TexStorage3D(target, levels, internal_format, width, height, depth);
+ }
}
fn get_tex_image_into_buffer(
@@ -1376,7 +1352,10 @@ impl Gl for Context {
"framebuffer_texture_layer {} {} {} {} {}",
target, attachment, texture, level, layer
);
- panic!();
+ //panic!();
+ unsafe {
+ FramebufferTextureLayer(target, attachment, texture, level, layer);
+ }
}
fn blit_framebuffer(
@@ -1498,9 +1477,7 @@ impl Gl for Context {
}
fn draw_arrays(&self, mode: GLenum, first: GLint, count: GLsizei) {
- unsafe {
- DrawElementsInstanced(mode, count, NONE, first as GLintptr, 1);
- }
+ panic!();
}
fn draw_arrays_instanced(
@@ -1510,9 +1487,7 @@ impl Gl for Context {
count: GLsizei,
primcount: GLsizei,
) {
- unsafe {
- DrawElementsInstanced(mode, count, NONE, first as GLintptr, primcount);
- }
+ panic!();
}
fn draw_elements(
@@ -1528,7 +1503,13 @@ impl Gl for Context {
);
//panic!();
unsafe {
- DrawElementsInstanced(mode, count, element_type, indices_offset as GLintptr, 1);
+ DrawElementsInstanced(
+ mode,
+ count,
+ element_type,
+ indices_offset as *const c_void,
+ 1,
+ );
}
}
@@ -1550,7 +1531,7 @@ impl Gl for Context {
mode,
count,
element_type,
- indices_offset as GLintptr,
+ indices_offset as *const c_void,
primcount,
);
}
@@ -1843,8 +1824,8 @@ impl Gl for Context {
}
fn get_program_info_log(&self, program: GLuint) -> String {
- debug!("get_program_info_log {}", program);
- String::new()
+ panic!();
+ //String::new()
}
#[inline]
@@ -1854,7 +1835,7 @@ impl Gl for Context {
assert!(!result.is_empty());
//#define GL_LINK_STATUS 0x8B82
if pname == 0x8b82 {
- result[0] = GetLinkStatus(program);
+ result[0] = 1;
}
}
@@ -2118,7 +2099,7 @@ impl Gl for Context {
//ptr::null()
}
- fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) -> GLenum {
+ fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) {
panic!();
}
@@ -2191,7 +2172,7 @@ impl Gl for Context {
// GL_KHR_blend_equation_advanced
fn blend_barrier_khr(&self) {
- // No barrier required, so nothing to do
+ panic!();
}
// GL_CHROMIUM_copy_texture
@@ -2269,158 +2250,4 @@ impl Gl for Context {
) {
unimplemented!("Not supported by SWGL");
}
-
- fn buffer_storage(
- &self,
- target: GLenum,
- size: GLsizeiptr,
- data: *const GLvoid,
- flags: GLbitfield,
- ) {
- unimplemented!("Not supported by SWGL");
- }
-
- fn flush_mapped_buffer_range(&self, target: GLenum, offset: GLintptr, length: GLsizeiptr) {
- unimplemented!("Not supported by SWGL");
- }
-}
-
-/// A resource that is intended for sharing between threads.
-/// Locked resources such as textures or framebuffers will
-/// not allow any further modifications while they remain
-/// locked. The resource will be unlocked when the LockedResource
-/// is dropped.
-pub struct LockedResource(*mut LockedTexture);
-
-unsafe impl Send for LockedResource {}
-unsafe impl Sync for LockedResource {}
-
-#[repr(C)]
-pub enum YUVColorSpace {
- Rec601 = 0,
- Rec709,
- Rec2020,
- Identity,
-}
-
-impl LockedResource {
- /// Composites from a locked resource to another locked resource. The band
- /// offset and height are relative to the destination rectangle and specify
- /// how to clip the composition into the appropriate range for this band.
- pub fn composite(
- &self,
- locked_src: &LockedResource,
- src_x: GLint,
- src_y: GLint,
- src_width: GLsizei,
- src_height: GLsizei,
- dst_x: GLint,
- dst_y: GLint,
- dst_width: GLsizei,
- dst_height: GLsizei,
- opaque: bool,
- flip: bool,
- filter: GLenum,
- clip_x: GLint,
- clip_y: GLint,
- clip_width: GLsizei,
- clip_height: GLsizei,
- ) {
- unsafe {
- Composite(
- self.0,
- locked_src.0,
- src_x,
- src_y,
- src_width,
- src_height,
- dst_x,
- dst_y,
- dst_width,
- dst_height,
- opaque as GLboolean,
- flip as GLboolean,
- filter,
- clip_x,
- clip_y,
- clip_width,
- clip_height,
- );
- }
- }
-
- /// Composites from locked resources representing YUV planes
- pub fn composite_yuv(
- &self,
- locked_y: &LockedResource,
- locked_u: &LockedResource,
- locked_v: &LockedResource,
- color_space: YUVColorSpace,
- color_depth: GLuint,
- src_x: GLint,
- src_y: GLint,
- src_width: GLsizei,
- src_height: GLsizei,
- dst_x: GLint,
- dst_y: GLint,
- dst_width: GLsizei,
- dst_height: GLsizei,
- flip: bool,
- clip_x: GLint,
- clip_y: GLint,
- clip_width: GLsizei,
- clip_height: GLsizei,
- ) {
- unsafe {
- CompositeYUV(
- self.0,
- locked_y.0,
- locked_u.0,
- locked_v.0,
- color_space,
- color_depth,
- src_x,
- src_y,
- src_width,
- src_height,
- dst_x,
- dst_y,
- dst_width,
- dst_height,
- flip as GLboolean,
- clip_x,
- clip_y,
- clip_width,
- clip_height,
- );
- }
- }
-
- /// Get the underlying buffer for a locked resource
- pub fn get_buffer(&self) -> (*mut c_void, i32, i32, i32) {
- unsafe {
- let mut width: i32 = 0;
- let mut height: i32 = 0;
- let mut stride: i32 = 0;
- let data_ptr = GetResourceBuffer(self.0, &mut width, &mut height, &mut stride);
- (data_ptr, width, height, stride)
- }
- }
-}
-
-impl Clone for LockedResource {
- fn clone(&self) -> Self {
- unsafe {
- LockResource(self.0);
- }
- LockedResource(self.0)
- }
-}
-
-impl Drop for LockedResource {
- fn drop(&mut self) {
- unsafe {
- UnlockResource(self.0);
- }
- }
}
diff --git a/third_party/webrender/swgl/src/texture.h b/third_party/webrender/swgl/src/texture.h
index fdace241eb5..0219d078bcf 100644
--- a/third_party/webrender/swgl/src/texture.h
+++ b/third_party/webrender/swgl/src/texture.h
@@ -2,884 +2,19 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-namespace glsl {
-
-using PackedRGBA8 = V16<uint8_t>;
-using WideRGBA8 = V16<uint16_t>;
-using HalfRGBA8 = V8<uint16_t>;
-
-SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); }
-
-template <int N>
-UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) {
- typedef VectorType<uint8_t, N> packed_type;
- // Generic conversions only mask off the low byte without actually clamping
- // like a real pack. First force the word to all 1s if it overflows, and then
- // add on the sign bit to cause it to roll over to 0 if it was negative.
- p = (p | (p > 255)) + (p >> 15);
- return CONVERT(p, packed_type);
-}
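The clamping trick in genericPackWide (force overflowed words to all 1s, then add the sign bit so negative inputs wrap around to 0) is easiest to see per element. The scalar helper below is a standalone illustration, with the vector-style all-ones comparison result spelled out explicitly:

#include <cstdint>

// Scalar model of the saturating pack: clamp a 16-bit intermediate to 0..255
// before truncating to 8 bits.
static uint8_t packWideScalar(uint16_t p) {
  uint16_t overflow = p > 255 ? 0xFFFF : 0;  // vector compares yield all-ones lanes
  p = uint16_t((p | overflow) + (p >> 15));  // sign bit set (negative) wraps to 0
  return uint8_t(p);                         // low byte is now properly clamped
}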
-
-SI PackedRGBA8 pack(WideRGBA8 p) {
-#if USE_SSE2
- return _mm_packus_epi16(lowHalf(p), highHalf(p));
-#elif USE_NEON
- return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p)));
-#else
- return genericPackWide(p);
-#endif
-}
-
-using PackedR8 = V4<uint8_t>;
-using WideR8 = V4<uint16_t>;
-
-SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); }
-
-SI PackedR8 pack(WideR8 p) {
-#if USE_SSE2
- auto m = expand(p);
- auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m));
- return SHUFFLE(r, r, 0, 1, 2, 3);
-#elif USE_NEON
- return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p))));
-#else
- return genericPackWide(p);
-#endif
-}
-
-using PackedRG8 = V8<uint8_t>;
-using WideRG8 = V8<uint16_t>;
-
-SI PackedRG8 pack(WideRG8 p) {
-#if USE_SSE2
- return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p)));
-#elif USE_NEON
- return bit_cast<V8<uint8_t>>(vqmovn_u16(p));
-#else
- return genericPackWide(p);
-#endif
-}
-
-SI I32 clampCoord(I32 coord, int limit, int base = 0) {
-#if USE_SSE2
- return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)),
- _mm_set1_epi32(limit - 1));
-#else
- return clamp(coord, base, limit - 1);
-#endif
-}
-
-SI int clampCoord(int coord, int limit, int base = 0) {
- return min(max(coord, base), limit - 1);
-}
-
-template <typename T, typename S>
-SI T clamp2D(T P, S sampler) {
- return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)};
-}
-
-SI float to_float(uint32_t x) { return x * (1.f / 255.f); }
-
-SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- U32 pixels = {a, b, c, d};
- return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF),
- cast(pixels & 0xFF), cast(pixels >> 24)) *
- (1.0f / 255.0f);
-}
-
-SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) {
- return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y},
- Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w});
-}
-
-SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) {
- return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y},
- I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w});
-}
-
-SI vec4_scalar pixel_to_vec4(uint32_t p) {
- U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24};
- Float f = cast(i) * (1.0f / 255.0f);
- return vec4_scalar(f.x, f.y, f.z, f.w);
-}
-
-template <typename S>
-SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) {
- return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y],
- sampler->buf[offset.z], sampler->buf[offset.w]);
-}
-
-template <typename S>
-vec4 texelFetchRGBA8(S sampler, ivec2 P) {
- I32 offset = P.x + P.y * sampler->stride;
- return fetchOffsetsRGBA8(sampler, offset);
-}
-
-template <typename S>
-SI Float fetchOffsetsR8(S sampler, I32 offset) {
- U32 i = {
- ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y],
- ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]};
- return cast(i) * (1.0f / 255.0f);
-}
-
-template <typename S>
-vec4 texelFetchR8(S sampler, ivec2 P) {
- I32 offset = P.x + P.y * sampler->stride;
- return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f);
-}
-
-template <typename S>
-SI vec4 fetchOffsetsRG8(S sampler, I32 offset) {
- uint16_t* buf = (uint16_t*)sampler->buf;
- U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]};
- Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f);
- Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f);
- return vec4(r, g, 0.0f, 1.0f);
-}
-
-template <typename S>
-vec4 texelFetchRG8(S sampler, ivec2 P) {
- I32 offset = P.x + P.y * sampler->stride;
- return fetchOffsetsRG8(sampler, offset);
-}
-
template <typename S>
-SI Float fetchOffsetsR16(S sampler, I32 offset) {
- U32 i = {
- ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y],
- ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]};
- return cast(i) * (1.0f / 65535.0f);
-}
-
-template <typename S>
-vec4 texelFetchR16(S sampler, ivec2 P) {
- I32 offset = P.x + P.y * sampler->stride;
- return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f);
-}
-
-template <typename S>
-SI vec4 fetchOffsetsFloat(S sampler, I32 offset) {
- return pixel_float_to_vec4(
- *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y],
- *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]);
-}
-
-vec4 texelFetchFloat(sampler2D sampler, ivec2 P) {
- I32 offset = P.x * 4 + P.y * sampler->stride;
- return fetchOffsetsFloat(sampler, offset);
-}
-
-template <typename S>
-SI vec4 fetchOffsetsYUV422(S sampler, I32 offset) {
- // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
- // Offset is aligned to a chunk rather than a pixel, and selector specifies
- // pixel within the chunk.
- I32 selector = offset & 1;
- offset &= ~1;
- uint16_t* buf = (uint16_t*)sampler->buf;
- U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y],
- *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]};
- Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f);
- Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f);
- Float g =
- CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) *
- (1.0f / 255.0f);
- return vec4(r, g, b, 1.0f);
-}
-
-template <typename S>
-vec4 texelFetchYUV422(S sampler, ivec2 P) {
- I32 offset = P.x + P.y * sampler->stride;
- return fetchOffsetsYUV422(sampler, offset);
-}
-
-vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- switch (sampler->format) {
- case TextureFormat::RGBA32F:
- return texelFetchFloat(sampler, P);
- case TextureFormat::RGBA8:
- return texelFetchRGBA8(sampler, P);
- case TextureFormat::R8:
- return texelFetchR8(sampler, P);
- case TextureFormat::RG8:
- return texelFetchRG8(sampler, P);
- case TextureFormat::R16:
- return texelFetchR16(sampler, P);
- case TextureFormat::YUV422:
- return texelFetchYUV422(sampler, P);
- default:
- assert(false);
- return vec4();
- }
-}
-
-vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RGBA32F);
- return texelFetchFloat(sampler, P);
-}
-
-vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
+static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i, int zoffset) {
assert(sampler->format == TextureFormat::RGBA8);
- return texelFetchRGBA8(sampler, P);
-}
-
-vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::R8);
- return texelFetchR8(sampler, P);
-}
-
-vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RG8);
- return texelFetchRG8(sampler, P);
-}
-
-vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- if (sampler->format == TextureFormat::RGBA32F) {
- return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
- } else {
- assert(sampler->format == TextureFormat::RGBA8);
- return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
- }
-}
-
-vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RGBA32F);
- return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
-}
-
-vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RGBA8);
- return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]);
-}
-
-vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::R8);
- return vec4_scalar{
- to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f,
- 0.0f, 1.0f};
-}
-
-vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RG8);
- uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride];
- return vec4_scalar{to_float(pixel & 0xFF), to_float(pixel >> 8), 0.0f, 1.0f};
-}
-
-vec4 texelFetch(sampler2DRect sampler, ivec2 P) {
- P = clamp2D(P, sampler);
- switch (sampler->format) {
- case TextureFormat::RGBA8:
- return texelFetchRGBA8(sampler, P);
- case TextureFormat::R8:
- return texelFetchR8(sampler, P);
- case TextureFormat::RG8:
- return texelFetchRG8(sampler, P);
- case TextureFormat::R16:
- return texelFetchR16(sampler, P);
- case TextureFormat::YUV422:
- return texelFetchYUV422(sampler, P);
- default:
- assert(false);
- return vec4();
- }
-}
-
-template <typename S>
-SI ivec4 fetchOffsetsInt(S sampler, I32 offset) {
- return pixel_int_to_ivec4(
- *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y],
- *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]);
-}
-
-ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RGBA32I);
- I32 offset = P.x * 4 + P.y * sampler->stride;
- return fetchOffsetsInt(sampler, offset);
-}
-
-ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) {
- assert(lod == 0);
- P = clamp2D(P, sampler);
- assert(sampler->format == TextureFormat::RGBA32I);
- return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
-}
-
-SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x,
- int max_x, int min_y, int max_y) {
- P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x);
- P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y);
- assert(sampler->format == TextureFormat::RGBA32F);
- return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
-}
-
-SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x,
- int max_x, int min_y, int max_y) {
- P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x);
- P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y);
- assert(sampler->format == TextureFormat::RGBA32I);
- return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride];
-}
-
-template <typename S>
-SI I32 texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x, int min_y,
- int max_y) {
- P.x = clampCoord(P.x, int(sampler->width) - max_x, -min_x);
- P.y = clampCoord(P.y, int(sampler->height) - max_y, -min_y);
- return P.x * 4 + P.y * sampler->stride;
-}
-
-template <typename S, typename P>
-SI P texelFetchUnchecked(S sampler, P* ptr, int x, int y = 0) {
- return ptr[x + y * (sampler->stride >> 2)];
-}
-
-SI vec4 texelFetchUnchecked(sampler2D sampler, I32 offset, int x, int y = 0) {
- assert(sampler->format == TextureFormat::RGBA32F);
- return fetchOffsetsFloat(sampler, offset + (x * 4 + y * sampler->stride));
-}
-
-SI ivec4 texelFetchUnchecked(isampler2D sampler, I32 offset, int x, int y = 0) {
- assert(sampler->format == TextureFormat::RGBA32I);
- return fetchOffsetsInt(sampler, offset + (x * 4 + y * sampler->stride));
-}
-
-#define texelFetchOffset(sampler, P, lod, offset) \
- texelFetch(sampler, (P) + (offset), lod)
-
-// Scale texture coords for quantization, subtract offset for filtering
-// (assuming coords already offset to texel centers), and round to nearest
-// 1/scale increment
-template <typename T>
-SI T linearQuantize(T P, float scale) {
- return P * scale + (0.5f - 0.5f * scale);
-}
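As a concrete example of the quantization above (illustrative numbers only): with scale = 128, a coordinate sitting on a texel center maps to a multiple of 128, so the >> 7 in the sampling routines that consume it selects that texel and the low 7 bits give a zero interpolation fraction.

// Scalar model of linearQuantize for one coordinate already in texel units.
static float linearQuantizeScalar(float p, float scale) {
  return p * scale + (0.5f - 0.5f * scale);
}
// e.g. linearQuantizeScalar(10.5f, 128.0f) == 1280.5f: truncation gives 1280,
// 1280 >> 7 == 10 (the texel index) and 1280 & 0x7F == 0 (no blend toward 11).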
-
-// Helper version that also scales normalized texture coords for sampler
-template <typename T, typename S>
-SI T samplerScale(S sampler, T P) {
- P.x *= sampler->width;
- P.y *= sampler->height;
- return P;
-}
-
-template <typename T>
-SI T samplerScale(UNUSED sampler2DRect sampler, T P) {
- return P;
-}
-
-template <typename T, typename S>
-SI T linearQuantize(T P, float scale, S sampler) {
- return linearQuantize(samplerScale(sampler, P), scale);
-}
-
-// Compute clamped offset of first row for linear interpolation
-template <typename S, typename I>
-SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) {
- return clampCoord(i.x, sampler->width - margin) +
- clampCoord(i.y, sampler->height) * sampler->stride;
-}
-
-// Compute clamped offset of second row for linear interpolation from first row
-template <typename S, typename I>
-SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) {
- return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1,
- sampler->stride, 0);
-}
-
-// Convert X coordinate to a 2^7 scale fraction for interpolation
-template <typename S>
-SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) {
- auto overread = i.x > int32_t(sampler->width) - 2;
- return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16);
-}
-
-// Convert Y coordinate to a 2^7 scale fraction for interpolation
-SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); }
-SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); }
-
-struct WidePlanarRGBA8 {
- V8<uint16_t> rg;
- V8<uint16_t> ba;
-};
-
-template <typename S>
-SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::RGBA8);
-
- ivec2 frac = i;
- i >>= 7;
-
- I32 row0 = computeRow(sampler, i);
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
- I16 fracx = computeFracX(sampler, i, frac);
- I16 fracy = computeFracY(frac);
-
- auto a0 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
- auto a1 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>);
- a0 += ((a1 - a0) * fracy.x) >> 7;
-
- auto b0 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>);
- auto b1 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>);
- b0 += ((b1 - b0) * fracy.y) >> 7;
-
- auto abl = zipLow(a0, b0);
- auto abh = zipHigh(a0, b0);
- abl += ((abh - abl) * fracx.xyxyxyxy) >> 7;
-
- auto c0 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>);
- auto c1 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>);
- c0 += ((c1 - c0) * fracy.z) >> 7;
-
- auto d0 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>);
- auto d1 =
- CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>);
- d0 += ((d1 - d0) * fracy.w) >> 7;
-
- auto cdl = zipLow(c0, d0);
- auto cdh = zipHigh(c0, d0);
- cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7;
-
- auto rg = V8<uint16_t>(zip2Low(abl, cdl));
- auto ba = V8<uint16_t>(zip2High(abl, cdl));
- return WidePlanarRGBA8{rg, ba};
-}
-
-template <typename S>
-vec4 textureLinearRGBA8(S sampler, vec2 P) {
- ivec2 i(linearQuantize(P, 128, sampler));
- auto planar = textureLinearPlanarRGBA8(sampler, i);
- auto rg = CONVERT(planar.rg, V8<float>);
- auto ba = CONVERT(planar.ba, V8<float>);
- auto r = lowHalf(rg);
- auto g = highHalf(rg);
- auto b = lowHalf(ba);
- auto a = highHalf(ba);
- return vec4(b, g, r, a) * (1.0f / 255.0f);
-}
-
-template <typename S>
-static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::R8);
- ivec2 frac = i;
- i >>= 7;
-
- I32 row0 = computeRow(sampler, i);
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
- I16 fracx = computeFracX(sampler, i, frac);
- I16 fracy = computeFracY(frac);
-
- uint8_t* buf = (uint8_t*)sampler->buf;
- auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]);
- auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]);
- auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]);
- auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]);
- auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);
-
- auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]);
- auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]);
- auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]);
- auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]);
- auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);
-
- abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7;
-
- abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
- auto abcdl = lowHalf(abcd0);
- auto abcdh = highHalf(abcd0);
- abcdl += ((abcdh - abcdl) * fracx) >> 7;
-
- return U16(abcdl);
-}
-
-template <typename S>
-vec4 textureLinearR8(S sampler, vec2 P) {
- assert(sampler->format == TextureFormat::R8);
-
- ivec2 i(linearQuantize(P, 128, sampler));
- Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float);
- return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f);
-}
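The R8 path above performs the whole bilinear filter in 7-bit fixed point. A standalone scalar equivalent of the same a + (((b - a) * frac) >> 7) blend is sketched below; the sampler layout (tex, stride, w, h) and the simple coordinate clamping are illustrative, not SWGL's actual sampler struct or edge handling.

#include <cstdint>

// fx/fy are coordinates in 1/128ths of a texel, already offset to texel
// centers as linearQuantize produces.
static uint8_t bilinearR8(const uint8_t* tex, int stride, int w, int h,
                          int fx, int fy) {
  int x = fx >> 7, y = fy >> 7;
  int fracx = fx & 0x7F, fracy = fy & 0x7F;
  auto clampc = [](int v, int limit) {
    return v < 0 ? 0 : (v >= limit ? limit - 1 : v);
  };
  int x0 = clampc(x, w), x1 = clampc(x + 1, w);
  int y0 = clampc(y, h), y1 = clampc(y + 1, h);
  int a = tex[y0 * stride + x0], b = tex[y0 * stride + x1];
  int c = tex[y1 * stride + x0], d = tex[y1 * stride + x1];
  int top = a + (((b - a) * fracx) >> 7);  // blend columns of the first row
  int bot = c + (((d - c) * fracx) >> 7);  // blend columns of the second row
  return uint8_t(top + (((bot - top) * fracy) >> 7));  // blend the two rows
}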
-
-struct WidePlanarRG8 {
- V8<uint16_t> rg;
-};
-
-template <typename S>
-SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::RG8);
-
- ivec2 frac = i;
- i >>= 7;
-
- I32 row0 = computeRow(sampler, i);
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
- I16 fracx = computeFracX(sampler, i, frac);
- I16 fracy = computeFracY(frac);
-
- uint16_t* buf = (uint16_t*)sampler->buf;
-
- // Load RG bytes for two adjacent pixels - rgRG
- auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
- auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
- auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
- // Load two pixels for next row
- auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
- auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
- auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
- // Blend rows
- ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;
-
- auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
- auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
- auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
- auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
- auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
- auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
- // Blend rows
- cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;
-
- // ab = a.rgRG,b.rgRG
- // cd = c.rgRG,d.rgRG
- // ... ac = ar,cr,ag,cg,aR,cR,aG,cG
- // ... bd = br,dr,bg,dg,bR,dR,bG,dG
- auto ac = zipLow(ab0, cd0);
- auto bd = zipHigh(ab0, cd0);
- // ar,br,cr,dr,ag,bg,cg,dg
- // aR,bR,cR,dR,aG,bG,cG,dG
- auto abcdl = zipLow(ac, bd);
- auto abcdh = zipHigh(ac, bd);
- // Blend columns
- abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7;
-
- auto rg = V8<uint16_t>(abcdl);
- return WidePlanarRG8{rg};
-}
-
-template <typename S>
-vec4 textureLinearRG8(S sampler, vec2 P) {
- ivec2 i(linearQuantize(P, 128, sampler));
- auto planar = textureLinearPlanarRG8(sampler, i);
- auto rg = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f);
- auto r = lowHalf(rg);
- auto g = highHalf(rg);
- return vec4(r, g, 0.0f, 1.0f);
-}
-
-// Samples R16 texture with linear filtering and returns results packed as
-// signed I16. One bit of precision is shifted away from the bottom end to
-// accommodate the sign bit, so only 15 bits of precision is left.
-template <typename S>
-static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::R16);
-
- ivec2 frac = i;
+ ivec2 frac = i & 0x7F;
i >>= 7;
- I32 row0 = computeRow(sampler, i);
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
-
+ I32 row0 = clampCoord(i.x, sampler->width) +
+ clampCoord(i.y, sampler->height) * sampler->stride + zoffset;
+ I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) &
+ I32(sampler->stride));
I16 fracx =
- CONVERT(
- ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F,
- I16)
- << 8;
- I16 fracy = computeFracY(frac) << 8;
-
- // Sample the 16 bit data for both rows
- uint16_t* buf = (uint16_t*)sampler->buf;
- auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]);
- auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]);
- auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]);
- auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]);
- auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>);
-
- auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]);
- auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]);
- auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]);
- auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]);
- auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>);
-
- // The samples occupy 15 bits and the fraction occupies 15 bits, so that when
- // they are multiplied together, the new scaled sample will fit in the high
- // 14 bits of the result. It is left shifted once to make it 15 bits again
- // for the final multiply.
-#if USE_SSE2
- abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww))
- << 1;
-#elif USE_NEON
- // NEON has a convenient instruction that does both the multiply and the
- // doubling, so doesn't need an extra shift.
- abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww));
-#else
- abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) *
- CONVERT(fracy.xxyyzzww, V8<int32_t>)) >>
- 16,
- V8<int16_t>)
- << 1;
-#endif
-
- abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7);
- auto abcdl = lowHalf(abcd0);
- auto abcdh = highHalf(abcd0);
-#if USE_SSE2
- abcdl += lowHalf(bit_cast<V8<int16_t>>(
- _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx))))
- << 1;
-#elif USE_NEON
- abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx));
-#else
- abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) *
- CONVERT(fracx, V4<int32_t>)) >>
- 16,
- V4<int16_t>)
- << 1;
-#endif
-
- return abcdl;
-}
-
-template <typename S>
-vec4 textureLinearR16(S sampler, vec2 P) {
- assert(sampler->format == TextureFormat::R16);
-
- ivec2 i(linearQuantize(P, 128, sampler));
- Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float);
- return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f);
-}
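The 15-bit blend described in the comments of textureLinearUnpackedR16 (samples shifted down to 15 bits, the 7-bit fraction shifted up to 15 bits, then a high-half multiply followed by one left shift) has a simple scalar model; mulhi16 below stands in for _mm_mulhi_epi16, and the layout is illustrative rather than SWGL's vector code.

#include <cstdint>

static int16_t mulhi16(int16_t a, int16_t b) {
  return int16_t((int32_t(a) * int32_t(b)) >> 16);  // high half of a 16x16 multiply
}

// Blend two 16-bit samples with a 7-bit fraction at 15 bits of precision.
static int16_t blendR16(uint16_t s0, uint16_t s1, int frac /* 0..127 */) {
  int16_t a = int16_t(s0 >> 1);    // drop one bit to make room for the sign
  int16_t b = int16_t(s1 >> 1);
  int16_t f = int16_t(frac << 8);  // scale the fraction up to 15 bits
  return int16_t(a + (mulhi16(int16_t(b - a), f) << 1));
}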
-
-using PackedRGBA32F = V16<float>;
-using WideRGBA32F = V16<float>;
-
-template <typename S>
-vec4 textureLinearRGBA32F(S sampler, vec2 P) {
- assert(sampler->format == TextureFormat::RGBA32F);
- P = samplerScale(sampler, P);
- P -= 0.5f;
- vec2 f = floor(P);
- vec2 r = P - f;
- ivec2 i(f);
- ivec2 c(clampCoord(i.x, sampler->width - 1),
- clampCoord(i.y, sampler->height));
- r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0),
- 0.0f);
- I32 offset0 = c.x * 4 + c.y * sampler->stride;
- I32 offset1 = offset0 + computeNextRowOffset(sampler, i);
-
- Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x],
- *(Float*)&sampler->buf[offset0.x + 4], r.x),
- mix(*(Float*)&sampler->buf[offset1.x],
- *(Float*)&sampler->buf[offset1.x + 4], r.x),
- r.y);
- Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y],
- *(Float*)&sampler->buf[offset0.y + 4], r.x),
- mix(*(Float*)&sampler->buf[offset1.y],
- *(Float*)&sampler->buf[offset1.y + 4], r.x),
- r.y);
- Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z],
- *(Float*)&sampler->buf[offset0.z + 4], r.x),
- mix(*(Float*)&sampler->buf[offset1.z],
- *(Float*)&sampler->buf[offset1.z + 4], r.x),
- r.y);
- Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w],
- *(Float*)&sampler->buf[offset0.w + 4], r.x),
- mix(*(Float*)&sampler->buf[offset1.w],
- *(Float*)&sampler->buf[offset1.w + 4], r.x),
- r.y);
- return pixel_float_to_vec4(c0, c1, c2, c3);
-}
-
-struct WidePlanarYUV8 {
- U16 y;
- U16 u;
- U16 v;
-};
-
-template <typename S>
-SI WidePlanarYUV8 textureLinearPlanarYUV422(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::YUV422);
-
- ivec2 frac = i;
- i >>= 7;
-
- I32 row0 = computeRow(sampler, i, 2);
- // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R.
- // Get the selector for the pixel within the chunk.
- I32 selector = row0 & 1;
- // Align the row index to the chunk.
- row0 &= ~1;
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
- // G only needs to be clamped to a pixel boundary for safe interpolation,
- // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk
- // boundary.
- frac.x &= (i.x >= 0);
- auto fracx =
- CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3),
- (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) &
- 0x7F,
- V8<int16_t>);
- I16 fracy = computeFracY(frac);
-
- uint16_t* buf = (uint16_t*)sampler->buf;
-
- // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R
- // We always need to interpolate between (b,r) and (B,R).
- // Depending on selector we need to either interpolate between g0 and g1
- // or between g1 and G0. So for now we just interpolate both cases for g
- // and will select the appropriate one on output.
- auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>);
- auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>);
- // Combine with next row.
- a0 += ((a1 - a0) * fracy.x) >> 7;
-
- auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>);
- auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>);
- b0 += ((b1 - b0) * fracy.y) >> 7;
-
- auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>);
- auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>);
- c0 += ((c1 - c0) * fracy.z) >> 7;
-
- auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>);
- auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>);
- d0 += ((d1 - d0) * fracy.w) >> 7;
-
- // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and
- // g1,g1,g1,g1,r,r,r,r.
- auto abl = zipLow(a0, b0);
- auto cdl = zipLow(c0, d0);
- auto g0b = zip2Low(abl, cdl);
- auto g1r = zip2High(abl, cdl);
-
- // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and
- // shifts, just shuffle here instead... We finally end up with
- // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R.
- auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15);
- auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15);
- auto g1B = zip2Low(abh, cdh);
- auto G0R = zip2High(abh, cdh);
-
- // Finally interpolate between adjacent columns.
- g0b += ((g1B - g0b) * fracx) >> 7;
- g1r += ((G0R - g1r) * fracx) >> 7;
-
- // Choose either g0 or g1 based on selector.
- return WidePlanarYUV8{
- U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))),
- U16(highHalf(g0b)), U16(highHalf(g1r))};
-}
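Stripped of the SIMD, the chunk handling above reduces to: each 4-byte chunk {G0, B, G1, R} holds two luma samples that share one chroma pair, and the low bit of the pixel index (the selector) picks G0 or G1. A nearest-sample scalar sketch of that layout (fetchYUV422 and the Y/U/V naming are illustrative; the real code also interpolates between neighboring samples and rows before selecting):

    #include <cstdint>
    #include <cstdio>

    struct YUV8 { uint8_t y, u, v; };

    // Each 4-byte chunk covers two pixels: {G0, B, G1, R}. Both pixels share
    // B and R; the pixel's parity selects G0 or G1.
    static YUV8 fetchYUV422(const uint8_t* row, int x) {
      const uint8_t* chunk = row + (x & ~1) * 2;  // 2 bytes/pixel, chunk-aligned
      return YUV8{(x & 1) ? chunk[2] : chunk[0],  // luma: G1 or G0
                  chunk[1],                       // shared B
                  chunk[3]};                      // shared R
    }

    int main() {
      uint8_t row[8] = {10, 128, 20, 130, 30, 126, 40, 124};  // two chunks
      for (int x = 0; x < 4; x++) {
        YUV8 p = fetchYUV422(row, x);
        std::printf("x=%d y=%d u=%d v=%d\n", x, p.y, p.u, p.v);
      }
    }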
-
-template <typename S>
-vec4 textureLinearYUV422(S sampler, vec2 P) {
- ivec2 i(linearQuantize(P, 128, sampler));
- auto planar = textureLinearPlanarYUV422(sampler, i);
- auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f);
- auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f);
- auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f);
- return vec4(v, y, u, 1.0f);
-}
-
-SI vec4 texture(sampler2D sampler, vec2 P) {
- if (sampler->filter == TextureFilter::LINEAR) {
- switch (sampler->format) {
- case TextureFormat::RGBA32F:
- return textureLinearRGBA32F(sampler, P);
- case TextureFormat::RGBA8:
- return textureLinearRGBA8(sampler, P);
- case TextureFormat::R8:
- return textureLinearR8(sampler, P);
- case TextureFormat::RG8:
- return textureLinearRG8(sampler, P);
- case TextureFormat::R16:
- return textureLinearR16(sampler, P);
- case TextureFormat::YUV422:
- return textureLinearYUV422(sampler, P);
- default:
- assert(false);
- return vec4();
- }
- } else {
- ivec2 coord(roundzero(P.x, sampler->width),
- roundzero(P.y, sampler->height));
- return texelFetch(sampler, coord, 0);
- }
-}
-
-vec4 texture(sampler2DRect sampler, vec2 P) {
- if (sampler->filter == TextureFilter::LINEAR) {
- switch (sampler->format) {
- case TextureFormat::RGBA8:
- return textureLinearRGBA8(sampler, P);
- case TextureFormat::R8:
- return textureLinearR8(sampler, P);
- case TextureFormat::RG8:
- return textureLinearRG8(sampler, P);
- case TextureFormat::R16:
- return textureLinearR16(sampler, P);
- case TextureFormat::YUV422:
- return textureLinearYUV422(sampler, P);
- default:
- assert(false);
- return vec4();
- }
- } else {
- ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f));
- return texelFetch(sampler, coord);
- }
-}
-
-template <typename S>
-vec4_scalar texture(S sampler, vec2_scalar P) {
- return force_scalar(texture(sampler, vec2(P)));
-}
-
-ivec2_scalar textureSize(sampler2D sampler, int) {
- return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
-}
-
-ivec2_scalar textureSize(sampler2DRect sampler) {
- return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)};
-}
-
-template <typename S>
-static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::RGBA8);
- ivec2 frac = i;
- i >>= 7;
-
- I32 row0 = computeRow(sampler, i);
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
- I16 fracx = computeFracX(sampler, i, frac);
- I16 fracy = computeFracY(frac);
+ CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16);
+ I16 fracy = CONVERT(frac.y, I16);
auto a0 =
CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>);
@@ -913,233 +48,80 @@ static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) {
auto cdh = combine(highHalf(c0), highHalf(d0));
cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7;
- return combine(HalfRGBA8(abl), HalfRGBA8(cdl));
+ return pack(combine(HalfRGBA8(abl), HalfRGBA8(cdl)));
}
template <typename S>
-static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) {
- return pack(textureLinearUnpackedRGBA8(sampler, i));
+static inline void textureLinearCommit4(S sampler, ivec2 i, int zoffset,
+ uint32_t* buf) {
+ commit_span(buf, textureLinearPackedRGBA8(sampler, i, zoffset));
}
template <typename S>
-static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) {
+static void textureLinearCommit8(S sampler, ivec2_scalar i, int zoffset,
+ uint32_t* buf) {
assert(sampler->format == TextureFormat::RGBA8);
- I32 row = computeRow(sampler, i, 0);
- return combine(unaligned_load<V4<uint8_t>>(&sampler->buf[row.x]),
- unaligned_load<V4<uint8_t>>(&sampler->buf[row.y]),
- unaligned_load<V4<uint8_t>>(&sampler->buf[row.z]),
- unaligned_load<V4<uint8_t>>(&sampler->buf[row.w]));
-}
-
-template <typename S>
-static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) {
- return pack(textureLinearUnpackedR8(sampler, i));
-}
-
-template <typename S>
-static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) {
- assert(sampler->format == TextureFormat::RG8);
- ivec2 frac = i & 0x7F;
+ ivec2_scalar frac = i & 0x7F;
i >>= 7;
- I32 row0 = computeRow(sampler, i);
- I32 row1 = row0 + computeNextRowOffset(sampler, i);
- I16 fracx = computeFracX(sampler, i, frac);
- I16 fracy = computeFracY(frac);
-
- uint16_t* buf = (uint16_t*)sampler->buf;
-
- // Load RG bytes for two adjacent pixels - rgRG
- auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]);
- auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]);
- auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>);
- // Load two pixels for next row
- auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]);
- auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]);
- auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>);
- // Blend rows
- ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7;
-
- auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]);
- auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]);
- auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>);
- auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]);
- auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]);
- auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>);
- // Blend rows
- cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7;
-
- // ab = a.rgRG,b.rgRG
- // cd = c.rgRG,d.rgRG
- // ... ac = a.rg,c.rg,a.RG,c.RG
- // ... bd = b.rg,d.rg,b.RG,d.RG
- auto ac = zip2Low(ab0, cd0);
- auto bd = zip2High(ab0, cd0);
- // a.rg,b.rg,c.rg,d.rg
- // a.RG,b.RG,c.RG,d.RG
- auto abcdl = zip2Low(ac, bd);
- auto abcdh = zip2High(ac, bd);
- // Blend columns
- abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7;
-
- return WideRG8(abcdl);
-}
-
-template <typename S>
-static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) {
- return pack(textureLinearUnpackedRG8(sampler, i));
-}
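The RG8 path, like the other linear samplers here, blends with one fixed-point idiom: for a 7-bit fraction f in [0, 128), a + (((b - a) * f) >> 7) approximates mix(a, b, f / 128) entirely in 16-bit integer math, which is what every "Blend rows" / "Blend columns" step above does per lane. A scalar sketch (lerp7 is an illustrative name):

    #include <cstdint>
    #include <cstdio>

    // The >> 7 lerp used throughout: f is a 7-bit fraction, so f/128 plays t.
    static int16_t lerp7(int16_t a, int16_t b, int16_t f) {
      return int16_t(a + (((b - a) * f) >> 7));
    }

    int main() {
      int16_t a = 40, b = 200, f = 96;  // t = 96/128 = 0.75
      std::printf("fixed: %d  float: %f\n", lerp7(a, b, f),
                  a + (b - a) * (f / 128.0f));  // 160 vs 160.000000
    }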
-
-template <int N>
-static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x,
- VectorType<uint16_t, N> y) {
- auto r = x + y;
- return r | (r < x);
-}
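addsat() above leans on unsigned wraparound: r = x + y wraps exactly when r < x, and with the GCC/clang vector types that comparison yields an all-ones lane, so OR-ing it in clamps the overflowed lane to 0xFFFF. A scalar sketch of the same trick:

    #include <cstdint>
    #include <cstdio>

    // Scalar addsat(): the comparison stands in for the all-ones lane mask.
    static uint16_t addsat16(uint16_t x, uint16_t y) {
      uint16_t r = uint16_t(x + y);
      uint16_t wrapped = uint16_t(r < x ? 0xFFFFu : 0u);
      return uint16_t(r | wrapped);
    }

    int main() {
      std::printf("%d %d\n", addsat16(60000, 10000), addsat16(100, 200));
      // prints: 65535 300
    }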
-
-template <typename P, typename S>
-static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal(
- S sampler, const ivec2_scalar& i, int minX, int maxX, int radius,
- float coeff, float coeffStep) {
- // Packed and unpacked vectors for a chunk of the given pixel type.
- typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
- typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
-
- // Pre-scale the coefficient by 8 bits of fractional precision, so that when
- // the sample is multiplied by it, it will yield a 16 bit unsigned integer
- // that will use all 16 bits of precision to accumulate the sum.
- coeff *= 1 << 8;
- float coeffStep2 = coeffStep * coeffStep;
-
- int row = computeRow(sampler, i);
- P* buf = (P*)sampler->buf;
- auto pixelsRight = unaligned_load<V4<P>>(&buf[row]);
- auto pixelsLeft = pixelsRight;
- auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) *
- uint16_t(coeff + 0.5f);
-
- // Here we use some trickery to reuse the pixels within a chunk, shifted over
- // by one pixel, to get the next sample for the entire chunk. This allows us
- // to sample only one pixel for each offset across the entire chunk in both
- // the left and right directions. To avoid clamping within the loop to the
- // texture bounds, we compute the valid radius that doesn't require clamping
- // and fall back to a slower clamping loop outside of that valid radius.
- int offset = 1;
- // The left bound is how much we can offset the sample before the start of
- // the row bounds.
- int leftBound = i.x - max(minX, 0);
- // The right bound is how much we can offset the sample before the end of the
- // row bounds.
- int rightBound = min(maxX, sampler->width - 1) - i.x;
- int validRadius = min(radius, min(leftBound, rightBound - (4 - 1)));
- for (; offset <= validRadius; offset++) {
- // Overwrite the pixel that needs to be shifted out with the new pixel, and
- // shift it into the correct location.
- pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]);
- pixelsRight = pixelsRight.yzwx;
- pixelsLeft = pixelsLeft.wxyz;
- pixelsLeft.x = unaligned_load<P>(&buf[row - offset]);
-
- // Accumulate the Gaussian coefficients step-wise.
- coeff *= coeffStep;
- coeffStep *= coeffStep2;
-
- // Both left and right samples at this offset use the same coefficient.
- sum = addsat(sum,
- (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
- CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
- uint16_t(coeff + 0.5f));
- }
-
- for (; offset <= radius; offset++) {
- pixelsRight.x =
- unaligned_load<P>(&buf[row + min(offset + 4 - 1, rightBound)]);
- pixelsRight = pixelsRight.yzwx;
- pixelsLeft = pixelsLeft.wxyz;
- pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]);
-
- coeff *= coeffStep;
- coeffStep *= coeffStep2;
-
- sum = addsat(sum,
- (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) +
- CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) *
- uint16_t(coeff + 0.5f));
- }
-
- // Shift away the intermediate precision.
- return sum >> 8;
-}
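The coeff / coeffStep / coeffStep2 recurrence in the loops above evaluates the Gaussian weights incrementally: multiplying by s, then s^3, then s^5, ... accumulates s^(k^2), because k^2 is the sum of the first k odd numbers. Assuming the caller seeds coeffStep with exp(-1/(2*sigma^2)) (the caller is not part of this diff), the loop reproduces exp(-k^2/(2*sigma^2)) without an exp() per tap. A small sketch checking the recurrence against direct evaluation:

    #include <cmath>
    #include <cstdio>

    int main() {
      float sigma = 3.0f;
      float coeff = 1.0f;  // weight for offset 0
      float step = std::exp(-1.0f / (2.0f * sigma * sigma));  // assumed seed
      float step2 = step * step;
      for (int k = 1; k <= 4; k++) {
        coeff *= step;     // now s^(k*k) = exp(-k*k / (2*sigma^2))
        step *= step2;     // next odd power of the seed
        float direct = std::exp(-float(k * k) / (2.0f * sigma * sigma));
        std::printf("k=%d incremental=%f direct=%f\n", k, coeff, direct);
      }
    }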
-
-template <typename P, typename S>
-static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical(
- S sampler, const ivec2_scalar& i, int minY, int maxY, int radius,
- float coeff, float coeffStep) {
- // Packed and unpacked vectors for a chunk of the given pixel type.
- typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type;
- typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type;
-
- // Pre-scale the coefficient by 8 bits of fractional precision, so that when
- // the sample is multiplied by it, it will yield a 16 bit unsigned integer
- // that will use all 16 bits of precision to accumulate the sum.
- coeff *= 1 << 8;
- float coeffStep2 = coeffStep * coeffStep;
-
- int rowAbove = computeRow(sampler, i);
- int rowBelow = rowAbove;
- P* buf = (P*)sampler->buf;
- auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]);
- auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) *
- uint16_t(coeff + 0.5f);
-
- // For the vertical loop we can't be quite as creative with reusing old values
- // as we were in the horizontal loop. We just do the obvious implementation of
- // loading a chunk from each row in turn and accumulating it into the sum. We
- // compute a valid radius within which we don't need to clamp the sampled row
- // and use that to avoid any clamping in the main inner loop. We fall back to
- // a slower clamping loop outside of that valid radius.
- int offset = 1;
- int belowBound = i.y - max(minY, 0);
- int aboveBound = min(maxY, sampler->height - 1) - i.y;
- int validRadius = min(radius, min(belowBound, aboveBound));
- for (; offset <= validRadius; offset++) {
- rowAbove += sampler->stride;
- rowBelow -= sampler->stride;
- auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
- auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);
-
- // Accumulate the Gaussian coefficients step-wise.
- coeff *= coeffStep;
- coeffStep *= coeffStep2;
-
- // Both above and below samples at this offset use the same coefficient.
- sum = addsat(sum,
- (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
- CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
- uint16_t(coeff + 0.5f));
+ uint32_t* row0 =
+ &sampler
+ ->buf[clampCoord(i.x, sampler->width) +
+ clampCoord(i.y, sampler->height) * sampler->stride + zoffset];
+ uint32_t* row1 =
+ row0 +
+ ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) ? sampler->stride : 0);
+ int16_t fracx = i.x >= 0 && i.x < int32_t(sampler->width) - 1 ? frac.x : 0;
+ int16_t fracy = frac.y;
+
+ U32 pix0 = unaligned_load<U32>(row0);
+ U32 pix0n = unaligned_load<U32>(row0 + 4);
+ uint32_t pix0x = row0[8];
+ U32 pix1 = unaligned_load<U32>(row1);
+ U32 pix1n = unaligned_load<U32>(row1 + 4);
+ uint32_t pix1x = row1[8];
+
+ {
+ auto ab0 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0, pix0, 0, 1, 1, 2)),
+ V16<int16_t>);
+ auto ab1 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1, pix1, 0, 1, 1, 2)),
+ V16<int16_t>);
+ ab0 += ((ab1 - ab0) * fracy) >> 7;
+
+ auto cd0 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0, pix0n, 2, 3, 3, 4)),
+ V16<int16_t>);
+ auto cd1 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1, pix1n, 2, 3, 3, 4)),
+ V16<int16_t>);
+ cd0 += ((cd1 - cd0) * fracy) >> 7;
+
+ auto abcdl = combine(lowHalf(ab0), lowHalf(cd0));
+ auto abcdh = combine(highHalf(ab0), highHalf(cd0));
+ abcdl += ((abcdh - abcdl) * fracx) >> 7;
+
+ commit_span(buf, pack(WideRGBA8(abcdl)));
}
- for (; offset <= radius; offset++) {
- if (offset <= aboveBound) {
- rowAbove += sampler->stride;
- }
- if (offset <= belowBound) {
- rowBelow -= sampler->stride;
- }
- auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]);
- auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]);
-
- coeff *= coeffStep;
- coeffStep *= coeffStep2;
-
- sum = addsat(sum,
- (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) +
- CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) *
- uint16_t(coeff + 0.5f));
+ {
+ auto ab0 =
+ CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0n, pix0n, 0, 1, 1, 2)),
+ V16<int16_t>);
+ auto ab1 =
+ CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1n, pix1n, 0, 1, 1, 2)),
+ V16<int16_t>);
+ ab0 += ((ab1 - ab0) * fracy) >> 7;
+
+ auto cd0 =
+ CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0n, U32(pix0x), 2, 3, 3, 4)),
+ V16<int16_t>);
+ auto cd1 =
+ CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1n, U32(pix1x), 2, 3, 3, 4)),
+ V16<int16_t>);
+ cd0 += ((cd1 - cd0) * fracy) >> 7;
+
+ auto abcdl = combine(lowHalf(ab0), lowHalf(cd0));
+ auto abcdh = combine(highHalf(ab0), highHalf(cd0));
+ abcdl += ((abcdh - abcdl) * fracx) >> 7;
+
+ commit_span(buf + 4, pack(WideRGBA8(abcdl)));
}
-
- // Shift away the intermediate precision.
- return sum >> 8;
}
-
-} // namespace glsl
diff --git a/third_party/webrender/swgl/src/vector_type.h b/third_party/webrender/swgl/src/vector_type.h
index 43364ffcce2..8ec5876c340 100644
--- a/third_party/webrender/swgl/src/vector_type.h
+++ b/third_party/webrender/swgl/src/vector_type.h
@@ -39,16 +39,6 @@ SI VectorType<T, 16> combine(VectorType<T, 8> a, VectorType<T, 8> b) {
}
template <typename T>
-SI VectorType<T, 2> lowHalf(VectorType<T, 4> a) {
- return __builtin_shufflevector(a, a, 0, 1);
-}
-
-template <typename T>
-SI VectorType<T, 2> highHalf(VectorType<T, 4> a) {
- return __builtin_shufflevector(a, a, 2, 3);
-}
-
-template <typename T>
SI VectorType<T, 4> lowHalf(VectorType<T, 8> a) {
return __builtin_shufflevector(a, a, 0, 1, 2, 3);
}
@@ -114,7 +104,7 @@ struct VectorType {
};
};
- VectorType() : data{0} {}
+ VectorType() : data{0} { }
constexpr VectorType(const VectorType& rhs) : data(rhs.data) {}
// GCC vector extensions only support broadcasting scalars on arithmetic ops,
@@ -315,27 +305,10 @@ struct VectorType {
return VectorType<T, N * 2>::wrap(data, high.data);
}
-# define xxxx swizzle(0, 0, 0, 0)
-# define yyyy swizzle(1, 1, 1, 1)
-# define zzzz swizzle(2, 2, 2, 2)
-# define wwww swizzle(3, 3, 3, 3)
-# define xxyy swizzle(0, 0, 1, 1)
-# define xxzz swizzle(0, 0, 2, 2)
-# define yyww swizzle(1, 1, 3, 3)
-# define zzww swizzle(2, 2, 3, 3)
# define xyxy swizzle(0, 1, 0, 1)
-# define xzxz swizzle(0, 2, 0, 2)
-# define ywyw swizzle(1, 3, 1, 3)
# define zwzw swizzle(2, 3, 2, 3)
-# define zwxy swizzle(2, 3, 0, 1)
# define zyxw swizzle(2, 1, 0, 3)
-# define xxyz swizzle(0, 0, 1, 2)
-# define xyyz swizzle(0, 1, 1, 2)
# define xyzz swizzle(0, 1, 2, 2)
-# define xzyw swizzle(0, 2, 1, 3)
-# define yzwx swizzle(1, 2, 3, 0)
-# define wxyz swizzle(3, 0, 1, 2)
-# define wzyx swizzle(3, 2, 1, 0)
# define xxxxyyyy XXXXYYYY()
VectorType<T, 8> XXXXYYYY() const {
return swizzle(0, 0, 0, 0).combine(swizzle(1, 1, 1, 1));
@@ -358,10 +331,6 @@ struct VectorType {
VectorType<T, 8> XXYYZZWW() const {
return swizzle(0, 0, 1, 1).combine(swizzle(2, 2, 3, 3));
}
-# define xxxxyyyyzzzzwwww XXXXYYYYZZZZWWWW()
- VectorType<T, 16> XXXXYYYYZZZZWWWW() {
- return XXXXYYYY().combine(ZZZZWWWW());
- }
};
template <typename T>
@@ -374,17 +343,6 @@ struct VectorType<T, 2> {
};
T elements[2];
};
-
- SI VectorType wrap(const data_type& data) {
- VectorType v;
- v.data = data;
- return v;
- }
-
- VectorType operator&(VectorType x) const { return wrap(data & x.data); }
- VectorType operator&(T x) const { return wrap(data & x); }
- VectorType operator|(VectorType x) const { return wrap(data | x.data); }
- VectorType operator|(T x) const { return wrap(data | x); }
};
# define CONVERT(vector, type) ((type)(vector))
@@ -411,32 +369,6 @@ SI VectorType<T, N * 2> expand(VectorType<T, N> a) {
}
#endif
-template <typename T, int N>
-SI VectorType<T, N * 4> combine(VectorType<T, N> a, VectorType<T, N> b,
- VectorType<T, N> c, VectorType<T, N> d) {
- return combine(combine(a, b), combine(c, d));
-}
-
-template <typename T, int N>
-SI VectorType<T, N> combineLow(VectorType<T, N> a, VectorType<T, N> b) {
- return combine(lowHalf(a), lowHalf(b));
-}
-
-template <typename T, int N>
-SI VectorType<T, N> combineHigh(VectorType<T, N> a, VectorType<T, N> b) {
- return combine(highHalf(a), highHalf(b));
-}
-
-template <typename T, int N>
-SI VectorType<T, N * 2> repeat2(VectorType<T, N> a) {
- return combine(a, a);
-}
-
-template <typename T, int N>
-SI VectorType<T, N * 4> repeat4(VectorType<T, N> a) {
- return combine(a, a, a, a);
-}
-
template <typename T>
SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
return SHUFFLE(a, b, 0, 4, 1, 5);
@@ -478,23 +410,6 @@ SI VectorType<T, 8> zip2High(VectorType<T, 8> a, VectorType<T, 8> b) {
return SHUFFLE(a, b, 4, 5, 12, 13, 6, 7, 14, 15);
}
-#ifdef __clang__
-template <typename T>
-SI VectorType<T, 8> zip(VectorType<T, 4> a, VectorType<T, 4> b) {
- return SHUFFLE(a, b, 0, 4, 1, 5, 2, 6, 3, 7);
-}
-
-template <typename T>
-SI VectorType<T, 16> zip(VectorType<T, 8> a, VectorType<T, 8> b) {
- return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
-}
-#else
-template <typename T, int N>
-SI VectorType<T, N * 2> zip(VectorType<T, N> a, VectorType<T, N> b) {
- return combine(zipLow(a, b), zipHigh(a, b));
-}
-#endif
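The removed zip() overloads are plain interleaves: zip({a0,a1,a2,a3}, {b0,b1,b2,b3}) yields {a0,b0,a1,b1,a2,b2,a3,b3}, which the clang path expresses as a single shuffle and the generic path builds from zipLow/zipHigh halves. A scalar array sketch of the same operation (plain arrays, not the vector-extension types used here):

    #include <cstdio>

    // Interleave two 4-lane vectors, matching SHUFFLE(a, b, 0, 4, 1, 5, 2, 6, 3, 7).
    static void zip4(const int a[4], const int b[4], int out[8]) {
      for (int i = 0; i < 4; i++) {
        out[2 * i] = a[i];
        out[2 * i + 1] = b[i];
      }
    }

    int main() {
      int a[4] = {0, 1, 2, 3}, b[4] = {10, 11, 12, 13}, out[8];
      zip4(a, b, out);
      for (int v : out) std::printf("%d ", v);  // 0 10 1 11 2 12 3 13
      std::printf("\n");
    }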
-
template <typename T>
struct Unaligned {
template <typename P>