Diffstat (limited to 'third_party/webrender/swgl/src')
-rw-r--r-- | third_party/webrender/swgl/src/blend.h | 864
-rw-r--r-- | third_party/webrender/swgl/src/composite.h | 1069
-rw-r--r-- | third_party/webrender/swgl/src/gl.cc | 3164
-rw-r--r-- | third_party/webrender/swgl/src/gl_defs.h | 42
-rw-r--r-- | third_party/webrender/swgl/src/glsl.h | 1308
-rw-r--r-- | third_party/webrender/swgl/src/lib.rs | 2
-rw-r--r-- | third_party/webrender/swgl/src/program.h | 82
-rw-r--r-- | third_party/webrender/swgl/src/rasterize.h | 1670
-rw-r--r-- | third_party/webrender/swgl/src/swgl_ext.h | 1826
-rw-r--r-- | third_party/webrender/swgl/src/swgl_fns.rs | 513
-rw-r--r-- | third_party/webrender/swgl/src/texture.h | 1162
-rw-r--r-- | third_party/webrender/swgl/src/vector_type.h | 87
12 files changed, 3223 insertions, 8566 deletions
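
Nearly every blend equation in the deleted blend.h below is built on muldiv255, documented in its comments as "(x*y + x) >> 8, cheap approximation of (x*y) / 255". A minimal standalone check of how tight that approximation is -- not part of the patch, just an illustrative sketch that reuses the same formula:

#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Same formula as the deleted helper: (x*y + x) >> 8, i.e. x*(y+1)/256.
static int muldiv255_approx(int x, int y) { return (x * y + x) >> 8; }

int main() {
  int max_err = 0;
  for (int x = 0; x <= 255; ++x) {
    for (int y = 0; y <= 255; ++y) {
      int exact = (x * y) / 255;  // truncating reference
      max_err = std::max(max_err, std::abs(muldiv255_approx(x, y) - exact));
    }
  }
  // The approximation is exact at y == 255 and never overshoots the
  // truncating reference by more than 1, which is why the blend code can
  // apply it per channel without visible error.
  printf("max error: %d\n", max_err);
  return 0;
}
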
diff --git a/third_party/webrender/swgl/src/blend.h b/third_party/webrender/swgl/src/blend.h deleted file mode 100644 index 8bc1c93994e..00000000000 --- a/third_party/webrender/swgl/src/blend.h +++ /dev/null @@ -1,864 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) { -#if USE_SSE2 - return _mm_packs_epi32(a, b); -#elif USE_NEON - return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); -#else - return CONVERT(combine(a, b), HalfRGBA8); -#endif -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v, - float scale = 255.0f) { - ivec4 i = round_pixel(v, scale); - HalfRGBA8 xz = packRGBA8(i.z, i.x); - HalfRGBA8 yw = packRGBA8(i.y, i.w); - HalfRGBA8 xyzwl = zipLow(xz, yw); - HalfRGBA8 xyzwh = zipHigh(xz, yw); - HalfRGBA8 lo = zip2Low(xyzwl, xyzwh); - HalfRGBA8 hi = zip2High(xyzwl, xyzwh); - return combine(lo, hi); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha, - float scale = 255.0f) { - I32 i = round_pixel(alpha, scale); - HalfRGBA8 c = packRGBA8(i, i); - c = zipLow(c, c); - return zip(c, c); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha, - float scale = 255.0f) { - I32 i = round_pixel(alpha, scale); - return repeat2(packRGBA8(i, i)); -} - -UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v, - float scale = 255.0f) { - I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale); - return repeat2(packRGBA8(i, i)); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() { - return pack_pixels_RGBA8(fragment_shader->gl_FragColor); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v, - float scale = 255.0f) { - ivec4 i = round_pixel(bit_cast<vec4>(v), scale); - return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w)); -} - -static ALWAYS_INLINE WideR8 packR8(I32 a) { -#if USE_SSE2 - return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a))); -#elif USE_NEON - return vqmovun_s32(a); -#else - return CONVERT(a, WideR8); -#endif -} - -static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) { - return packR8(round_pixel(c, scale)); -} - -static ALWAYS_INLINE WideR8 pack_pixels_R8() { - return pack_pixels_R8(fragment_shader->gl_FragColor.x); -} - -// Load a partial span > 0 and < 4 pixels. -template <typename V, typename P> -static ALWAYS_INLINE V partial_load_span(const P* src, int span) { - return bit_cast<V>( - (span >= 2 - ? combine(unaligned_load<V2<P>>(src), - V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0}) - : V4<P>{unaligned_load<P>(src), 0, 0, 0})); -} - -// Store a partial span > 0 and < 4 pixels. 
-template <typename V, typename P> -static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) { - auto pixels = bit_cast<V4<P>>(src); - if (span >= 2) { - unaligned_store(dst, lowHalf(pixels)); - if (span > 2) { - unaligned_store(dst + 2, pixels.z); - } - } else { - unaligned_store(dst, pixels.x); - } -} - -// Dispatcher that chooses when to load a full or partial span -template <typename V, typename P> -static ALWAYS_INLINE V load_span(const P* src, int span) { - if (span >= 4) { - return unaligned_load<V, P>(src); - } else { - return partial_load_span<V, P>(src, span); - } -} - -// Dispatcher that chooses when to store a full or partial span -template <typename V, typename P> -static ALWAYS_INLINE void store_span(P* dst, V src, int span) { - if (span >= 4) { - unaligned_store<V, P>(dst, src); - } else { - partial_store_span<V, P>(dst, src, span); - } -} - -template <typename T> -static ALWAYS_INLINE T muldiv256(T x, T y) { - return (x * y) >> 8; -} - -// (x*y + x) >> 8, cheap approximation of (x*y) / 255 -template <typename T> -static ALWAYS_INLINE T muldiv255(T x, T y) { - return (x * y + x) >> 8; -} - -template <typename V> -static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v, - float scale = 255.0f) { - return pack_pixels_RGBA8(v, scale); -} - -template <typename C> -static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) { - return pack_pixels_R8(c, scale); -} - -// Helper functions to apply a color modulus when available. -struct NoColor {}; - -template <typename P> -static ALWAYS_INLINE P applyColor(P src, NoColor) { - return src; -} - -struct InvertColor {}; - -template <typename P> -static ALWAYS_INLINE P applyColor(P src, InvertColor) { - return 255 - src; -} - -template <typename P> -static ALWAYS_INLINE P applyColor(P src, P color) { - return muldiv255(color, src); -} - -static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) { - return applyColor(unpack(src), color); -} - -template <typename P, typename C> -static ALWAYS_INLINE auto packColor(P* buf, C color) { - return pack_span(buf, color, 255.0f); -} - -template <typename P> -static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) { - return noColor; -} - -template <typename P> -static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf, - InvertColor invertColor) { - return invertColor; -} - -// Single argument variation that takes an explicit destination buffer type. -template <typename P, typename C> -static ALWAYS_INLINE auto packColor(C color) { - // Just pass in a typed null pointer, as the pack routines never use the - // pointer's value, just its type. - return packColor((P*)0, color); -} - -// Byte-wise addition for when x or y is a signed 8-bit value stored in the -// low byte of a larger type T only with zeroed-out high bits, where T is -// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used -// upon signed operands, using up all the precision in a 16 bit integer, and -// potentially losing the sign bit in the last >> 8 shift. Due to the -// properties of two's complement arithmetic, even though we've discarded the -// sign bit, we can still represent a negative number under addition (without -// requiring any extra sign bits), just that any negative number will behave -// like a large unsigned number under addition, generating a single carry bit -// on overflow that we need to discard. 
Thus, just doing a byte-wise add will -// overflow without the troublesome carry, giving us only the remaining 8 low -// bits we actually need while keeping the high bits at zero. -template <typename T> -static ALWAYS_INLINE T addlow(T x, T y) { - typedef VectorType<uint8_t, sizeof(T)> bytes; - return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y)); -} - -// Replace color components of each pixel with the pixel's alpha values. -template <typename T> -static ALWAYS_INLINE T alphas(T c) { - return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); -} - -// Replace the alpha values of the first vector with alpha values from the -// second, while leaving the color components unmodified. -template <typename T> -static ALWAYS_INLINE T set_alphas(T c, T a) { - return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31); -} - -// Miscellaneous helper functions for working with packed RGBA8 data. -static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t, - HalfRGBA8 e) { - return bit_cast<HalfRGBA8>((c & t) | (~c & e)); -} - -template <typename T, typename C, int N> -static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c, - VectorType<T, N> t, - VectorType<T, N> e) { - return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)), - if_then_else(highHalf(c), highHalf(t), highHalf(e))); -} - -static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) { -#if USE_SSE2 - return bit_cast<HalfRGBA8>( - _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y))); -#elif USE_NEON - return vminq_u16(x, y); -#else - return if_then_else(x < y, x, y); -#endif -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x, - VectorType<T, N> y) { - return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y))); -} - -static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) { -#if USE_SSE2 - return bit_cast<HalfRGBA8>( - _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y))); -#elif USE_NEON - return vmaxq_u16(x, y); -#else - return if_then_else(x > y, x, y); -#endif -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x, - VectorType<T, N> y) { - return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y))); -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) { - return combine(recip(lowHalf(v)), recip(highHalf(v))); -} - -// Helper to get the reciprocal if the value is non-zero, or otherwise default -// to the supplied fallback value. -template <typename V> -static ALWAYS_INLINE V recip_or(V v, float f) { - return if_then_else(v != V(0.0f), recip(v), V(f)); -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) { - return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v))); -} - -// Extract the alpha components so that we can cheaply calculate the reciprocal -// on a single SIMD register. Then multiply the duplicated alpha reciprocal with -// the pixel data. 0 alpha is treated as transparent black. -static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) { - Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f); - return v * a.xxxxyyyyzzzzwwww; -} - -// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to -// RGBA to unpack. 
-static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) { - return bit_cast<vec4>( - SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15)); -} - -// The following lum/sat functions mostly follow the KHR_blend_equation_advanced -// specification but are rearranged to work on premultiplied data. -static ALWAYS_INLINE Float lumv3(vec3 v) { - return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f; -} - -static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); } - -static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); } - -static inline vec3 clip_color(vec3 v, Float lum, Float alpha) { - Float mincol = max(-minv3(v), lum); - Float maxcol = max(maxv3(v), alpha - lum); - return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f)); -} - -static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) { - return clip_color(base - lumv3(base), lumv3(ref), alpha); -} - -static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) { - vec3 diff = base - minv3(base); - Float sbase = maxv3(diff); - Float ssat = maxv3(sref) - minv3(sref); - // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale - // to black, as per specification. - return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha); -} - -// Flags the reflect the current blend-stage clipping to be applied. -enum SWGLClipFlag { - SWGL_CLIP_FLAG_MASK = 1 << 0, - SWGL_CLIP_FLAG_AA = 1 << 1, - SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2, -}; -static int swgl_ClipFlags = 0; -static BlendKey swgl_BlendOverride = BLEND_KEY_NONE; -static WideRGBA8 swgl_BlendColorRGBA8 = {0}; -static WideRGBA8 swgl_BlendAlphaRGBA8 = {0}; - -// A pointer into the color buffer for the start of the span. -static void* swgl_SpanBuf = nullptr; -// A pointer into the clip mask for the start of the span. -static uint8_t* swgl_ClipMaskBuf = nullptr; - -static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) { - return mask; -} -static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) { - WideRG8 maskRG = zip(mask, mask); - return zip(maskRG, maskRG); -} - -// Loads a chunk of clip masks. The current pointer into the color buffer is -// used to reconstruct the relative position within the span. From there, the -// pointer into the clip mask can be generated from the start of the clip mask -// span. -template <typename P> -static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) { - return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf]; -} - -template <typename P> -static ALWAYS_INLINE auto load_clip_mask(P* buf, int span) - -> decltype(expand_mask(buf, 0)) { - return expand_mask(buf, - unpack(load_span<PackedR8>(get_clip_mask(buf), span))); -} - -// Temporarily removes masking from the blend stage, assuming the caller will -// handle it. -static ALWAYS_INLINE void override_clip_mask() { - blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE); -} - -// Restores masking to the blend stage, assuming it was previously overridden. -static ALWAYS_INLINE void restore_clip_mask() { - blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key); -} - -// A pointer to the start of the opaque destination region of the span for AA. -static const uint8_t* swgl_OpaqueStart = nullptr; -// The size, in bytes, of the opaque region. -static uint32_t swgl_OpaqueSize = 0; -// AA coverage distance offsets for the left and right edges. -static Float swgl_LeftAADist = 0.0f; -static Float swgl_RightAADist = 0.0f; -// AA coverage slope values used for accumulating coverage for each step. 
-static Float swgl_AASlope = 0.0f; - -// Get the amount of pixels we need to process before the start of the opaque -// region. -template <typename P> -static ALWAYS_INLINE int get_aa_opaque_start(P* buf) { - return max(int((P*)swgl_OpaqueStart - buf), 0); -} - -// Assuming we are already in the opaque part of the span, return the remaining -// size of the opaque part. -template <typename P> -static ALWAYS_INLINE int get_aa_opaque_size(P* buf) { - return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0); -} - -// Temporarily removes anti-aliasing from the blend stage, assuming the caller -// will handle it. -static ALWAYS_INLINE void override_aa() { - blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE); -} - -// Restores anti-aliasing to the blend stage, assuming it was previously -// overridden. -static ALWAYS_INLINE void restore_aa() { - blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key); -} - -static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst, - WideRGBA8 src, int span = 4) { - WideRGBA8 dst = unpack(pdst); - const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, - 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, - 0xFFFF, 0xFFFF, 0xFFFF, 0}; - const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, - 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; - const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, - 0, 0, 0, 255, 0, 0, 0, 255}; - -// clang-format off - // Computes AA for the given pixel based on the offset of the pixel within - // destination row. Given the initial coverage offsets for the left and right - // edges, the offset is scaled by the slope and accumulated to find the - // minimum coverage value for the pixel. A final weight is generated that - // can be used to scale the source pixel. -#define DO_AA(format, body) \ - do { \ - int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \ - if (uint32_t(offset) >= swgl_OpaqueSize) { \ - Float delta = swgl_AASlope * float(offset); \ - Float dist = clamp(min(swgl_LeftAADist + delta.x, \ - swgl_RightAADist + delta.y), \ - 0.0f, 256.0f); \ - auto aa = pack_pixels_##format(dist, 1.0f); \ - body; \ - } \ - } while (0) - - // Each blend case is preceded by the MASK_ variant. The MASK_ case first - // loads the mask values and multiplies the source value by them. After, it - // falls through to the normal blending case using the masked source. The - // AA_ variations may further precede the blend cases, in which case the - // source value is further modified before use. -#define BLEND_CASE_KEY(key) \ - case AA_##key: \ - DO_AA(RGBA8, src = muldiv256(src, aa)); \ - goto key; \ - case AA_MASK_##key: \ - DO_AA(RGBA8, src = muldiv256(src, aa)); \ - FALLTHROUGH; \ - case MASK_##key: \ - src = muldiv255(src, load_clip_mask(buf, span)); \ - FALLTHROUGH; \ - case key: key - -#define BLEND_CASE(...) 
BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__)) - - switch (blend_key) { - BLEND_CASE(GL_ONE, GL_ZERO): - return src; - BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, - GL_ONE_MINUS_SRC_ALPHA): - // dst + src.a*(src.rgb1 - dst) - // use addlow for signed overflow - return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst)); - BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): - return src + dst - muldiv255(dst, alphas(src)); - BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): - return dst - muldiv255(dst, src); - BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): - return dst - (muldiv255(dst, src) & RGB_MASK); - BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): - return dst - muldiv255(dst, alphas(src)); - BLEND_CASE(GL_ZERO, GL_SRC_COLOR): - return muldiv255(src, dst); - BLEND_CASE(GL_ONE, GL_ONE): - return src + dst; - BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): - return src + dst - (muldiv255(dst, src) & ALPHA_MASK); - BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): - // src*(1-dst.a) + dst*1 = src - src*dst.a + dst - return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); - BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): - // src*k + (1-src)*dst = src*k + dst - - // src*dst = dst + src*(k - dst) use addlow - // for signed overflow - return addlow( - dst, muldiv255(src, repeat2(ctx->blendcolor) - dst)); - - // We must explicitly handle the masked/anti-aliased secondary blend case. - // The secondary color as well as the source must be multiplied by the - // weights. - case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - return src + dst - secondary; - } - case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - WideRGBA8 mask = load_clip_mask(buf, span); - return muldiv255(src, mask) + dst - muldiv255(secondary, mask); - } - case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - DO_AA(RGBA8, { - src = muldiv256(src, aa); - secondary = muldiv256(secondary, aa); - }); - return src + dst - secondary; - } - case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - WideRGBA8 mask = load_clip_mask(buf, span); - DO_AA(RGBA8, mask = muldiv256(mask, aa)); - return muldiv255(src, mask) + dst - muldiv255(secondary, mask); - } - - BLEND_CASE(GL_MIN): - return min(src, dst); - BLEND_CASE(GL_MAX): - return max(src, dst); - - // The KHR_blend_equation_advanced spec describes the blend equations such - // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to - // the result: - // Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*AD*(1-As) - // Ar = As*Ad + As*(1-Ad) + Ad*(1-As) - // However, working with unpremultiplied values requires expensive math to - // unpremultiply and premultiply again during blending. We can use the fact - // that premultiplied value P = C*A and simplify the equations such that no - // unpremultiplied colors are necessary, allowing us to stay with integer - // math that avoids floating-point conversions in the common case. Some of - // the blend modes require division or sqrt, in which case we do convert - // to (possibly transposed/unpacked) floating-point to implement the mode. 
- // However, most common modes can still use cheaper premultiplied integer - // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified - // to: - // Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As) - // .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As - // Ar = As*Ad + As - As*Ad + Ad - Ad*As - // .. Ar = As + Ad - As*Ad - // Note that the alpha equation is the same for all blend equations, such - // that so long as the implementation results in As + Ad - As*Ad, we can - // avoid using separate instructions to compute the alpha result, which is - // dependent on the math used to implement each blend mode. The exact - // reductions used to get the final math for every blend mode are too - // involved to show here in comments, but mostly follows from replacing - // Cs*As and Cd*Ad with Ps and Ps while factoring out as many common terms - // as possible. - - BLEND_CASE(GL_MULTIPLY_KHR): { - WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK), - alphas(dst) - (dst & RGB_MASK)); - return src + dst + (diff & RGB_MASK) - alphas(diff); - } - BLEND_CASE(GL_SCREEN_KHR): - return src + dst - muldiv255(src, dst); - BLEND_CASE(GL_OVERLAY_KHR): { - WideRGBA8 srcA = alphas(src); - WideRGBA8 dstA = alphas(dst); - WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); - return src + dst + - if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff), - -diff); - } - BLEND_CASE(GL_DARKEN_KHR): - return src + dst - - max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); - BLEND_CASE(GL_LIGHTEN_KHR): - return src + dst - - min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); - - BLEND_CASE(GL_COLORDODGE_KHR): { - // Color-dodge and color-burn require division, so we convert to FP math - // here, but avoid transposing to a vec4. - WideRGBA32F srcF = CONVERT(src, WideRGBA32F); - WideRGBA32F srcA = alphas(srcF); - WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); - WideRGBA32F dstA = alphas(dstF); - return pack_pixels_RGBA8( - srcA * set_alphas( - min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)), - dstF) + - srcF * (255.0f - dstA) + dstF * (255.0f - srcA), - 1.0f / 255.0f); - } - BLEND_CASE(GL_COLORBURN_KHR): { - WideRGBA32F srcF = CONVERT(src, WideRGBA32F); - WideRGBA32F srcA = alphas(srcF); - WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); - WideRGBA32F dstA = alphas(dstF); - return pack_pixels_RGBA8( - srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA * - recip_or(srcF, 255.0f))), - dstF) + - srcF * (255.0f - dstA) + dstF * (255.0f - srcA), - 1.0f / 255.0f); - } - BLEND_CASE(GL_HARDLIGHT_KHR): { - WideRGBA8 srcA = alphas(src); - WideRGBA8 dstA = alphas(dst); - WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); - return src + dst + - if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff), - -diff); - } - - BLEND_CASE(GL_SOFTLIGHT_KHR): { - // Soft-light requires an unpremultiply that can't be factored out as - // well as a sqrt, so we convert to FP math here, but avoid transposing - // to a vec4. 
- WideRGBA32F srcF = CONVERT(src, WideRGBA32F); - WideRGBA32F srcA = alphas(srcF); - WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); - WideRGBA32F dstA = alphas(dstF); - WideRGBA32F dstU = unpremultiply(dstF); - WideRGBA32F scale = srcF + srcF - srcA; - return pack_pixels_RGBA8( - dstF * (255.0f + - set_alphas( - scale * - if_then_else(scale < 0.0f, 1.0f - dstU, - min((16.0f * dstU - 12.0f) * dstU + 3.0f, - inversesqrt(dstU) - 1.0f)), - WideRGBA32F(0.0f))) + - srcF * (255.0f - dstA), - 1.0f / 255.0f); - } - BLEND_CASE(GL_DIFFERENCE_KHR): { - WideRGBA8 diff = - min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst))); - return src + dst - diff - (diff & RGB_MASK); - } - BLEND_CASE(GL_EXCLUSION_KHR): { - WideRGBA8 diff = muldiv255(src, dst); - return src + dst - diff - (diff & RGB_MASK); - } - - // The HSL blend modes are non-separable and require complicated use of - // division. It is advantageous to convert to FP and transpose to vec4 - // math to more easily manipulate the individual color components. -#define DO_HSL(rgb) \ - do { \ - vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); \ - vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); \ - Float srcA = srcV.w * (1.0f / 255.0f); \ - Float dstA = dstV.w * (1.0f / 255.0f); \ - Float srcDstA = srcV.w * dstA; \ - vec3 srcC = vec3(srcV) * dstA; \ - vec3 dstC = vec3(dstV) * srcA; \ - return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \ - srcV.w + dstV.w - srcDstA), \ - 1.0f); \ - } while (0) - - BLEND_CASE(GL_HSL_HUE_KHR): - DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA)); - BLEND_CASE(GL_HSL_SATURATION_KHR): - DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA)); - BLEND_CASE(GL_HSL_COLOR_KHR): - DO_HSL(set_lum(srcC, dstC, srcDstA)); - BLEND_CASE(GL_HSL_LUMINOSITY_KHR): - DO_HSL(set_lum(dstC, srcC, srcDstA)); - - // SWGL-specific extended blend modes. - BLEND_CASE(SWGL_BLEND_DROP_SHADOW): { - // Premultiplied alpha over blend, but with source color set to source alpha - // modulated with a constant color. - WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8); - return color + dst - muldiv255(dst, alphas(color)); - } - - BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT): - // Premultiplied alpha over blend, but treats the source as a subpixel mask - // modulated with a constant color. - return applyColor(src, swgl_BlendColorRGBA8) + dst - - muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8)); - - default: - UNREACHABLE; - // return src; - } - -#undef BLEND_CASE -#undef BLEND_CASE_KEY - // clang-format on -} - -static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src, - int span = 4) { -// clang-format off -#define BLEND_CASE_KEY(key) \ - case AA_##key: \ - DO_AA(R8, src = muldiv256(src, aa)); \ - goto key; \ - case AA_MASK_##key: \ - DO_AA(R8, src = muldiv256(src, aa)); \ - FALLTHROUGH; \ - case MASK_##key: \ - src = muldiv255(src, load_clip_mask(buf, span)); \ - FALLTHROUGH; \ - case key: key - -#define BLEND_CASE(...) 
BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__)) - - switch (blend_key) { - BLEND_CASE(GL_ONE, GL_ZERO): - return src; - BLEND_CASE(GL_ZERO, GL_SRC_COLOR): - return muldiv255(src, dst); - BLEND_CASE(GL_ONE, GL_ONE): - return src + dst; - default: - UNREACHABLE; - // return src; - } - -#undef BLEND_CASE -#undef BLEND_CASE_KEY - // clang-format on -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) { - unaligned_store(buf, pack(r)); -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) { - partial_store_span(buf, pack(r), len); -} - -static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) { - return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r); -} - -static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) { - return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len); -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) { - unaligned_store(buf, r); -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) { - partial_store_span(buf, r, len); -} - -static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) { - return pack(blend_span(buf, unpack(r))); -} - -static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r, - int len) { - return pack(blend_span(buf, unpack(r), len)); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) { - unaligned_store(buf, pack(r)); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) { - partial_store_span(buf, pack(r), len); -} - -static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) { - return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r); -} - -static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) { - return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r, - len); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) { - unaligned_store(buf, r); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) { - partial_store_span(buf, r, len); -} - -static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) { - return pack(blend_span(buf, unpack(r))); -} - -static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) { - return pack(blend_span(buf, unpack(r), len)); -} - -template <bool BLEND, typename P, typename R> -static ALWAYS_INLINE void commit_blend_span(P* buf, R r) { - if (BLEND) { - commit_span(buf, blend_span(buf, r)); - } else { - commit_span(buf, r); - } -} - -template <bool BLEND, typename P, typename R> -static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) { - if (BLEND) { - commit_span(buf, blend_span(buf, r, len), len); - } else { - commit_span(buf, r, len); - } -} - -template <typename P, typename R> -static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) { - for (P* end = &buf[len & ~3]; buf < end; buf += 4) { - commit_span(buf, blend_span(buf, r)); - } - len &= 3; - if (len > 0) { - partial_store_span(buf, pack(blend_span(buf, r, len)), len); - } -} - -template <bool BLEND> -static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) { - commit_blend_solid_span(buf, r, len); -} - -template <> -ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r, - int len) { - fill_n(buf, len, bit_cast<U32>(pack(r)).x); -} - -template <bool BLEND> -static void commit_solid_span(uint8_t* buf, WideR8 r, int len) { - commit_blend_solid_span(buf, r, len); -} - -template <> 
-ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) { - PackedR8 p = pack(r); - if (uintptr_t(buf) & 3) { - int align = 4 - (uintptr_t(buf) & 3); - align = min(align, len); - partial_store_span(buf, p, align); - buf += align; - len -= align; - } - fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p)); - buf += len & ~3; - len &= 3; - if (len > 0) { - partial_store_span(buf, p, len); - } -} diff --git a/third_party/webrender/swgl/src/composite.h b/third_party/webrender/swgl/src/composite.h deleted file mode 100644 index f88de485fdd..00000000000 --- a/third_party/webrender/swgl/src/composite.h +++ /dev/null @@ -1,1069 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -template <bool COMPOSITE, typename P> -static inline void copy_row(P* dst, const P* src, int span) { - // No scaling, so just do a fast copy. - memcpy(dst, src, span * sizeof(P)); -} - -template <> -void copy_row<true, uint32_t>(uint32_t* dst, const uint32_t* src, int span) { - // No scaling, so just do a fast composite. - auto* end = dst + span; - while (dst + 4 <= end) { - WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); - WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - unaligned_store(dst, r); - src += 4; - dst += 4; - } - if (dst < end) { - WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - dst)); - WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); - auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - partial_store_span(dst, r, end - dst); - } -} - -template <bool COMPOSITE, typename P> -static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth, - int span, int frac) { - // Do scaling with different source and dest widths. - for (P* end = dst + span; dst < end; dst++) { - *dst = *src; - // Step source according to width ratio. - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - } -} - -template <> -void scale_row<true, uint32_t>(uint32_t* dst, int dstWidth, const uint32_t* src, - int srcWidth, int span, int frac) { - // Do scaling with different source and dest widths. - // Gather source pixels four at a time for better packing. - auto* end = dst + span; - for (; dst + 4 <= end; dst += 4) { - U32 srcn; - srcn.x = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.y = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.z = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.w = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn)); - WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - unaligned_store(dst, r); - } - if (dst < end) { - // Process any remaining pixels. Try to gather as many pixels as possible - // into a single source chunk for compositing. 
- U32 srcn = {*src, 0, 0, 0}; - if (end - dst > 1) { - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.y = *src; - if (end - dst > 2) { - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.z = *src; - } - } - WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn)); - WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); - auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - partial_store_span(dst, r, end - dst); - } -} - -template <bool COMPOSITE = false> -static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq, - Texture& dsttex, const IntRect& dstReq, - bool invertY, const IntRect& clipRect) { - assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && - dsttex.internal_format == GL_RGBA8)); - // Cache scaling ratios - int srcWidth = srcReq.width(); - int srcHeight = srcReq.height(); - int dstWidth = dstReq.width(); - int dstHeight = dstReq.height(); - // Compute valid dest bounds - IntRect dstBounds = dsttex.sample_bounds(dstReq).intersect(clipRect); - // Compute valid source bounds - IntRect srcBounds = srctex.sample_bounds(srcReq, invertY); - // If srcReq is outside the source texture, we need to clip the sampling - // bounds so that we never sample outside valid source bounds. Get texture - // bounds relative to srcReq and scale to dest-space rounding inward, using - // this rect to limit the dest bounds further. - IntRect srcClip = srctex.bounds() - srcReq.origin(); - if (invertY) { - srcClip.invert_y(srcReq.height()); - } - srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true); - dstBounds.intersect(srcClip); - // Check if clipped sampling bounds are empty - if (dstBounds.is_empty()) { - return; - } - - // Calculate source and dest pointers from clamped offsets - int bpp = srctex.bpp(); - int srcStride = srctex.stride(); - int destStride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstReq, dstBounds); - // Clip the source bounds by the destination offset. - int fracX = srcWidth * dstBounds.x0; - int fracY = srcHeight * dstBounds.y0; - srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0); - srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0); - fracX %= dstWidth; - fracY %= dstHeight; - char* src = srctex.sample_ptr(srcReq, srcBounds, invertY); - // Inverted Y must step downward along source rows - if (invertY) { - srcStride = -srcStride; - } - int span = dstBounds.width(); - for (int rows = dstBounds.height(); rows > 0; rows--) { - switch (bpp) { - case 1: - if (srcWidth == dstWidth) - copy_row<COMPOSITE>((uint8_t*)dest, (uint8_t*)src, span); - else - scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint8_t*)src, - srcWidth, span, fracX); - break; - case 2: - if (srcWidth == dstWidth) - copy_row<COMPOSITE>((uint16_t*)dest, (uint16_t*)src, span); - else - scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint16_t*)src, - srcWidth, span, fracX); - break; - case 4: - if (srcWidth == dstWidth) - copy_row<COMPOSITE>((uint32_t*)dest, (uint32_t*)src, span); - else - scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint32_t*)src, - srcWidth, span, fracX); - break; - default: - assert(false); - break; - } - dest += destStride; - // Step source according to height ratio. 
- for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) { - src += srcStride; - } - } -} - -template <bool COMPOSITE> -static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); - unaligned_store(dest, srcpx); - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); - partial_store_span(dest, srcpx, span); - } -} - -template <> -void linear_row_blit<true>(uint32_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); - WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - unaligned_store(dest, r); - - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); - WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - partial_store_span(dest, r, span); - } -} - -template <bool COMPOSITE> -static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); - unaligned_store(dest, srcpx); - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); - partial_store_span(dest, srcpx, span); - } -} - -template <bool COMPOSITE> -static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); - unaligned_store(dest, srcpx); - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); - partial_store_span(dest, srcpx, span); - } -} - -template <bool COMPOSITE = false> -static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq, - Texture& dsttex, const IntRect& dstReq, - bool invertY, const IntRect& clipRect) { - assert(srctex.internal_format == GL_RGBA8 || - srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8); - assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && - dsttex.internal_format == GL_RGBA8)); - // Compute valid dest bounds - IntRect dstBounds = dsttex.sample_bounds(dstReq); - dstBounds.intersect(clipRect); - // Check if sampling bounds are empty - if (dstBounds.is_empty()) { - return; - } - // Initialize sampler for source texture - sampler2D_impl sampler; - init_sampler(&sampler, srctex); - sampler.filter = TextureFilter::LINEAR; - // Compute source UVs - vec2_scalar srcUV(srcReq.x0, srcReq.y0); - vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), - float(srcReq.height()) / dstReq.height()); - // Inverted Y must step downward along source rows - if (invertY) { - srcUV.y += srcReq.height(); - srcDUV.y = -srcDUV.y; - } - // Skip to clamped source start - srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); - // Scale UVs by lerp precision - srcUV = 
linearQuantize(srcUV, 128); - srcDUV *= 128.0f; - // Calculate dest pointer from clamped offsets - int bpp = dsttex.bpp(); - int destStride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstReq, dstBounds); - int span = dstBounds.width(); - for (int rows = dstBounds.height(); rows > 0; rows--) { - switch (bpp) { - case 1: - linear_row_blit<COMPOSITE>((uint8_t*)dest, span, srcUV, srcDUV.x, - &sampler); - break; - case 2: - linear_row_blit<COMPOSITE>((uint16_t*)dest, span, srcUV, srcDUV.x, - &sampler); - break; - case 4: - linear_row_blit<COMPOSITE>((uint32_t*)dest, span, srcUV, srcDUV.x, - &sampler); - break; - default: - assert(false); - break; - } - dest += destStride; - srcUV.y += srcDUV.y; - } -} - -extern "C" { - -void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, - GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, - GLbitfield mask, GLenum filter) { - assert(mask == GL_COLOR_BUFFER_BIT); - Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER); - if (!srcfb) return; - Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER); - if (!dstfb) return; - Texture& srctex = ctx->textures[srcfb->color_attachment]; - if (!srctex.buf) return; - Texture& dsttex = ctx->textures[dstfb->color_attachment]; - if (!dsttex.buf) return; - assert(!dsttex.locked); - if (srctex.internal_format != dsttex.internal_format) { - assert(false); - return; - } - // Force flipped Y onto dest coordinates - if (srcY1 < srcY0) { - swap(srcY0, srcY1); - swap(dstY0, dstY1); - } - bool invertY = dstY1 < dstY0; - if (invertY) { - swap(dstY0, dstY1); - } - IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset; - IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset; - if (srcReq.is_empty() || dstReq.is_empty()) { - return; - } - IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()}; - prepare_texture(srctex); - prepare_texture(dsttex, &dstReq); - if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR && - (srctex.internal_format == GL_RGBA8 || srctex.internal_format == GL_R8 || - srctex.internal_format == GL_RG8)) { - linear_blit(srctex, srcReq, dsttex, dstReq, invertY, dstReq); - } else { - scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect); - } -} - -typedef Texture LockedTexture; - -// Lock the given texture to prevent modification. -LockedTexture* LockTexture(GLuint texId) { - Texture& tex = ctx->textures[texId]; - if (!tex.buf) { - assert(tex.buf != nullptr); - return nullptr; - } - if (__sync_fetch_and_add(&tex.locked, 1) == 0) { - // If this is the first time locking the texture, flush any delayed clears. - prepare_texture(tex); - } - return (LockedTexture*)&tex; -} - -// Lock the given framebuffer's color attachment to prevent modification. -LockedTexture* LockFramebuffer(GLuint fboId) { - Framebuffer& fb = ctx->framebuffers[fboId]; - // Only allow locking a framebuffer if it has a valid color attachment. - if (!fb.color_attachment) { - assert(fb.color_attachment != 0); - return nullptr; - } - return LockTexture(fb.color_attachment); -} - -// Reference an already locked resource -void LockResource(LockedTexture* resource) { - if (!resource) { - return; - } - __sync_fetch_and_add(&resource->locked, 1); -} - -// Remove a lock on a texture that has been previously locked -void UnlockResource(LockedTexture* resource) { - if (!resource) { - return; - } - if (__sync_fetch_and_add(&resource->locked, -1) <= 0) { - // The lock should always be non-zero before unlocking. 
- assert(0); - } -} - -// Get the underlying buffer for a locked resource -void* GetResourceBuffer(LockedTexture* resource, int32_t* width, - int32_t* height, int32_t* stride) { - *width = resource->width; - *height = resource->height; - *stride = resource->stride(); - return resource->buf; -} - -// Extension for optimized compositing of textures or framebuffers that may be -// safely used across threads. The source and destination must be locked to -// ensure that they can be safely accessed while the SWGL context might be used -// by another thread. Band extents along the Y axis may be used to clip the -// destination rectangle without effecting the integer scaling ratios. -void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX, - GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, - GLint dstY, GLsizei dstWidth, GLsizei dstHeight, - GLboolean opaque, GLboolean flip, GLenum filter, GLint clipX, - GLint clipY, GLsizei clipWidth, GLsizei clipHeight) { - if (!lockedDst || !lockedSrc) { - return; - } - Texture& srctex = *lockedSrc; - Texture& dsttex = *lockedDst; - assert(srctex.bpp() == 4); - assert(dsttex.bpp() == 4); - - IntRect srcReq = - IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset; - IntRect dstReq = - IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; - // Compute clip rect as relative to the dstReq, as that's the same coords - // as used for the sampling bounds. - IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, - clipY - dstY + clipHeight}; - - if (opaque) { - // Ensure we have rows of at least 2 pixels when using the linear filter - // to avoid overreading the row. - if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) { - linear_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } else { - scale_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } - } else { - if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) { - linear_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } else { - scale_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } - } -} - -} // extern "C" - -// Saturated add helper for YUV conversion. Supported platforms have intrinsics -// to do this natively, but support a slower generic fallback just in case. -static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) { -#if USE_SSE2 - return _mm_adds_epi16(x, y); -#elif USE_NEON - return vqaddq_s16(x, y); -#else - auto r = x + y; - // An overflow occurred if the signs of both inputs x and y did not differ - // but yet the sign of the result did differ. - auto overflow = (~(x ^ y) & (r ^ x)) >> 15; - // If there was an overflow, we need to choose the appropriate limit to clamp - // to depending on whether or not the inputs are negative. - auto limit = (x >> 15) ^ 0x7FFF; - // If we didn't overflow, just use the result, and otherwise, use the limit. - return (~overflow & r) | (overflow & limit); -#endif -} - -// Interleave and packing helper for YUV conversion. During transform by the -// color matrix, the color components are de-interleaved as this format is -// usually what comes out of the planar YUV textures. The components thus need -// to be interleaved before finally getting packed to BGRA format. Alpha is -// forced to be opaque. 
-static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) { - return pack(bit_cast<WideRGBA8>(zip(br, gg))) | - PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; -} - -// clang-format off -// Supports YUV color matrixes of the form: -// [R] [1.1643835616438356, 0.0, rv ] [Y - 16] -// [G] = [1.1643835616438358, -gu, -gv ] x [U - 128] -// [B] [1.1643835616438356, bu, 0.0 ] [V - 128] -// We must be able to multiply a YUV input by a matrix coefficient ranging as -// high as ~2.2 in the U/V cases, where U/V can be signed values between -128 -// and 127. The largest fixed-point representation we can thus support without -// overflowing 16 bit integers leaves us 6 bits of fractional precision while -// also supporting a sign bit. The closest representation of the Y coefficient -// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces -// we support. Conversions can still sometimes overflow the precision and -// require clamping back into range, so we use saturated additions to do this -// efficiently at no extra cost. -// clang-format on -struct YUVMatrix { - // These constants are loaded off the "this" pointer via relative addressing - // modes and should be about as quick to load as directly addressed SIMD - // constant memory. - V8<int16_t> rbCoeffs; - V8<int16_t> gCoeffs; - V8<uint16_t> yScale; - V8<int16_t> yBias; - V8<int16_t> uvBias; - V8<int16_t> brMask; - - // Set the coefficients to cancel out and pass through YUV as GBR. All biases - // are set to zero and the BR-mask is set to remove the contribution of Y to - // the BR channels. Scales are set such that the shift by 6 in convert is - // balanced. - YUVMatrix() - : rbCoeffs(1 << 6), - gCoeffs(0), - yScale(1 << (6 + 1)), - yBias(0), - uvBias(0), - brMask(0) {} - - // Convert matrix coefficients to fixed-point representation. - YUVMatrix(double rv, double gu, double gv, double bu) - : rbCoeffs( - zip(I16(int16_t(bu * 64.0 + 0.5)), I16(int16_t(rv * 64.0 + 0.5)))), - gCoeffs(zip(I16(-int16_t(gu * -64.0 + 0.5)), - I16(-int16_t(gv * -64.0 + 0.5)))), - yScale(2 * 74 + 1), - yBias(int16_t(-16 * 74.5) + (1 << 5)), - uvBias(-128), - brMask(-1) {} - - ALWAYS_INLINE PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) const { - // Bias Y values by -16 and multiply by 74.5. Add 2^5 offset to round to - // nearest 2^6. Note that we have to use an unsigned multiply with a 2x - // scale to represent a fractional scale and to avoid shifting with the sign - // bit. - yy = bit_cast<V8<int16_t>>((bit_cast<V8<uint16_t>>(yy) * yScale) >> 1) + - yBias; - - // Bias U/V values by -128. - uv += uvBias; - - // Compute (R, B) = (74.5*Y + rv*V, 74.5*Y + bu*U) - auto br = rbCoeffs * uv; - br = addsat(yy & brMask, br); - br >>= 6; - - // Compute G = 74.5*Y + -gu*U + -gv*V - auto gg = gCoeffs * uv; - gg = addsat( - yy, - addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16))); - gg >>= 6; - - // Interleave B/R and G values. Force alpha to opaque. 
- return packYUV(gg, br); - } -}; - -enum YUVColorSpace { REC_601 = 0, REC_709, REC_2020, IDENTITY }; - -static const YUVMatrix yuvMatrix[IDENTITY + 1] = { - // clang-format off -// From Rec601: -// [R] [1.1643835616438356, 0.0, 1.5960267857142858 ] [Y - 16] -// [G] = [1.1643835616438358, -0.3917622900949137, -0.8129676472377708 ] x [U - 128] -// [B] [1.1643835616438356, 2.017232142857143, 8.862867620416422e-17] [V - 128] - {1.5960267857142858, -0.3917622900949137, -0.8129676472377708, 2.017232142857143}, - -// From Rec709: -// [R] [1.1643835616438356, 0.0, 1.7927410714285714] [Y - 16] -// [G] = [1.1643835616438358, -0.21324861427372963, -0.532909328559444 ] x [U - 128] -// [B] [1.1643835616438356, 2.1124017857142854, 0.0 ] [V - 128] - {1.7927410714285714, -0.21324861427372963, -0.532909328559444, 2.1124017857142854}, - -// From Re2020: -// [R] [1.16438356164384, 0.0, 1.678674107142860 ] [Y - 16] -// [G] = [1.16438356164384, -0.187326104219343, -0.650424318505057 ] x [U - 128] -// [B] [1.16438356164384, 2.14177232142857, 0.0 ] [V - 128] - {1.678674107142860, -0.187326104219343, -0.650424318505057, 2.14177232142857}, - -// Identity -// [R] [V] -// [G] = [Y] -// [B] [U] - {}, - // clang-format on -}; - -// Helper function for textureLinearRowR8 that samples horizontal taps and -// combines them based on Y fraction with next row. -template <typename S> -static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix, - int32_t offsety, - int32_t stridey, - int16_t fracy) { - uint8_t* buf = (uint8_t*)sampler->buf + offsety; - auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]); - auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]); - auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]); - auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]); - auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>); - buf += stridey; - auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]); - auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]); - auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]); - auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]); - auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>); - abcd0 += ((abcd1 - abcd0) * fracy) >> 7; - return abcd0; -} - -// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes -// constant Y and returns a duplicate of the result interleaved with itself -// to aid in later YUV transformation. -template <typename S> -static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety, - int32_t stridey, int16_t fracy) { - assert(sampler->format == TextureFormat::R8); - - // Calculate X fraction and clamp X offset into range. - I32 fracx = ix; - ix >>= 7; - fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F; - ix = clampCoord(ix, sampler->width - 1); - - // Load the sample taps and combine rows. - auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy); - - // Unzip the result and do final horizontal multiply-add base on X fraction. - auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6); - auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7); - abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7; - - // The final result is the packed values interleaved with a duplicate of - // themselves. - return abcdl; -} - -// Optimized version of textureLinearPackedR8 for paired U/V R8 textures. -// Since the two textures have the same dimensions and stride, the addressing -// math can be shared between both samplers. 
This also allows a coalesced -// multiply in the final stage by packing both U/V results into a single -// operation. -template <typename S> -static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2, - I32 ix, int32_t offsety, - int32_t stridey, - int16_t fracy) { - assert(sampler->format == TextureFormat::R8 && - sampler2->format == TextureFormat::R8); - assert(sampler->width == sampler2->width && - sampler->height == sampler2->height); - assert(sampler->stride == sampler2->stride); - - // Calculate X fraction and clamp X offset into range. - I32 fracx = ix; - ix >>= 7; - fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F; - ix = clampCoord(ix, sampler->width - 1); - - // Load the sample taps for the first sampler and combine rows. - auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy); - - // Load the sample taps for the second sampler and combine rows. - auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy); - - // We are left with a result vector for each sampler with values for adjacent - // pixels interleaved together in each. We need to unzip these values so that - // we can do the final horizontal multiply-add based on the X fraction. - auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14); - auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15); - abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7; - - // The final result is the packed values for the first sampler interleaved - // with the packed values for the second sampler. - return abcdxyzwl; -} - -// Casting to int loses some precision while stepping that can offset the -// image, so shift the values by some extra bits of precision to minimize -// this. We support up to 16 bits of image size, 7 bits of quantization, -// and 1 bit for sign, which leaves 8 bits left for extra precision. -const int STEP_BITS = 8; - -// Optimized version of textureLinearPackedR8 for Y R8 texture with -// half-resolution paired U/V R8 textures. This allows us to more efficiently -// pack YUV samples into vectors to substantially reduce math operations even -// further. -template <bool BLEND> -static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow, - I32 yU, int32_t yDU, int32_t yStrideV, - int16_t yFracV, uint8_t* cRow1, - uint8_t* cRow2, I32 cU, int32_t cDU, - int32_t cStrideV, int16_t cFracV, - const YUVMatrix& colorSpace) { - // As much as possible try to utilize the fact that we're only using half - // the UV samples to combine Y and UV samples into single vectors. Here we - // need to initialize several useful vector quantities for stepping fractional - // offsets. For the UV samples, we take the average of the first+second and - // third+fourth samples in a chunk which conceptually correspond to offsets - // 0.5 and 1.5 (in 0..2 range). This allows us to reconstruct intermediate - // samples 0.25, 0.75, 1.25, and 1.75 later. X fraction is shifted over into - // the top 7 bits of an unsigned short so that we can mask off the exact - // fractional bits we need to blend merely by right shifting them into - // position. - cU = (cU.xzxz + cU.ywyw) >> 1; - auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>) - << (16 - (STEP_BITS + 7)); - auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7)); - auto ycFracV = combine(I16(yFracV), I16(cFracV)); - I32 yI = yU >> (STEP_BITS + 7); - I32 cI = cU >> (STEP_BITS + 7); - // Load initial combined YUV samples for each row and blend them. 
- auto ycSrc0 = - CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]), - unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))), - V8<int16_t>); - auto ycSrc1 = CONVERT( - combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]), - unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))), - V8<int16_t>); - auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7); - - // Here we shift in results from the next sample while caching results from - // the previous sample. This allows us to reduce the multiplications in the - // inner loop down to only two since we just need to blend the new samples - // horizontally and then vertically once each. - for (uint32_t* end = dest + span; dest < end; dest += 4) { - yU += yDU; - I32 yIn = yU >> (STEP_BITS + 7); - cU += cDU; - I32 cIn = cU >> (STEP_BITS + 7); - // Load combined YUV samples for the next chunk on each row and blend them. - auto ycSrc0n = - CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]), - unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))), - V8<int16_t>); - auto ycSrc1n = CONVERT( - combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]), - unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))), - V8<int16_t>); - auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7); - - // The source samples for the chunk may not match the actual tap offsets. - // Since we're upscaling, we know the tap offsets fall within all the - // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar, - // instead we do laborious shuffling here for the Y samples and then the UV - // samples. - auto yshuf = lowHalf(ycSrc); - auto yshufn = - SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn), - 1, 2, 3, 4); - if (yI.y == yI.x) { - yshuf = yshuf.xxyz; - yshufn = yshufn.xxyz; - } - if (yI.z == yI.y) { - yshuf = yshuf.xyyz; - yshufn = yshufn.xyyz; - } - if (yI.w == yI.z) { - yshuf = yshuf.xyzz; - yshufn = yshufn.xyzz; - } - - auto cshuf = highHalf(ycSrc); - auto cshufn = - SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn), - 1, 4, 3, 6); - if (cI.y == cI.x) { - cshuf = cshuf.xxzz; - cshufn = cshufn.xxzz; - } - - // After shuffling, combine the Y and UV samples back into a single vector - // for blending. Shift X fraction into position as unsigned to mask off top - // bits and get rid of low bits to avoid multiplication overflow. - auto yuvPx = combine(yshuf, cshuf); - yuvPx += ((combine(yshufn, cshufn) - yuvPx) * - bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >> - 7; - - // Cache the new samples as the current samples on the next iteration. - ycSrc = ycSrcn; - ycFracX += ycFracDX; - yI = yIn; - cI = cIn; - - // De-interleave the Y and UV results. We need to average the UV results - // to produce values for intermediate samples. Taps for UV were collected at - // offsets 0.5 and 1.5, such that if we take a quarter of the difference - // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples, - // we can estimate samples 0.25, 0.75, 1.25, and 1.75. 
- auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3); - auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) + - ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) - - SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >> - 2); - - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx)); - } -} - -// This is the inner loop driver of CompositeYUV that processes an axis-aligned -// YUV span, dispatching based on appropriate format and scaling. This is also -// reused by blendYUV to accelerate some cases of texture sampling in the -// shader. -template <bool BLEND = false> -static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY, - const vec2_scalar& srcUV, float srcDU, - sampler2DRect samplerU, sampler2DRect samplerV, - const vec2_scalar& chromaUV, float chromaDU, - int colorDepth, const YUVMatrix& colorSpace) { - // Calculate varying and constant interp data for Y plane. - I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS)); - int32_t yV = int32_t(srcUV.y); - - // Calculate varying and constant interp data for chroma planes. - I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS)); - int32_t cV = int32_t(chromaUV.y); - - // We need to skip 4 pixels per chunk. - int32_t yDU = int32_t((4 << STEP_BITS) * srcDU); - int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU); - - if (samplerY->width < 2 || samplerU->width < 2) { - // If the source row has less than 2 pixels, it's not safe to use a linear - // filter because it may overread the row. Just convert the single pixel - // with nearest filtering and fill the row with it. - I16 yuv = CONVERT( - round_pixel((Float){texelFetch(samplerY, ivec2(srcUV)).x.x, - texelFetch(samplerU, ivec2(chromaUV)).x.x, - texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f}), - I16); - commit_solid_span<BLEND>( - dest, - unpack(colorSpace.convert(V8<int16_t>(yuv.x), - zip(I16(yuv.y), I16(yuv.z)))), - span); - } else if (samplerY->format == TextureFormat::R16) { - // Sample each YUV plane, rescale it to fit in low 8 bits of word, and - // then transform them by the appropriate color space. - assert(colorDepth > 8); - // Need to right shift the sample by the amount of bits over 8 it - // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit - // of precision at the low end already, hence 1 is subtracted from the - // color depth. - int rescaleBits = (colorDepth - 1) - 8; - for (; span >= 4; span -= 4) { - auto yPx = - textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >> - rescaleBits; - auto uPx = - textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - auto vPx = - textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - commit_blend_span<BLEND>( - dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx))); - dest += 4; - yU += yDU; - cU += cDU; - } - if (span > 0) { - // Handle any remaining pixels... - auto yPx = - textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >> - rescaleBits; - auto uPx = - textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - auto vPx = - textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - commit_blend_span<BLEND>( - dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span); - } - } else { - assert(samplerY->format == TextureFormat::R8); - assert(colorDepth == 8); - - // Calculate varying and constant interp data for Y plane. 
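// Worked example for the R16 path above, assuming 10-bit video
// (colorDepth == 10); the helper is illustrative. textureLinearUnpackedR16
// has already given up one bit of precision while filtering, so a filtered
// sample occupies colorDepth - 1 significant bits and needs
// (colorDepth - 1) - 8 further right shifts to fit the low 8 bits that the
// YUV color matrix expects.
static inline int rescale_filtered_r16(int filtered, int colorDepth) {
  int rescaleBits = (colorDepth - 1) - 8;  // 1 for 10-bit, 3 for 12-bit input
  return filtered >> rescaleBits;
}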
- int16_t yFracV = yV & 0x7F; - yV >>= 7; - int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride; - int32_t yStrideV = - yV >= 0 && yV < int32_t(samplerY->height) - 1 ? samplerY->stride : 0; - - // Calculate varying and constant interp data for chroma planes. - int16_t cFracV = cV & 0x7F; - cV >>= 7; - int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride; - int32_t cStrideV = - cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0; - - // If we're sampling the UV planes at half the resolution of the Y plane, - // then try to use half resolution fast-path. - if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) && - cDU <= (2 << (STEP_BITS + 7))) { - // Ensure that samples don't fall outside of the valid bounds of each - // planar texture. Step until the initial X coordinates are positive. - for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) { - auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, - yStrideV, yFracV); - auto uvPx = textureLinearRowPairedR8( - samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV); - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx)); - dest += 4; - yU += yDU; - cU += cDU; - } - // Calculate the number of aligned chunks that we can step inside the - // bounds of each planar texture without overreading. - int inside = min( - min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU, - (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) * - 4, - span & ~3); - if (inside > 0) { - uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV; - uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV; - uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV; - upscaleYUV42R8<BLEND>(dest, inside, yRow, yU, yDU, yStrideV, yFracV, - cRow1, cRow2, cU, cDU, cStrideV, cFracV, - colorSpace); - span -= inside; - dest += inside; - yU += (inside / 4) * yDU; - cU += (inside / 4) * cDU; - } - // If there are any remaining chunks that weren't inside, handle them - // below. - } - for (; span >= 4; span -= 4) { - // Sample each YUV plane and then transform them by the appropriate - // color space. - auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, - yStrideV, yFracV); - auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS, - cOffsetV, cStrideV, cFracV); - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx)); - dest += 4; - yU += yDU; - cU += cDU; - } - if (span > 0) { - // Handle any remaining pixels... 
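// Scalar sketch of the "inside" bound computed above for one plane; names and
// parameters are illustrative. It answers: starting from the fixed-point
// coordinate u (STEP_BITS + 7 fractional bits) and advancing by du per
// 4-pixel chunk, how many pixels can a 4-wide load cover before its window
// would start past column width - 4? The result may be zero or negative when
// u is already beyond the safe region, in which case the fast path is skipped.
static inline int pixels_inside(int u, int du, int width, int span) {
  const int STEP_BITS = 8;
  int last_safe = (width - 4) << (STEP_BITS + 7);  // last in-bounds chunk start
  int chunks = (last_safe - u) / du;
  int pixels = chunks * 4;
  int aligned = span & ~3;  // only whole chunks of the remaining span
  return pixels < aligned ? pixels : aligned;
}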
- auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, - yStrideV, yFracV); - auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS, - cOffsetV, cStrideV, cFracV); - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx), span); - } - } -} - -static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex, - YUVColorSpace colorSpace, int colorDepth, - const IntRect& srcReq, Texture& dsttex, - const IntRect& dstReq, bool invertY, - const IntRect& clipRect) { - // Compute valid dest bounds - IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY); - dstBounds.intersect(clipRect); - // Check if sampling bounds are empty - if (dstBounds.is_empty()) { - return; - } - // Initialize samplers for source textures - sampler2DRect_impl sampler[3]; - init_sampler(&sampler[0], ytex); - init_sampler(&sampler[1], utex); - init_sampler(&sampler[2], vtex); - - // Compute source UVs - vec2_scalar srcUV(srcReq.x0, srcReq.y0); - vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), - float(srcReq.height()) / dstReq.height()); - // Inverted Y must step downward along source rows - if (invertY) { - srcUV.y += srcReq.height(); - srcDUV.y = -srcDUV.y; - } - // Skip to clamped source start - srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); - // Calculate separate chroma UVs for chroma planes with different scale - vec2_scalar chromaScale(float(utex.width) / ytex.width, - float(utex.height) / ytex.height); - vec2_scalar chromaUV = srcUV * chromaScale; - vec2_scalar chromaDUV = srcDUV * chromaScale; - // Scale UVs by lerp precision. If the row has only 1 pixel, then don't - // quantize so that we can use nearest filtering instead to avoid overreads. - if (ytex.width >= 2 && utex.width >= 2) { - srcUV = linearQuantize(srcUV, 128); - srcDUV *= 128.0f; - chromaUV = linearQuantize(chromaUV, 128); - chromaDUV *= 128.0f; - } - // Calculate dest pointer from clamped offsets - int destStride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstReq, dstBounds); - int span = dstBounds.width(); - for (int rows = dstBounds.height(); rows > 0; rows--) { - linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x, - &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth, - yuvMatrix[colorSpace]); - dest += destStride; - srcUV.y += srcDUV.y; - chromaUV.y += chromaDUV.y; - } -} - -extern "C" { - -// Extension for compositing a YUV surface represented by separate YUV planes -// to a BGRA destination. The supplied color space is used to determine the -// transform from YUV to BGRA after sampling. -void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY, - LockedTexture* lockedU, LockedTexture* lockedV, - YUVColorSpace colorSpace, GLuint colorDepth, GLint srcX, - GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, - GLint dstY, GLsizei dstWidth, GLsizei dstHeight, - GLboolean flip, GLint clipX, GLint clipY, GLsizei clipWidth, - GLsizei clipHeight) { - if (!lockedDst || !lockedY || !lockedU || !lockedV) { - return; - } - if (colorSpace > IDENTITY) { - assert(false); - return; - } - Texture& ytex = *lockedY; - Texture& utex = *lockedU; - Texture& vtex = *lockedV; - Texture& dsttex = *lockedDst; - // All YUV planes must currently be represented by R8 or R16 textures. - // The chroma (U/V) planes must have matching dimensions. 
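// Minimal sketch of the per-axis interpolation setup in linear_convert_yuv
// above, shown for a single axis; the struct and function are illustrative.
// The step is the ratio of source to destination extent, an inverted blit
// starts at the source bottom and steps with a negated increment, and the
// start is then advanced to the centre of the first unclipped destination
// pixel.
struct AxisInterp {
  float start;
  float step;
};
static inline AxisInterp setup_axis(float srcOrigin, float srcSize,
                                    float dstSize, float dstClipStart,
                                    bool invert) {
  AxisInterp a;
  a.step = srcSize / dstSize;
  a.start = srcOrigin;
  if (invert) {
    a.start += srcSize;  // begin at the far edge of the source
    a.step = -a.step;    // and walk back toward the origin
  }
  a.start += a.step * (dstClipStart + 0.5f);  // centre of first output pixel
  return a;
}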
- assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp()); - assert((ytex.bpp() == 1 && colorDepth == 8) || - (ytex.bpp() == 2 && colorDepth > 8)); - // assert(ytex.width == utex.width && ytex.height == utex.height); - assert(utex.width == vtex.width && utex.height == vtex.height); - assert(ytex.offset == utex.offset && ytex.offset == vtex.offset); - assert(dsttex.bpp() == 4); - - IntRect srcReq = - IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset; - IntRect dstReq = - IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; - // Compute clip rect as relative to the dstReq, as that's the same coords - // as used for the sampling bounds. - IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, - clipY - dstY + clipHeight}; - // For now, always use a linear filter path that would be required for - // scaling. Further fast-paths for non-scaled video might be desirable in the - // future. - linear_convert_yuv(ytex, utex, vtex, colorSpace, colorDepth, srcReq, dsttex, - dstReq, flip, clipRect); -} - -} // extern "C" diff --git a/third_party/webrender/swgl/src/gl.cc b/third_party/webrender/swgl/src/gl.cc index 6e214547421..f4a69752dde 100644 --- a/third_party/webrender/swgl/src/gl.cc +++ b/third_party/webrender/swgl/src/gl.cc @@ -22,65 +22,15 @@ # define debugf(...) printf(__VA_ARGS__) #endif -// #define PRINT_TIMINGS - #ifdef _WIN32 # define ALWAYS_INLINE __forceinline -# define NO_INLINE __declspec(noinline) - -// Including Windows.h brings a huge amount of namespace polution so just -// define a couple of things manually -typedef int BOOL; -# define WINAPI __stdcall -# define DECLSPEC_IMPORT __declspec(dllimport) -# define WINBASEAPI DECLSPEC_IMPORT -typedef unsigned long DWORD; -typedef long LONG; -typedef __int64 LONGLONG; -# define DUMMYSTRUCTNAME - -typedef union _LARGE_INTEGER { - struct { - DWORD LowPart; - LONG HighPart; - } DUMMYSTRUCTNAME; - struct { - DWORD LowPart; - LONG HighPart; - } u; - LONGLONG QuadPart; -} LARGE_INTEGER; -extern "C" { -WINBASEAPI BOOL WINAPI -QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount); - -WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); -} - #else -// GCC is slower when dealing with always_inline, especially in debug builds. -// When using Clang, use always_inline more aggressively. -# if defined(__clang__) || defined(NDEBUG) -# define ALWAYS_INLINE __attribute__((always_inline)) inline -# else -# define ALWAYS_INLINE inline -# endif -# define NO_INLINE __attribute__((noinline)) -#endif - -// Some functions may cause excessive binary bloat if inlined in debug or with -// GCC builds, so use PREFER_INLINE on these instead of ALWAYS_INLINE. 
-#if defined(__clang__) && defined(NDEBUG) -# define PREFER_INLINE ALWAYS_INLINE -#else -# define PREFER_INLINE inline +# define ALWAYS_INLINE __attribute__((always_inline)) inline #endif #define UNREACHABLE __builtin_unreachable() -#define UNUSED [[maybe_unused]] - -#define FALLTHROUGH [[fallthrough]] +#define UNUSED __attribute__((unused)) #ifdef MOZILLA_CLIENT # define IMPLICIT __attribute__((annotate("moz_implicit"))) @@ -91,32 +41,19 @@ WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); #include "gl_defs.h" #include "glsl.h" #include "program.h" -#include "texture.h" using namespace glsl; -typedef ivec2_scalar IntPoint; - struct IntRect { int x0; int y0; int x1; int y1; - IntRect() : x0(0), y0(0), x1(0), y1(0) {} - IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {} - IntRect(IntPoint origin, IntPoint size) - : x0(origin.x), - y0(origin.y), - x1(origin.x + size.x), - y1(origin.y + size.y) {} - int width() const { return x1 - x0; } int height() const { return y1 - y0; } bool is_empty() const { return width() <= 0 || height() <= 0; } - IntPoint origin() const { return IntPoint(x0, y0); } - bool same_size(const IntRect& o) const { return width() == o.width() && height() == o.height(); } @@ -133,12 +70,6 @@ struct IntRect { return *this; } - IntRect intersection(const IntRect& o) { - IntRect result = *this; - result.intersect(o); - return result; - } - // Scale from source-space to dest-space, optionally rounding inward IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight, bool roundIn = false) { @@ -156,60 +87,15 @@ struct IntRect { swap(y0, y1); } - IntRect& offset(const IntPoint& o) { - x0 += o.x; - y0 += o.y; - x1 += o.x; - y1 += o.y; + IntRect& offset(int dx, int dy) { + x0 += dx; + y0 += dy; + x1 += dx; + y1 += dy; return *this; } - - IntRect operator+(const IntPoint& o) const { - return IntRect(*this).offset(o); - } - IntRect operator-(const IntPoint& o) const { - return IntRect(*this).offset(-o); - } }; -typedef vec2_scalar Point2D; -typedef vec4_scalar Point3D; - -struct IntRange { - int start; - int end; - - int len() const { return end - start; } - - IntRange intersect(IntRange r) const { - return {max(start, r.start), min(end, r.end)}; - } -}; - -struct FloatRange { - float start; - float end; - - float clip(float x) const { return clamp(x, start, end); } - - FloatRange clip(FloatRange r) const { return {clip(r.start), clip(r.end)}; } - - FloatRange merge(FloatRange r) const { - return {min(start, r.start), max(end, r.end)}; - } - - IntRange round() const { - return {int(floor(start + 0.5f)), int(floor(end + 0.5f))}; - } - - IntRange round_out() const { return {int(floor(start)), int(ceil(end))}; } -}; - -template <typename P> -static inline FloatRange x_range(P p0, P p1) { - return {min(p0.x, p1.x), max(p0.x, p1.x)}; -} - struct VertexAttrib { size_t size = 0; // in bytes GLenum type = 0; @@ -237,18 +123,12 @@ static int bytes_for_internal_format(GLenum internal_format) { case GL_R8: case GL_RED: return 1; - case GL_RG8: - case GL_RG: - return 2; case GL_DEPTH_COMPONENT: case GL_DEPTH_COMPONENT16: + return 2; case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: return 4; - case GL_RGB_RAW_422_APPLE: - return 2; - case GL_R16: - return 2; default: debugf("internal format: %x\n", internal_format); assert(0); @@ -268,12 +148,6 @@ static TextureFormat gl_format_to_texture_format(int type) { return TextureFormat::RGBA8; case GL_R8: return TextureFormat::R8; - case GL_RG8: - return TextureFormat::RG8; - case 
GL_R16: - return TextureFormat::R16; - case GL_RGB_RAW_422_APPLE: - return TextureFormat::YUV422; default: assert(0); return TextureFormat::RGBA8; @@ -287,34 +161,19 @@ struct Query { struct Buffer { char* buf = nullptr; size_t size = 0; - size_t capacity = 0; bool allocate(size_t new_size) { - // If the size remains unchanged, don't allocate anything. - if (new_size == size) { - return false; - } - // If the new size is within the existing capacity of the buffer, just - // reuse the existing buffer. - if (new_size <= capacity) { - size = new_size; - return true; - } - // Otherwise we need to reallocate the buffer to hold up to the requested - // larger size. - char* new_buf = (char*)realloc(buf, new_size); - assert(new_buf); - if (!new_buf) { - // If we fail, null out the buffer rather than leave around the old - // allocation state. + if (new_size != size) { + char* new_buf = (char*)realloc(buf, new_size); + assert(new_buf); + if (new_buf) { + buf = new_buf; + size = new_size; + return true; + } cleanup(); - return false; } - // The reallocation succeeded, so install the buffer. - buf = new_buf; - size = new_size; - capacity = new_size; - return true; + return false; } void cleanup() { @@ -322,7 +181,6 @@ struct Buffer { free(buf); buf = nullptr; size = 0; - capacity = 0; } } @@ -331,6 +189,7 @@ struct Buffer { struct Framebuffer { GLuint color_attachment = 0; + GLint layer = 0; GLuint depth_attachment = 0; }; @@ -364,32 +223,17 @@ struct Texture { GLenum internal_format = 0; int width = 0; int height = 0; + int depth = 0; char* buf = nullptr; size_t buf_size = 0; - uint32_t buf_stride = 0; - uint8_t buf_bpp = 0; GLenum min_filter = GL_NEAREST; GLenum mag_filter = GL_LINEAR; - // The number of active locks on this texture. If this texture has any active - // locks, we need to disallow modifying or destroying the texture as it may - // be accessed by other threads where modifications could lead to races. - int32_t locked = 0; - // When used as an attachment of a framebuffer, rendering to the texture - // behaves as if it is located at the given offset such that the offset is - // subtracted from all transformed vertexes after the viewport is applied. - IntPoint offset; enum FLAGS { - // If the buffer is internally-allocated by SWGL SHOULD_FREE = 1 << 1, - // If the buffer has been cleared to initialize it. Currently this is only - // utilized by depth buffers which need to know when depth runs have reset - // to a valid row state. When unset, the depth runs may contain garbage. - CLEARED = 1 << 2, }; int flags = SHOULD_FREE; bool should_free() const { return bool(flags & SHOULD_FREE); } - bool cleared() const { return bool(flags & CLEARED); } void set_flag(int flag, bool val) { if (val) { @@ -398,14 +242,7 @@ struct Texture { flags &= ~flag; } } - void set_should_free(bool val) { - // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we - // might accidentally mistakenly realloc an externally allocated buffer as - // if it were an internally allocated one. - assert(!buf); - set_flag(SHOULD_FREE, val); - } - void set_cleared(bool val) { set_flag(CLEARED, val); } + void set_should_free(bool val) { set_flag(SHOULD_FREE, val); } // Delayed-clearing state. 
When a clear of an FB is requested, we don't // immediately clear each row, as the rows may be subsequently overwritten @@ -418,9 +255,6 @@ struct Texture { uint32_t clear_val = 0; uint32_t* cleared_rows = nullptr; - void init_depth_runs(uint32_t z); - void fill_depth_runs(uint32_t z, const IntRect& scissor); - void enable_delayed_clear(uint32_t val) { delay_clear = height; clear_val = val; @@ -441,88 +275,40 @@ struct Texture { } } - int bpp() const { return buf_bpp; } - void set_bpp() { buf_bpp = bytes_for_internal_format(internal_format); } + int bpp() const { return bytes_for_internal_format(internal_format); } - size_t stride() const { return buf_stride; } - void set_stride() { buf_stride = aligned_stride(buf_bpp * width); } - - // Set an external backing buffer of this texture. - void set_buffer(void* new_buf, size_t new_stride) { - assert(!should_free()); - // Ensure that the supplied stride is at least as big as the row data and - // is aligned to the smaller of either the BPP or word-size. We need to at - // least be able to sample data from within a row and sample whole pixels - // of smaller formats without risking unaligned access. - set_bpp(); - set_stride(); - assert(new_stride >= size_t(bpp() * width) && - new_stride % min(bpp(), sizeof(uint32_t)) == 0); + size_t stride(int b = 0, int min_width = 0) const { + return aligned_stride((b ? b : bpp()) * max(width, min_width)); + } - buf = (char*)new_buf; - buf_size = 0; - buf_stride = new_stride; + size_t layer_stride(int b = 0, int min_width = 0, int min_height = 0) const { + return stride(b ? b : bpp(), min_width) * max(height, min_height); } bool allocate(bool force = false, int min_width = 0, int min_height = 0) { - assert(!locked); // Locked textures shouldn't be reallocated - // If we get here, some GL API call that invalidates the texture was used. - // Mark the buffer as not-cleared to signal this. - set_cleared(false); - // Check if there is either no buffer currently or if we forced validation - // of the buffer size because some dimension might have changed. if ((!buf || force) && should_free()) { - // Initialize the buffer's BPP and stride, since they may have changed. - set_bpp(); - set_stride(); - // Compute new size based on the maximum potential stride, rather than - // the current stride, to hopefully avoid reallocations when size would - // otherwise change too much... - size_t max_stride = max(buf_stride, aligned_stride(buf_bpp * min_width)); - size_t size = max_stride * max(height, min_height); - if ((!buf && size > 0) || size > buf_size) { + size_t size = layer_stride(bpp(), min_width, min_height) * max(depth, 1); + if (!buf || size > buf_size) { // Allocate with a SIMD register-sized tail of padding at the end so we // can safely read or write past the end of the texture with SIMD ops. - // Currently only the flat Z-buffer texture needs this padding due to - // full-register loads and stores in check_depth and discard_depth. In - // case some code in the future accidentally uses a linear filter on a - // texture with less than 2 pixels per row, we also add this padding - // just to be safe. All other texture types and use-cases should be - // safe to omit padding. - size_t padding = - internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2 - ? sizeof(Float) - : 0; - char* new_buf = (char*)realloc(buf, size + padding); + char* new_buf = (char*)realloc(buf, size + sizeof(Float)); assert(new_buf); if (new_buf) { - // Successfully reallocated the buffer, so go ahead and set it. 
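// Hedged sketch of the external-stride validation above. aligned_stride is
// defined elsewhere in SWGL, so a simple word-aligned stand-in is used here
// to keep the example self-contained; both helper names are illustrative. An
// externally supplied stride only has to cover one full row of pixels and be
// aligned to the smaller of the pixel size and a 32-bit word, so whole pixels
// of the narrow formats can always be loaded without unaligned access.
static inline unsigned word_aligned_stride(unsigned row_bytes) {
  return (row_bytes + 3u) & ~3u;  // stand-in for SWGL's aligned_stride
}
static inline bool external_stride_ok(unsigned stride, unsigned bpp,
                                      unsigned width) {
  unsigned align = bpp < 4 ? bpp : 4;  // min(bpp, sizeof(uint32_t))
  return stride >= bpp * width && stride % align == 0;
}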
buf = new_buf; buf_size = size; return true; } - // Allocation failed, so ensure we don't leave stale buffer state. cleanup(); } } - // Nothing changed... return false; } void cleanup() { - assert(!locked); // Locked textures shouldn't be destroyed - if (buf) { - // If we need to toggle SHOULD_FREE state, ensure that buf is nulled out, - // regardless of whether we internally allocated it. This will prevent us - // from wrongly treating buf as having been internally allocated for when - // we go to realloc if it actually was externally allocted. - if (should_free()) { - free(buf); - } + if (buf && should_free()) { + free(buf); buf = nullptr; buf_size = 0; - buf_bpp = 0; - buf_stride = 0; } disable_delayed_clear(); } @@ -530,41 +316,44 @@ struct Texture { ~Texture() { cleanup(); } IntRect bounds() const { return IntRect{0, 0, width, height}; } - IntRect offset_bounds() const { return bounds() + offset; } // Find the valid sampling bounds relative to the requested region IntRect sample_bounds(const IntRect& req, bool invertY = false) const { - IntRect bb = bounds().intersect(req) - req.origin(); + IntRect bb = bounds().intersect(req).offset(-req.x0, -req.y0); if (invertY) bb.invert_y(req.height()); return bb; } // Get a pointer for sampling at the given offset - char* sample_ptr(int x, int y) const { - return buf + y * stride() + x * bpp(); + char* sample_ptr(int x, int y, int z, int bpp, size_t stride) const { + return buf + (height * z + y) * stride + x * bpp; + } + + char* sample_ptr(int x, int y, int z, int bpp) const { + return sample_ptr(x, y, z, bpp, stride(bpp)); + } + + char* sample_ptr(int x, int y, int z) const { + return sample_ptr(x, y, z, bpp()); } // Get a pointer for sampling the requested region and limit to the provided // sampling bounds - char* sample_ptr(const IntRect& req, const IntRect& bounds, + char* sample_ptr(const IntRect& req, const IntRect& bounds, int z, bool invertY = false) const { // Offset the sample pointer by the clamped bounds int x = req.x0 + bounds.x0; // Invert the Y offset if necessary int y = invertY ? req.y1 - 1 - bounds.y0 : req.y0 + bounds.y0; - return sample_ptr(x, y); + return sample_ptr(x, y, z); } }; -// The last vertex attribute is reserved as a null attribute in case a vertex -// attribute is used without being set. -#define MAX_ATTRIBS 17 -#define NULL_ATTRIB 16 +#define MAX_ATTRIBS 16 +#define NULL_ATTRIB 15 struct VertexArray { VertexAttrib attribs[MAX_ATTRIBS]; int max_attrib = -1; - // The GL spec defines element array buffer binding to be part of VAO state. - GLuint element_array_buffer_binding = 0; void validate(); }; @@ -580,67 +369,33 @@ struct Program { FragmentShaderImpl* frag_impl = nullptr; bool deleted = false; - ~Program() { delete impl; } + ~Program() { + delete impl; + } }; -// clang-format off -// Fully-expand GL defines while ignoring more than 4 suffixes +// for GL defines to fully expand #define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w -// Generate a blend key enum symbol -#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0) -#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_BLEND_KEY(...) CONCAT_KEY(AA_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_MASK_BLEND_KEY(...) CONCAT_KEY(AA_MASK_BLEND_, __VA_ARGS__, 0, 0, 0) - -// Utility macro to easily generate similar code for all implemented blend modes +#define BLEND_KEY(...) 
CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0) #define FOR_EACH_BLEND_KEY(macro) \ - macro(GL_ONE, GL_ZERO, 0, 0) \ - macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE, 0, 0) \ - macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ - macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \ - macro(GL_MIN, 0, 0, 0) \ - macro(GL_MAX, 0, 0, 0) \ - macro(GL_MULTIPLY_KHR, 0, 0, 0) \ - macro(GL_SCREEN_KHR, 0, 0, 0) \ - macro(GL_OVERLAY_KHR, 0, 0, 0) \ - macro(GL_DARKEN_KHR, 0, 0, 0) \ - macro(GL_LIGHTEN_KHR, 0, 0, 0) \ - macro(GL_COLORDODGE_KHR, 0, 0, 0) \ - macro(GL_COLORBURN_KHR, 0, 0, 0) \ - macro(GL_HARDLIGHT_KHR, 0, 0, 0) \ - macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \ - macro(GL_DIFFERENCE_KHR, 0, 0, 0) \ - macro(GL_EXCLUSION_KHR, 0, 0, 0) \ - macro(GL_HSL_HUE_KHR, 0, 0, 0) \ - macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \ - macro(GL_HSL_COLOR_KHR, 0, 0, 0) \ - macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) \ - macro(SWGL_BLEND_DROP_SHADOW, 0, 0, 0) \ - macro(SWGL_BLEND_SUBPIXEL_TEXT, 0, 0, 0) + macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE) \ + macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) macro( \ + GL_ZERO, GL_SRC_COLOR, 0, 0) macro(GL_ONE, GL_ONE, 0, 0) \ + macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ + macro(GL_ONE, GL_ZERO, 0, 0) macro( \ + GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ + macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, \ + 0, 0) \ + macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) #define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__), -#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_BLEND_KEY(...) AA_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_MASK_BLEND_KEY(...) AA_MASK_BLEND_KEY(__VA_ARGS__), enum BlendKey : uint8_t { + BLEND_KEY_NONE = 0, FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_MASK_BLEND_KEY) - BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO), - MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO), - AA_BLEND_KEY_NONE = AA_BLEND_KEY(GL_ONE, GL_ZERO), - AA_MASK_BLEND_KEY_NONE = AA_MASK_BLEND_KEY(GL_ONE, GL_ZERO), }; -// clang-format on const size_t MAX_TEXTURE_UNITS = 16; @@ -704,10 +459,8 @@ struct ObjectStore { O* find(size_t i) const { return i < size ? objects[i] : nullptr; } - template <typename T> - void on_erase(T*, ...) {} - template <typename T> - void on_erase(T* o, decltype(&T::on_erase)) { + template <typename T> void on_erase(T*, ...) 
{} + template <typename T> void on_erase(T* o, decltype(&T::on_erase)) { o->on_erase(); } @@ -727,8 +480,6 @@ struct ObjectStore { }; struct Context { - int32_t references = 1; - ObjectStore<Query> queries; ObjectStore<Buffer> buffers; ObjectStore<Texture> textures; @@ -756,7 +507,7 @@ struct Context { bool scissortest = false; IntRect scissor = {0, 0, 0, 0}; - GLfloat clearcolor[4] = {0, 0, 0, 0}; + uint32_t clearcolor = 0; GLdouble cleardepth = 1; int unpack_row_length = 0; @@ -766,10 +517,14 @@ struct Context { struct TextureUnit { GLuint texture_2d_binding = 0; + GLuint texture_3d_binding = 0; + GLuint texture_2d_array_binding = 0; GLuint texture_rectangle_binding = 0; void unlink(GLuint n) { ::unlink(texture_2d_binding, n); + ::unlink(texture_3d_binding, n); + ::unlink(texture_2d_array_binding, n); ::unlink(texture_rectangle_binding, n); } }; @@ -784,6 +539,7 @@ struct Context { GLuint pixel_pack_buffer_binding = 0; GLuint pixel_unpack_buffer_binding = 0; GLuint array_buffer_binding = 0; + GLuint element_array_buffer_binding = 0; GLuint time_elapsed_query = 0; GLuint samples_passed_query = 0; GLuint renderbuffer_binding = 0; @@ -800,9 +556,13 @@ struct Context { case GL_ARRAY_BUFFER: return array_buffer_binding; case GL_ELEMENT_ARRAY_BUFFER: - return vertex_arrays[current_vertex_array].element_array_buffer_binding; + return element_array_buffer_binding; case GL_TEXTURE_2D: return texture_units[active_texture_unit].texture_2d_binding; + case GL_TEXTURE_2D_ARRAY: + return texture_units[active_texture_unit].texture_2d_array_binding; + case GL_TEXTURE_3D: + return texture_units[active_texture_unit].texture_3d_binding; case GL_TEXTURE_RECTANGLE: return texture_units[active_texture_unit].texture_rectangle_binding; case GL_TIME_ELAPSED: @@ -830,17 +590,16 @@ struct Context { return textures[texture_units[unit].texture_2d_binding]; } - Texture& get_texture(sampler2DRect, int unit) { - return textures[texture_units[unit].texture_rectangle_binding]; + Texture& get_texture(sampler2DArray, int unit) { + return textures[texture_units[unit].texture_2d_array_binding]; } - IntRect apply_scissor(IntRect bb, - const IntPoint& origin = IntPoint(0, 0)) const { - return scissortest ? bb.intersect(scissor - origin) : bb; + Texture& get_texture(sampler2DRect, int unit) { + return textures[texture_units[unit].texture_rectangle_binding]; } - IntRect apply_scissor(const Texture& t) const { - return apply_scissor(t.bounds(), t.offset); + IntRect apply_scissor(IntRect bb) const { + return scissortest ? bb.intersect(scissor) : bb; } }; static Context* ctx = nullptr; @@ -851,12 +610,14 @@ static BlendKey blend_key = BLEND_KEY_NONE; static void prepare_texture(Texture& t, const IntRect* skip = nullptr); template <typename S> +static inline void init_depth(S* s, Texture& t) { + s->depth = max(t.depth, 1); + s->height_stride = s->stride * t.height; +} + +template <typename S> static inline void init_filter(S* s, Texture& t) { - // If the width is not at least 2 pixels, then we can't safely sample the end - // of the row with a linear filter. In that case, just punt to using nearest - // filtering instead. - s->filter = t.width >= 2 ? 
gl_filter_to_texture_filter(t.mag_filter) - : TextureFilter::NEAREST; + s->filter = gl_filter_to_texture_filter(t.mag_filter); } template <typename S> @@ -864,44 +625,20 @@ static inline void init_sampler(S* s, Texture& t) { prepare_texture(t); s->width = t.width; s->height = t.height; - s->stride = t.stride(); int bpp = t.bpp(); - if (bpp >= 4) - s->stride /= 4; - else if (bpp == 2) - s->stride /= 2; - else - assert(bpp == 1); - // Use uint32_t* for easier sampling, but need to cast to uint8_t* or - // uint16_t* for formats with bpp < 4. + s->stride = t.stride(bpp); + if (bpp >= 4) s->stride /= 4; + // Use uint32_t* for easier sampling, but need to cast to uint8_t* for formats + // with bpp < 4. s->buf = (uint32_t*)t.buf; s->format = gl_format_to_texture_format(t.internal_format); } template <typename S> -static inline void null_sampler(S* s) { - // For null texture data, just make the sampler provide a 1x1 buffer that is - // transparent black. Ensure buffer holds at least a SIMD vector of zero data - // for SIMD padding of unaligned loads. - static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0}; - s->width = 1; - s->height = 1; - s->stride = s->width; - s->buf = (uint32_t*)zeroBuf; - s->format = TextureFormat::RGBA8; -} - -template <typename S> -static inline void null_filter(S* s) { - s->filter = TextureFilter::NEAREST; -} - -template <typename S> S* lookup_sampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); - null_filter(s); + *s = S(); } else { init_sampler(s, t); init_filter(s, t); @@ -913,13 +650,26 @@ template <typename S> S* lookup_isampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); + *s = S(); } else { init_sampler(s, t); } return s; } +template <typename S> +S* lookup_sampler_array(S* s, int texture) { + Texture& t = ctx->get_texture(s, texture); + if (!t.buf) { + *s = S(); + } else { + init_sampler(s, t); + init_depth(s, t); + init_filter(s, t); + } + return s; +} + int bytes_per_type(GLenum type) { switch (type) { case GL_INT: @@ -983,40 +733,21 @@ void load_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance, attrib = T(load_attrib_scalar<scalar_type>(va, src)); } else { // Specialized for WR's primitive vertex order/winding. + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. + // Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so that the + // points form a convex path that can be traversed by the rasterizer. if (!count) return; - assert(count >= 2 && count <= 4); + assert(count == 3 || count == 4); char* src = (char*)va.buf + va.stride * start + va.offset; - switch (count) { - case 2: { - // Lines must be indexed at offsets 0, 1. - // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0. - scalar_type lanes[2] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride)}; - attrib = (T){lanes[0], lanes[1], lanes[1], lanes[0]}; - break; - } - case 3: { - // Triangles must be indexed at offsets 0, 1, 2. - // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. 
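// Sketch of the lane ordering described in the comments above; the mapping is
// taken from those comments and from the loads below, while the helper itself
// is illustrative. Each of the 4 vertex-shader SIMD lanes is filled from one
// primitive vertex so that the lanes always trace a convex path.
static inline void primitive_lane_order(int count, int lanes[4]) {
  switch (count) {
    case 2:  // line: endpoints mirrored into the upper lanes
      lanes[0] = 0; lanes[1] = 1; lanes[2] = 1; lanes[3] = 0;
      break;
    case 3:  // triangle: last vertex repeated
      lanes[0] = 0; lanes[1] = 1; lanes[2] = 2; lanes[3] = 2;
      break;
    default: // quad indexed 0,1,2,2,1,3: corners visited as 0,1,3,2
      lanes[0] = 0; lanes[1] = 1; lanes[2] = 3; lanes[3] = 2;
      break;
  }
}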
- scalar_type lanes[3] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - attrib = (T){lanes[0], lanes[1], lanes[2], lanes[2]}; - break; - } - default: - // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, - // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so - // that the points form a convex path that can be traversed by the - // rasterizer. - attrib = (T){load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 3), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - break; - } + attrib = (T){ + load_attrib_scalar<scalar_type>(va, src), + load_attrib_scalar<scalar_type>(va, src + va.stride), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2 + + (count > 3 ? va.stride : 0)), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2) + }; } } @@ -1076,6 +807,7 @@ void Enable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = true; + blend_key = ctx->blend_key; break; case GL_DEPTH_TEST: ctx->depthtest = true; @@ -1090,6 +822,7 @@ void Disable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = false; + blend_key = BLEND_KEY_NONE; break; case GL_DEPTH_TEST: ctx->depthtest = false; @@ -1103,18 +836,10 @@ void Disable(GLenum cap) { GLenum GetError() { return GL_NO_ERROR; } static const char* const extensions[] = { - "GL_ARB_blend_func_extended", - "GL_ARB_clear_texture", - "GL_ARB_copy_image", - "GL_ARB_draw_instanced", - "GL_ARB_explicit_attrib_location", - "GL_ARB_instanced_arrays", - "GL_ARB_invalidate_subdata", - "GL_ARB_texture_storage", - "GL_EXT_timer_query", - "GL_KHR_blend_equation_advanced", - "GL_KHR_blend_equation_advanced_coherent", - "GL_APPLE_rgb_422", + "GL_ARB_blend_func_extended", "GL_ARB_copy_image", + "GL_ARB_draw_instanced", "GL_ARB_explicit_attrib_location", + "GL_ARB_instanced_arrays", "GL_ARB_invalidate_subdata", + "GL_ARB_texture_storage", "GL_EXT_timer_query", }; void GetIntegerv(GLenum pname, GLint* params) { @@ -1128,7 +853,7 @@ void GetIntegerv(GLenum pname, GLint* params) { params[0] = 1 << 15; break; case GL_MAX_ARRAY_TEXTURE_LAYERS: - params[0] = 0; + params[0] = 1 << 15; break; case GL_READ_FRAMEBUFFER_BINDING: params[0] = ctx->read_framebuffer_binding; @@ -1145,12 +870,6 @@ void GetIntegerv(GLenum pname, GLint* params) { case GL_NUM_EXTENSIONS: params[0] = sizeof(extensions) / sizeof(extensions[0]); break; - case GL_MAJOR_VERSION: - params[0] = 3; - break; - case GL_MINOR_VERSION: - params[0] = 2; - break; default: debugf("unhandled glGetIntegerv parameter %x\n", pname); assert(false); @@ -1177,8 +896,6 @@ const char* GetString(GLenum name) { return "Software WebRender"; case GL_VERSION: return "3.2"; - case GL_SHADING_LANGUAGE_VERSION: - return "1.50"; default: debugf("unhandled glGetString parameter %x\n", name); assert(false); @@ -1254,23 +971,17 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) { return a; } -// Generate a hashed blend key based on blend func and equation state. This -// allows all the blend state to be processed down to a blend key that can be -// dealt with inside a single switch statement. 
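// Illustrative sketch of the key layout used by hash_blend_key below; the
// macro mirrors HASH_BLEND_KEY and the helper name is invented. Unsigned
// arithmetic is used here purely to keep the example well defined; the real
// code builds the same bit pattern from the GL enum values directly.
#define SKETCH_HASH_BLEND_KEY(x, y, z, w) \
  (((x) << 4) | (y) | ((z) << 24) | ((w) << 20))
static inline unsigned sketch_blend_hash(unsigned srgb, unsigned drgb,
                                         unsigned sa, unsigned da,
                                         unsigned equation,
                                         unsigned func_add /* GL_FUNC_ADD */) {
  // Common case: a single non-separate func pair.
  unsigned hash = SKETCH_HASH_BLEND_KEY(srgb, drgb, 0u, 0u);
  // Separate alpha funcs fold into the upper bits of the key.
  if (srgb != sa || drgb != da) hash |= SKETCH_HASH_BLEND_KEY(0u, 0u, sa, da);
  // Any equation other than FUNC_ADD replaces the key with a one-arg hash.
  if (equation != func_add) hash = SKETCH_HASH_BLEND_KEY(equation, 0u, 0u, 0u);
  return hash;
}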
-static void hash_blend_key() { - GLenum srgb = ctx->blendfunc_srgb; - GLenum drgb = ctx->blendfunc_drgb; - GLenum sa = ctx->blendfunc_sa; - GLenum da = ctx->blendfunc_da; - GLenum equation = ctx->blend_equation; +void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { + ctx->blendfunc_srgb = srgb; + ctx->blendfunc_drgb = drgb; + sa = remap_blendfunc(srgb, sa); + da = remap_blendfunc(drgb, da); + ctx->blendfunc_sa = sa; + ctx->blendfunc_da = da; + #define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) - // Basic non-separate blend funcs used the two argument form int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); - // Separate alpha blend funcs use the 4 argument hash if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); - // Any other blend equation than the default func_add ignores the func and - // instead generates a one-argument hash based on the equation - if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0); switch (hash) { #define MAP_BLEND_KEY(...) \ case HASH_BLEND_KEY(__VA_ARGS__): \ @@ -1278,22 +989,14 @@ static void hash_blend_key() { break; FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) default: - debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb, - sa, da, equation); + debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da); assert(false); break; } -} -void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { - ctx->blendfunc_srgb = srgb; - ctx->blendfunc_drgb = drgb; - sa = remap_blendfunc(srgb, sa); - da = remap_blendfunc(drgb, da); - ctx->blendfunc_sa = sa; - ctx->blendfunc_da = da; - - hash_blend_key(); + if (ctx->blend) { + blend_key = ctx->blend_key; + } } void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { @@ -1302,12 +1005,8 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { } void BlendEquation(GLenum mode) { - assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX || - (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR)); - if (mode != ctx->blend_equation) { - ctx->blend_equation = mode; - hash_blend_key(); - } + assert(mode == GL_FUNC_ADD); + ctx->blend_equation = mode; } void DepthMask(GLboolean flag) { ctx->depthmask = flag; } @@ -1328,10 +1027,8 @@ void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) { } void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - ctx->clearcolor[0] = r; - ctx->clearcolor[1] = g; - ctx->clearcolor[2] = b; - ctx->clearcolor[3] = a; + I32 c = round_pixel((Float){b, g, r, a}); + ctx->clearcolor = bit_cast<uint32_t>(CONVERT(c, U8)); } void ClearDepth(GLdouble depth) { ctx->cleardepth = depth; } @@ -1369,6 +1066,7 @@ void DeleteBuffer(GLuint n) { unlink(ctx->pixel_pack_buffer_binding, n); unlink(ctx->pixel_unpack_buffer_binding, n); unlink(ctx->array_buffer_binding, n); + unlink(ctx->element_array_buffer_binding, n); } } @@ -1434,45 +1132,26 @@ void DeleteProgram(GLuint n) { void LinkProgram(GLuint program) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } assert(p.impl->interpolants_size() <= sizeof(Interpolants)); if (!p.vert_impl) p.vert_impl = p.impl->get_vertex_shader(); if (!p.frag_impl) p.frag_impl = p.impl->get_fragment_shader(); } -GLint GetLinkStatus(GLuint program) { - if (auto* p = ctx->programs.find(program)) { - return p->impl ? 
1 : 0; - } - return 0; -} - void BindAttribLocation(GLuint program, GLuint index, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } p.impl->bind_attrib(name, index); } GLint GetAttribLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } return p.impl->get_attrib(name); } GLint GetUniformLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } GLint loc = p.impl->get_uniform(name); // debugf("location: %d\n", loc); return loc; @@ -1482,15 +1161,7 @@ static uint64_t get_time_value() { #ifdef __MACH__ return mach_absolute_time(); #elif defined(_WIN32) - LARGE_INTEGER time; - static bool have_frequency = false; - static LARGE_INTEGER frequency; - if (!have_frequency) { - QueryPerformanceFrequency(&frequency); - have_frequency = true; - } - QueryPerformanceCounter(&time); - return time.QuadPart * 1000000000ULL / frequency.QuadPart; + return uint64_t(clock()) * (1000000000ULL / CLOCKS_PER_SEC); #else return ({ struct timespec tp; @@ -1583,113 +1254,60 @@ void PixelStorei(GLenum name, GLint param) { static GLenum remap_internal_format(GLenum format) { switch (format) { case GL_DEPTH_COMPONENT: - return GL_DEPTH_COMPONENT24; + return GL_DEPTH_COMPONENT16; case GL_RGBA: return GL_RGBA8; case GL_RED: return GL_R8; - case GL_RG: - return GL_RG8; - case GL_RGB_422_APPLE: - return GL_RGB_RAW_422_APPLE; default: return format; } } -} // extern "C" - -static bool format_requires_conversion(GLenum external_format, - GLenum internal_format) { - switch (external_format) { - case GL_RGBA: - return internal_format == GL_RGBA8; - default: - return false; - } -} - -static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src, - int width) { - for (; width >= 4; width -= 4, dest += 4, src += 4) { - U32 p = unaligned_load<U32>(src); - U32 rb = p & 0x00FF00FF; - unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); - } - for (; width > 0; width--, dest++, src++) { - uint32_t p = *src; - uint32_t rb = p & 0x00FF00FF; - *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); - } -} - -static void convert_copy(GLenum external_format, GLenum internal_format, - uint8_t* dst_buf, size_t dst_stride, - const uint8_t* src_buf, size_t src_stride, - size_t width, size_t height) { - switch (external_format) { - case GL_RGBA: - if (internal_format == GL_RGBA8) { - for (; height; height--) { - copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf, - width); - dst_buf += dst_stride; - src_buf += src_stride; - } - return; - } - break; - default: - break; - } - size_t row_bytes = width * bytes_for_internal_format(internal_format); - for (; height; height--) { - memcpy(dst_buf, src_buf, row_bytes); - dst_buf += dst_stride; - src_buf += src_stride; +void TexStorage3D(GLenum target, GLint levels, GLenum internal_format, + GLsizei width, GLsizei height, GLsizei depth) { + assert(levels == 1); + Texture& t = ctx->textures[ctx->get_binding(target)]; + internal_format = remap_internal_format(internal_format); + bool changed = false; + if (t.width != width || t.height != height || t.depth != depth || + t.internal_format != internal_format) { + changed = true; + t.internal_format = internal_format; + t.width = width; + t.height = height; + t.depth = depth; } + t.disable_delayed_clear(); + t.allocate(changed); } -static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width, - GLsizei height, void* buf = 
nullptr, - GLsizei stride = 0, GLsizei min_width = 0, - GLsizei min_height = 0) { - GLenum internal_format = remap_internal_format(external_format); +static void set_tex_storage(Texture& t, GLenum internal_format, + GLsizei width, GLsizei height, + bool should_free = true, void* buf = nullptr, + GLsizei min_width = 0, GLsizei min_height = 0) { + internal_format = remap_internal_format(internal_format); bool changed = false; - if (t.width != width || t.height != height || + if (t.width != width || t.height != height || t.depth != 0 || t.internal_format != internal_format) { changed = true; t.internal_format = internal_format; t.width = width; t.height = height; + t.depth = 0; } - // If we are changed from an internally managed buffer to an externally - // supplied one or vice versa, ensure that we clean up old buffer state. - // However, if we have to convert the data from a non-native format, then - // always treat it as internally managed since we will need to copy to an - // internally managed native format buffer. - bool should_free = buf == nullptr || format_requires_conversion( - external_format, internal_format); - if (t.should_free() != should_free) { - changed = true; - t.cleanup(); + if (t.should_free() != should_free || buf != nullptr) { + if (t.should_free()) { + t.cleanup(); + } t.set_should_free(should_free); - } - // If now an external buffer, explicitly set it... - if (!should_free) { - t.set_buffer(buf, stride); + t.buf = (char*)buf; + t.buf_size = 0; } t.disable_delayed_clear(); t.allocate(changed, min_width, min_height); - // If we have a buffer that needs format conversion, then do that now. - if (buf && should_free) { - convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(), - (const uint8_t*)buf, stride, width, height); - } } -extern "C" { - void TexStorage2D(GLenum target, GLint levels, GLenum internal_format, GLsizei width, GLsizei height) { assert(levels == 1); @@ -1701,19 +1319,12 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { if (format == GL_RED && ty == GL_UNSIGNED_BYTE) { return GL_R8; } else if ((format == GL_RGBA || format == GL_BGRA) && - (ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV)) { + ty == GL_UNSIGNED_BYTE) { return GL_RGBA8; } else if (format == GL_RGBA && ty == GL_FLOAT) { return GL_RGBA32F; } else if (format == GL_RGBA_INTEGER && ty == GL_INT) { return GL_RGBA32I; - } else if (format == GL_RG && ty == GL_UNSIGNED_BYTE) { - return GL_RG8; - } else if (format == GL_RGB_422_APPLE && - ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) { - return GL_RGB_RAW_422_APPLE; - } else if (format == GL_RED && ty == GL_UNSIGNED_SHORT) { - return GL_R16; } else { debugf("unknown internal format for format %x, type %x\n", format, ty); assert(false); @@ -1721,6 +1332,20 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { } } +static inline void copy_bgra8_to_rgba8(uint32_t* dest, uint32_t* src, + int width) { + for (; width >= 4; width -= 4, dest += 4, src += 4) { + U32 p = unaligned_load<U32>(src); + U32 rb = p & 0x00FF00FF; + unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); + } + for (; width > 0; width--, dest++, src++) { + uint32_t p = *src; + uint32_t rb = p & 0x00FF00FF; + *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); + } +} + static Buffer* get_pixel_pack_buffer() { return ctx->pixel_pack_buffer_binding ? 
&ctx->buffers[ctx->pixel_pack_buffer_binding] @@ -1750,10 +1375,7 @@ static void* get_pixel_unpack_buffer_data(void* data) { void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } data = get_pixel_unpack_buffer_data(data); if (!data) return; Texture& t = ctx->textures[ctx->get_binding(target)]; @@ -1765,33 +1387,84 @@ void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei row_length = ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width; assert(t.internal_format == internal_format_for_data(format, ty)); - int src_bpp = format_requires_conversion(format, t.internal_format) - ? bytes_for_internal_format(format) - : t.bpp(); - if (!src_bpp || !t.buf) return; - convert_copy(format, t.internal_format, - (uint8_t*)t.sample_ptr(xoffset, yoffset), t.stride(), - (const uint8_t*)data, row_length * src_bpp, width, height); + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + size_t dest_stride = t.stride(bpp); + char* dest = t.sample_ptr(xoffset, yoffset, 0, bpp, dest_stride); + char* src = (char*)data; + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } } void TexImage2D(GLenum target, GLint level, GLint internal_format, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } assert(border == 0); TexStorage2D(target, 1, internal_format, width, height); TexSubImage2D(target, 0, 0, 0, width, height, format, ty, data); } +void TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + data = get_pixel_unpack_buffer_data(data); + if (!data) return; + Texture& t = ctx->textures[ctx->get_binding(target)]; + prepare_texture(t); + assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width); + GLsizei row_length = + ctx->unpack_row_length != 0 ? 
ctx->unpack_row_length : width; + if (format == GL_BGRA) { + assert(ty == GL_UNSIGNED_BYTE); + assert(t.internal_format == GL_RGBA8); + } else { + assert(t.internal_format == internal_format_for_data(format, ty)); + } + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + char* src = (char*)data; + assert(xoffset + width <= t.width); + assert(yoffset + height <= t.height); + assert(zoffset + depth <= t.depth); + size_t dest_stride = t.stride(bpp); + for (int z = 0; z < depth; z++) { + char* dest = t.sample_ptr(xoffset, yoffset, zoffset + z, bpp, dest_stride); + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } + } +} + +void TexImage3D(GLenum target, GLint level, GLint internal_format, + GLsizei width, GLsizei height, GLsizei depth, GLint border, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + assert(border == 0); + TexStorage3D(target, 1, internal_format, width, height, depth); + TexSubImage3D(target, 0, 0, 0, 0, width, height, depth, format, ty, data); +} + void GenerateMipmap(UNUSED GLenum target) { // TODO: support mipmaps } -void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { - Texture& t = ctx->textures[texid]; +void TexParameteri(GLenum target, GLenum pname, GLint param) { + Texture& t = ctx->textures[ctx->get_binding(target)]; switch (pname) { case GL_TEXTURE_WRAP_S: assert(param == GL_CLAMP_TO_EDGE); @@ -1810,10 +1483,6 @@ void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { } } -void TexParameteri(GLenum target, GLenum pname, GLint param) { - SetTextureParameter(ctx->get_binding(target), pname, param); -} - void GenTextures(int n, GLuint* result) { for (int i = 0; i < n; i++) { Texture t; @@ -1839,7 +1508,9 @@ void GenRenderbuffers(int n, GLuint* result) { void Renderbuffer::on_erase() { for (auto* fb : ctx->framebuffers) { if (fb) { - unlink(fb->color_attachment, texture); + if (unlink(fb->color_attachment, texture)) { + fb->layer = 0; + } unlink(fb->depth_attachment, texture); } } @@ -1875,11 +1546,10 @@ void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width, } switch (internal_format) { case GL_DEPTH_COMPONENT: - case GL_DEPTH_COMPONENT16: case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: - // Force depth format to 24 bits... - internal_format = GL_DEPTH_COMPONENT24; + // Force depth format to 16 bits... 
+ internal_format = GL_DEPTH_COMPONENT16; break; } set_tex_storage(ctx->textures[r.texture], internal_format, width, height); @@ -1963,8 +1633,7 @@ void VertexAttribDivisor(GLuint index, GLuint divisor) { va.divisor = divisor; } -void BufferData(GLenum target, GLsizeiptr size, void* data, - UNUSED GLenum usage) { +void BufferData(GLenum target, GLsizeiptr size, void* data, UNUSED GLenum usage) { Buffer& b = ctx->buffers[ctx->get_binding(target)]; if (b.allocate(size)) { ctx->validate_vertex_array = true; @@ -2004,23 +1673,17 @@ GLboolean UnmapBuffer(GLenum target) { void Uniform1i(GLint location, GLint V0) { // debugf("tex: %d\n", (int)ctx->textures.size); - if (vertex_shader) { - vertex_shader->set_uniform_1i(location, V0); - } + vertex_shader->set_uniform_1i(location, V0); } void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) { assert(count == 1); - if (vertex_shader) { - vertex_shader->set_uniform_4fv(location, v); - } + vertex_shader->set_uniform_4fv(location, v); } void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat* value) { assert(count == 1); assert(!transpose); - if (vertex_shader) { - vertex_shader->set_uniform_matrix4fv(location, value); - } + vertex_shader->set_uniform_matrix4fv(location, value); } void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, @@ -2031,7 +1694,24 @@ void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = texture; + fb.layer = 0; + } else if (attachment == GL_DEPTH_ATTACHMENT) { + fb.depth_attachment = texture; + } else { + assert(0); + } +} + +void FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture, + GLint level, GLint layer) { + assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + assert(level == 0); + Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; + if (attachment == GL_COLOR_ATTACHMENT0) { + fb.color_attachment = texture; + fb.layer = layer; } else if (attachment == GL_DEPTH_ATTACHMENT) { + assert(layer == 0); fb.depth_attachment = texture; } else { assert(0); @@ -2046,6 +1726,7 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, Renderbuffer& rb = ctx->renderbuffers[renderbuffer]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = rb.texture; + fb.layer = 0; } else if (attachment == GL_DEPTH_ATTACHMENT) { fb.depth_attachment = rb.texture; } else { @@ -2055,18 +1736,11 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, } // extern "C" -static inline Framebuffer* get_framebuffer(GLenum target, - bool fallback = false) { +static inline Framebuffer* get_framebuffer(GLenum target) { if (target == GL_FRAMEBUFFER) { target = GL_DRAW_FRAMEBUFFER; } - Framebuffer* fb = ctx->framebuffers.find(ctx->get_binding(target)); - if (fallback && !fb) { - // If the specified framebuffer isn't found and a fallback is requested, - // use the default framebuffer. 
- fb = &ctx->framebuffers[0]; - } - return fb; + return ctx->framebuffers.find(ctx->get_binding(target)); } template <typename T> @@ -2092,7 +1766,9 @@ static inline uint32_t clear_chunk(uint16_t value) { return uint32_t(value) | (uint32_t(value) << 16); } -static inline uint32_t clear_chunk(uint32_t value) { return value; } +static inline uint32_t clear_chunk(uint32_t value) { + return value; +} template <typename T> static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { @@ -2115,22 +1791,20 @@ static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { } template <typename T> -static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, - int skip_end = 0) { +static void clear_buffer(Texture& t, T value, int layer, IntRect bb, + int skip_start = 0, int skip_end = 0) { if (!t.buf) return; skip_start = max(skip_start, bb.x0); skip_end = max(skip_end, skip_start); assert(sizeof(T) == t.bpp()); - size_t stride = t.stride(); - // When clearing multiple full-width rows, collapse them into a single large - // "row" to avoid redundant setup from clearing each row individually. Note - // that we can only safely do this if the stride is tightly packed. - if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end && - (t.should_free() || stride == t.width * sizeof(T))) { + size_t stride = t.stride(sizeof(T)); + // When clearing multiple full-width rows, collapse them into a single + // large "row" to avoid redundant setup from clearing each row individually. + if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end) { bb.x1 += (stride / sizeof(T)) * (bb.height() - 1); bb.y1 = bb.y0 + 1; } - T* buf = (T*)t.sample_ptr(bb.x0, bb.y0); + T* buf = (T*)t.sample_ptr(bb.x0, bb.y0, layer, sizeof(T), stride); uint32_t chunk = clear_chunk(value); for (int rows = bb.height(); rows > 0; rows--) { if (bb.x0 < skip_start) { @@ -2144,12 +1818,20 @@ static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, } template <typename T> +static inline void clear_buffer(Texture& t, T value, int layer = 0) { + IntRect bb = ctx->apply_scissor(t.bounds()); + if (bb.width() > 0) { + clear_buffer<T>(t, value, layer, bb); + } +} + +template <typename T> static inline void force_clear_row(Texture& t, int y, int skip_start = 0, int skip_end = 0) { assert(t.buf != nullptr); assert(sizeof(T) == t.bpp()); assert(skip_start <= skip_end); - T* buf = (T*)t.sample_ptr(0, y); + T* buf = (T*)t.sample_ptr(0, y, 0, sizeof(T)); uint32_t chunk = clear_chunk((T)t.clear_val); if (skip_start > 0) { clear_row<T>(buf, skip_start, t.clear_val, chunk); @@ -2188,9 +1870,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { while (mask) { int count = __builtin_ctz(mask); if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; start += count; mask >>= count; @@ -2201,9 +1883,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { } int count = (i + 1) * 32 - start; if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; } } @@ -2220,7 +1902,7 @@ static void prepare_texture(Texture& t, const IntRect* skip) { case GL_R8: 
force_clear<uint8_t>(t, skip); break; - case GL_RG8: + case GL_DEPTH_COMPONENT16: force_clear<uint16_t>(t, skip); break; default: @@ -2230,53 +1912,31 @@ static void prepare_texture(Texture& t, const IntRect* skip) { } } -// Setup a clear on a texture. This may either force an immediate clear or -// potentially punt to a delayed clear, if applicable. -template <typename T> -static void request_clear(Texture& t, T value, const IntRect& scissor) { - // If the clear would require a scissor, force clear anything outside - // the scissor, and then immediately clear anything inside the scissor. - if (!scissor.contains(t.offset_bounds())) { - IntRect skip = scissor - t.offset; - force_clear<T>(t, &skip); - clear_buffer<T>(t, value, skip.intersection(t.bounds())); - } else { - // Do delayed clear for 2D texture without scissor. - t.enable_delayed_clear(value); - } -} - -template <typename T> -static inline void request_clear(Texture& t, T value) { - // If scissoring is enabled, use the scissor rect. Otherwise, just scissor to - // the entire texture bounds. - request_clear(t, value, ctx->scissortest ? ctx->scissor : t.offset_bounds()); -} - extern "C" { -void InitDefaultFramebuffer(int x, int y, int width, int height, int stride, - void* buf) { +void InitDefaultFramebuffer(int width, int height) { Framebuffer& fb = ctx->framebuffers[0]; if (!fb.color_attachment) { GenTextures(1, &fb.color_attachment); + fb.layer = 0; } - // If the dimensions or buffer properties changed, we need to reallocate - // the underlying storage for the color buffer texture. Texture& colortex = ctx->textures[fb.color_attachment]; - set_tex_storage(colortex, GL_RGBA8, width, height, buf, stride); - colortex.offset = IntPoint(x, y); + if (colortex.width != width || colortex.height != height) { + colortex.cleanup(); + set_tex_storage(colortex, GL_RGBA8, width, height); + } if (!fb.depth_attachment) { GenTextures(1, &fb.depth_attachment); } - // Ensure dimensions of the depth buffer match the color buffer. Texture& depthtex = ctx->textures[fb.depth_attachment]; - set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height); - depthtex.offset = IntPoint(x, y); + if (depthtex.width != width || depthtex.height != height) { + depthtex.cleanup(); + set_tex_storage(depthtex, GL_DEPTH_COMPONENT16, width, height); + } } void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, - int32_t* height, int32_t* stride) { + int32_t* height) { Framebuffer* fb = ctx->framebuffers.find(fbo); if (!fb || !fb->color_attachment) { return nullptr; @@ -2285,33 +1945,16 @@ void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, if (flush) { prepare_texture(colortex); } - assert(colortex.offset == IntPoint(0, 0)); - if (width) { - *width = colortex.width; - } - if (height) { - *height = colortex.height; - } - if (stride) { - *stride = colortex.stride(); - } - return colortex.buf ? colortex.sample_ptr(0, 0) : nullptr; -} - -void ResolveFramebuffer(GLuint fbo) { - Framebuffer* fb = ctx->framebuffers.find(fbo); - if (!fb || !fb->color_attachment) { - return; - } - Texture& colortex = ctx->textures[fb->color_attachment]; - prepare_texture(colortex); + *width = colortex.width; + *height = colortex.height; + return colortex.buf ? 
colortex.sample_ptr(0, 0, fb->layer) : nullptr; } void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width, - GLsizei height, GLsizei stride, void* buf, - GLsizei min_width, GLsizei min_height) { + GLsizei height, void* buf, GLsizei min_width, + GLsizei min_height) { Texture& t = ctx->textures[texid]; - set_tex_storage(t, internal_format, width, height, buf, stride, min_width, + set_tex_storage(t, internal_format, width, height, !buf, buf, min_width, min_height); } @@ -2323,170 +1966,57 @@ GLenum CheckFramebufferStatus(GLenum target) { return GL_FRAMEBUFFER_COMPLETE; } -void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, - GLint zoffset, GLsizei width, GLsizei height, - GLsizei depth, GLenum format, GLenum type, - const void* data) { - if (level != 0) { - assert(false); - return; - } - Texture& t = ctx->textures[texture]; - assert(!t.locked); - if (width <= 0 || height <= 0 || depth <= 0) { - return; - } - assert(zoffset == 0 && depth == 1); - IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height}; - if (t.internal_format == GL_DEPTH_COMPONENT24) { - uint32_t value = 0xFFFFFF; - switch (format) { - case GL_DEPTH_COMPONENT: - switch (type) { - case GL_DOUBLE: - value = uint32_t(*(const GLdouble*)data * 0xFFFFFF); - break; - case GL_FLOAT: - value = uint32_t(*(const GLfloat*)data * 0xFFFFFF); - break; - default: - assert(false); - break; - } - break; - default: - assert(false); - break; - } - if (t.cleared() && !scissor.contains(t.offset_bounds())) { - // If we need to scissor the clear and the depth buffer was already - // initialized, then just fill runs for that scissor area. - t.fill_depth_runs(value, scissor); - } else { - // Otherwise, the buffer is either uninitialized or the clear would - // encompass the entire buffer. If uninitialized, we can safely fill - // the entire buffer with any value and thus ignore any scissoring. - t.init_depth_runs(value); - } - return; - } - - uint32_t color = 0xFF000000; - switch (type) { - case GL_FLOAT: { - const GLfloat* f = (const GLfloat*)data; - Float v = {0.0f, 0.0f, 0.0f, 1.0f}; - switch (format) { - case GL_RGBA: - v.w = f[3]; // alpha - FALLTHROUGH; - case GL_RGB: - v.z = f[2]; // blue - FALLTHROUGH; - case GL_RG: - v.y = f[1]; // green - FALLTHROUGH; - case GL_RED: - v.x = f[0]; // red - break; - default: - assert(false); - break; - } - color = bit_cast<uint32_t>(CONVERT(round_pixel(v), U8)); - break; - } - case GL_UNSIGNED_BYTE: { - const GLubyte* b = (const GLubyte*)data; - switch (format) { - case GL_RGBA: - color = (color & ~0xFF000000) | (uint32_t(b[3]) << 24); // alpha - FALLTHROUGH; - case GL_RGB: - color = (color & ~0x00FF0000) | (uint32_t(b[2]) << 16); // blue - FALLTHROUGH; - case GL_RG: - color = (color & ~0x0000FF00) | (uint32_t(b[1]) << 8); // green - FALLTHROUGH; - case GL_RED: - color = (color & ~0x000000FF) | uint32_t(b[0]); // red - break; - default: - assert(false); - break; - } - break; - } - default: - assert(false); - break; - } - - switch (t.internal_format) { - case GL_RGBA8: - // Clear color needs to swizzle to BGRA. 
- request_clear<uint32_t>(t, - (color & 0xFF00FF00) | - ((color << 16) & 0xFF0000) | - ((color >> 16) & 0xFF), - scissor); - break; - case GL_R8: - request_clear<uint8_t>(t, uint8_t(color & 0xFF), scissor); - break; - case GL_RG8: - request_clear<uint16_t>(t, uint16_t(color & 0xFFFF), scissor); - break; - default: - assert(false); - break; - } -} - -void ClearTexImage(GLuint texture, GLint level, GLenum format, GLenum type, - const void* data) { - Texture& t = ctx->textures[texture]; - IntRect scissor = t.offset_bounds(); - ClearTexSubImage(texture, level, scissor.x0, scissor.y0, 0, scissor.width(), - scissor.height(), 1, format, type, data); +static inline bool clear_requires_scissor(Texture& t) { + return ctx->scissortest && !ctx->scissor.contains(t.bounds()); } void Clear(GLbitfield mask) { - Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER); if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) { Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = ctx->scissortest - ? ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - ctx->clearcolor); + if (t.internal_format == GL_RGBA8) { + uint32_t color = ctx->clearcolor; + // If the clear would require a scissor, force clear anything outside + // the scissor, and then immediately clear anything inside the scissor. + if (clear_requires_scissor(t)) { + force_clear<uint32_t>(t, &ctx->scissor); + clear_buffer<uint32_t>(t, color, fb.layer); + } else if (t.depth > 1) { + // Delayed clear is not supported on texture arrays. + t.disable_delayed_clear(); + clear_buffer<uint32_t>(t, color, fb.layer); + } else { + // Do delayed clear for 2D texture without scissor. + t.enable_delayed_clear(color); + } + } else if (t.internal_format == GL_R8) { + uint8_t color = uint8_t((ctx->clearcolor >> 16) & 0xFF); + if (clear_requires_scissor(t)) { + force_clear<uint8_t>(t, &ctx->scissor); + clear_buffer<uint8_t>(t, color, fb.layer); + } else if (t.depth > 1) { + t.disable_delayed_clear(); + clear_buffer<uint8_t>(t, color, fb.layer); + } else { + t.enable_delayed_clear(color); + } + } else { + assert(false); + } } if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) { Texture& t = ctx->textures[fb.depth_attachment]; - IntRect scissor = ctx->scissortest - ? 
ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.depth_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_DEPTH_COMPONENT, - GL_DOUBLE, &ctx->cleardepth); + assert(t.internal_format == GL_DEPTH_COMPONENT16); + uint16_t depth = uint16_t(0xFFFF * ctx->cleardepth) - 0x8000; + if (clear_requires_scissor(t)) { + force_clear<uint16_t>(t, &ctx->scissor); + clear_buffer<uint16_t>(t, depth); + } else { + t.enable_delayed_clear(depth); + } } } -void ClearColorRect(GLuint fbo, GLint xoffset, GLint yoffset, GLsizei width, - GLsizei height, GLfloat r, GLfloat g, GLfloat b, - GLfloat a) { - GLfloat color[] = {r, g, b, a}; - Framebuffer& fb = ctx->framebuffers[fbo]; - Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = - IntRect{xoffset, yoffset, xoffset + width, yoffset + height}.intersection( - t.offset_bounds()); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - color); -} - void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, const GLenum* attachments) { Framebuffer* fb = get_framebuffer(target); @@ -2497,7 +2027,7 @@ void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, switch (attachments[i]) { case GL_DEPTH_ATTACHMENT: { Texture& t = ctx->textures[fb->depth_attachment]; - t.set_cleared(false); + t.disable_delayed_clear(); break; } case GL_COLOR_ATTACHMENT0: { @@ -2516,58 +2046,40 @@ void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER || - format == GL_BGRA || format == GL_RG); + format == GL_BGRA); Texture& t = ctx->textures[fb->color_attachment]; if (!t.buf) return; prepare_texture(t); // debugf("read pixels %d, %d, %d, %d from fb %d with format %x\n", x, y, // width, height, ctx->read_framebuffer_binding, t.internal_format); - x -= t.offset.x; - y -= t.offset.y; - assert(x >= 0 && y >= 0); assert(x + width <= t.width); assert(y + height <= t.height); if (internal_format_for_data(format, type) != t.internal_format) { debugf("mismatched format for read pixels: %x vs %x\n", t.internal_format, internal_format_for_data(format, type)); assert(false); - return; - } - // Only support readback conversions that are reversible - assert(!format_requires_conversion(format, t.internal_format) || - bytes_for_internal_format(format) == t.bpp()); - uint8_t* dest = (uint8_t*)data; - size_t destStride = width * t.bpp(); - if (y < 0) { - dest += -y * destStride; - height += y; - y = 0; - } - if (y + height > t.height) { - height = t.height - y; - } - if (x < 0) { - dest += -x * t.bpp(); - width += x; - x = 0; } - if (x + width > t.width) { - width = t.width - x; - } - if (width <= 0 || height <= 0) { - return; + int bpp = t.bpp(); + char* dest = (char*)data; + size_t src_stride = t.stride(bpp); + char* src = t.sample_ptr(x, y, fb->layer, bpp, src_stride); + for (; height > 0; height--) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += width * bpp; + src += src_stride; } - convert_copy(format, t.internal_format, dest, destStride, - (const uint8_t*)t.sample_ptr(x, y), t.stride(), width, height); } void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint 
dstName, - GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, - GLint dstY, GLint dstZ, GLsizei srcWidth, - GLsizei srcHeight, GLsizei srcDepth) { + GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, GLint dstY, + GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, + GLsizei srcDepth) { assert(srcLevel == 0 && dstLevel == 0); - assert(srcZ == 0 && srcDepth == 1 && dstZ == 0); if (srcTarget == GL_RENDERBUFFER) { Renderbuffer& rb = ctx->renderbuffers[srcName]; srcName = rb.texture; @@ -2581,44 +2093,532 @@ void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, prepare_texture(srctex); Texture& dsttex = ctx->textures[dstName]; if (!dsttex.buf) return; - assert(!dsttex.locked); IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; prepare_texture(dsttex, &skip); assert(srctex.internal_format == dsttex.internal_format); assert(srcWidth >= 0); assert(srcHeight >= 0); + assert(srcDepth >= 0); assert(srcX + srcWidth <= srctex.width); assert(srcY + srcHeight <= srctex.height); + assert(srcZ + srcDepth <= max(srctex.depth, 1)); assert(dstX + srcWidth <= dsttex.width); assert(dstY + srcHeight <= dsttex.height); + assert(dstZ + srcDepth <= max(dsttex.depth, 1)); int bpp = srctex.bpp(); - int src_stride = srctex.stride(); - int dest_stride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstX, dstY); - char* src = srctex.sample_ptr(srcX, srcY); - for (int y = 0; y < srcHeight; y++) { - memcpy(dest, src, srcWidth * bpp); - dest += dest_stride; - src += src_stride; + int src_stride = srctex.stride(bpp); + int dest_stride = dsttex.stride(bpp); + for (int z = 0; z < srcDepth; z++) { + char* dest = dsttex.sample_ptr(dstX, dstY, dstZ + z, bpp, dest_stride); + char* src = srctex.sample_ptr(srcX, srcY, srcZ + z, bpp, src_stride); + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } } } -void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, - GLint yoffset, GLint x, GLint y, GLsizei width, +void CopyTexSubImage3D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height) { assert(level == 0); Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; - CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D, 0, x, y, 0, - ctx->get_binding(target), GL_TEXTURE_2D, 0, xoffset, yoffset, - 0, width, height, 1); + CopyImageSubData(fb->color_attachment, GL_TEXTURE_3D, 0, x, y, fb->layer, + ctx->get_binding(target), GL_TEXTURE_3D, 0, xoffset, yoffset, + zoffset, width, height, 1); +} + +void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint x, GLint y, GLsizei width, GLsizei height) { + assert(level == 0); + Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!fb) return; + CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D_ARRAY, 0, x, y, + fb->layer, ctx->get_binding(target), GL_TEXTURE_2D_ARRAY, 0, + xoffset, yoffset, 0, width, height, 1); } } // extern "C" -#include "blend.h" -#include "composite.h" -#include "swgl_ext.h" +using PackedRGBA8 = V16<uint8_t>; +using WideRGBA8 = V16<uint16_t>; +using HalfRGBA8 = V8<uint16_t>; + +static inline WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } + +static inline PackedRGBA8 pack(WideRGBA8 p) { +#if USE_SSE2 + return _mm_packus_epi16(lowHalf(p), highHalf(p)); +#elif USE_NEON + return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p))); +#else + return CONVERT(p, PackedRGBA8); +#endif +} + 
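// Editorial sketch (illustrative only, not part of this patch): unpack()
// widens each 8-bit channel of a 4-pixel RGBA8 chunk into a 16-bit lane so
// the blend math below has headroom, and pack() narrows back down with
// unsigned saturation, as _mm_packus_epi16 / vqmovn_u16 do. A hypothetical
// per-channel scalar model, assuming the in-range values produced by the
// blend helpers:
UNUSED static inline uint16_t scalar_unpack_channel(uint8_t packed) {
  return packed;  // zero-extend an 8-bit channel into a 16-bit lane
}
UNUSED static inline uint8_t scalar_pack_channel(uint16_t wide) {
  return wide > 255 ? 255 : uint8_t(wide);  // saturate back down to 8 bits
}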
+static inline HalfRGBA8 packRGBA8(I32 a, I32 b) { +#if USE_SSE2 + return _mm_packs_epi32(a, b); +#elif USE_NEON + return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); +#else + return CONVERT(combine(a, b), HalfRGBA8); +#endif +} + +using PackedR8 = V4<uint8_t>; +using WideR8 = V4<uint16_t>; + +static inline WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } + +static inline WideR8 packR8(I32 a) { +#if USE_SSE2 + return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovun_s32(a); +#else + return CONVERT(a, WideR8); +#endif +} + +static inline PackedR8 pack(WideR8 p) { +#if USE_SSE2 + auto m = expand(p); + auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m)); + return SHUFFLE(r, r, 0, 1, 2, 3); +#elif USE_NEON + return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p)))); +#else + return CONVERT(p, PackedR8); +#endif +} + +using ZMask4 = V4<int16_t>; +using ZMask8 = V8<int16_t>; + +static inline PackedRGBA8 unpack(ZMask4 mask, uint32_t*) { + return bit_cast<PackedRGBA8>(mask.xxyyzzww); +} + +static inline WideR8 unpack(ZMask4 mask, uint8_t*) { + return bit_cast<WideR8>(mask); +} + +#if USE_SSE2 +# define ZMASK_NONE_PASSED 0xFFFF +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask8 mask) { + return _mm_movemask_epi8(mask); +} +static inline uint32_t zmask_code(ZMask4 mask) { + return zmask_code(mask.xyzwxyzw); +} +#else +using ZMask4Code = V4<uint8_t>; +using ZMask8Code = V8<uint8_t>; +# define ZMASK_NONE_PASSED 0xFFFFFFFFU +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask4 mask) { + return bit_cast<uint32_t>(CONVERT(mask, ZMask4Code)); +} +static inline uint32_t zmask_code(ZMask8 mask) { + return zmask_code( + ZMask4((U16(lowHalf(mask)) >> 12) | (U16(highHalf(mask)) << 4))); +} +#endif + +template <int FUNC, bool MASK> +static ALWAYS_INLINE int check_depth8(uint16_t z, uint16_t* zbuf, + ZMask8& outmask) { + ZMask8 dest = unaligned_load<ZMask8>(zbuf); + ZMask8 src = int16_t(z); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask8 mask = FUNC == GL_LEQUAL ? + // GL_LEQUAL: Not(LessEqual) = Greater + ZMask8(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask8(src >= dest); + switch (zmask_code(mask)) { + case ZMASK_NONE_PASSED: + return 0; + case ZMASK_ALL_PASSED: + if (MASK) { + unaligned_store(zbuf, src); + } + return -1; + default: + if (MASK) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return 1; + } +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(ZMask4 src, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask4 mask = ctx->depthfunc == GL_LEQUAL + ? 
+ // GL_LEQUAL: Not(LessEqual) = Greater + ZMask4(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask4(src >= dest); + if (!FULL_SPANS) { + mask |= ZMask4(span) < ZMask4{1, 2, 3, 4}; + } + if (zmask_code(mask) == ZMASK_NONE_PASSED) { + return false; + } + if (!DISCARD && ctx->depthmask) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return true; +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(uint16_t z, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + return check_depth4<FULL_SPANS, DISCARD>(ZMask4(int16_t(z)), zbuf, outmask, + span); +} + +template <typename T> +static inline ZMask4 packZMask4(T a) { +#if USE_SSE2 + return lowHalf(bit_cast<ZMask8>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovn_s32(a); +#else + return CONVERT(a, ZMask4); +#endif +} + +static ALWAYS_INLINE ZMask4 packDepth() { + return packZMask4(cast(fragment_shader->gl_FragCoord.z * 0xFFFF) - 0x8000); +} + +static ALWAYS_INLINE void discard_depth(ZMask4 src, uint16_t* zbuf, + ZMask4 mask) { + if (ctx->depthmask) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + mask |= packZMask4(fragment_shader->isPixelDiscarded); + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } +} + +static ALWAYS_INLINE void discard_depth(uint16_t z, uint16_t* zbuf, + ZMask4 mask) { + discard_depth(ZMask4(int16_t(z)), zbuf, mask); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) { + ivec4 i = round_pixel(v); + HalfRGBA8 xz = packRGBA8(i.z, i.x); + HalfRGBA8 yw = packRGBA8(i.y, i.w); + HalfRGBA8 xy = zipLow(xz, yw); + HalfRGBA8 zw = zipHigh(xz, yw); + HalfRGBA8 lo = zip2Low(xy, zw); + HalfRGBA8 hi = zip2High(xy, zw); + return combine(lo, hi); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) { + I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}); + HalfRGBA8 c = packRGBA8(i, i); + return combine(c, c); +} + +static inline WideRGBA8 pack_pixels_RGBA8() { + return pack_pixels_RGBA8(fragment_shader->gl_FragColor); +} + +template <typename V> +static inline PackedRGBA8 pack_span(uint32_t*, const V& v) { + return pack(pack_pixels_RGBA8(v)); +} + +static inline PackedRGBA8 pack_span(uint32_t*) { + return pack(pack_pixels_RGBA8()); +} + +// (x*y + x) >> 8, cheap approximation of (x*y) / 255 +template <typename T> +static inline T muldiv255(T x, T y) { + return (x * y + x) >> 8; +} + +// Byte-wise addition for when x or y is a signed 8-bit value stored in the +// low byte of a larger type T only with zeroed-out high bits, where T is +// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used +// upon signed operands, using up all the precision in a 16 bit integer, and +// potentially losing the sign bit in the last >> 8 shift. Due to the +// properties of two's complement arithmetic, even though we've discarded the +// sign bit, we can still represent a negative number under addition (without +// requiring any extra sign bits), just that any negative number will behave +// like a large unsigned number under addition, generating a single carry bit +// on overflow that we need to discard. Thus, just doing a byte-wise add will +// overflow without the troublesome carry, giving us only the remaining 8 low +// bits we actually need while keeping the high bits at zero. 
+template <typename T> +static inline T addlow(T x, T y) { + typedef VectorType<uint8_t, sizeof(T)> bytes; + return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y)); +} + +static inline WideRGBA8 alphas(WideRGBA8 c) { + return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); +} + +static inline WideRGBA8 blend_pixels_RGBA8(PackedRGBA8 pdst, WideRGBA8 src) { + WideRGBA8 dst = unpack(pdst); + const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, + 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0}; + const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, + 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; + const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, + 0, 0, 0, 255, 0, 0, 0, 255}; + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE): + // dst + src.a*(src.rgb1 - dst.rgb0) + // use addlow for signed overflow + return addlow(dst, + muldiv255(alphas(src), (src | ALPHA_OPAQUE) - (dst & RGB_MASK))); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): + return dst - muldiv255(dst, src); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): + return dst - (muldiv255(dst, src) & RGB_MASK); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): + return dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - (muldiv255(dst, src) & ALPHA_MASK); + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + case BLEND_KEY(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): + // src*(1-dst.a) + dst*1 = src - src*dst.a + dst + return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); + case BLEND_KEY(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): + // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst) + // use addlow for signed overflow + return addlow(dst, + muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst)); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor); + return src + dst - muldiv255(dst, secondary); + } + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf, PackedRGBA8 mask) { + PackedRGBA8 dst = unaligned_load<PackedRGBA8>(buf); + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(dst, r); + if (DISCARD) mask |= bit_cast<PackedRGBA8>(fragment_shader->isPixelDiscarded); + unaligned_store(buf, (mask & dst) | (~mask & pack(r))); +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint32_t* buf) { + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), r); + unaligned_store(buf, pack(r)); +} + +static inline PackedRGBA8 span_mask_RGBA8(int span) { + return bit_cast<PackedRGBA8>(I32(span) < I32{1, 2, 3, 4}); +} + +static inline PackedRGBA8 span_mask(uint32_t*, int span) { + return span_mask_RGBA8(span); +} + +static inline WideR8 pack_pixels_R8(Float c) { + return packR8(round_pixel(c)); +} + +static inline WideR8 pack_pixels_R8() { + return pack_pixels_R8(fragment_shader->gl_FragColor.x); +} + 
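// Editorial sketch (illustrative only, not part of this patch): hypothetical
// scalar models of the muldiv255/addlow helpers above, assuming 8-bit color
// values. muldiv255 uses (x*y + x) >> 8 as a cheap stand-in for x*y/255 and
// is exact at the endpoints, e.g. muldiv255(37, 255) = (9435 + 37) >> 8 = 37.
// addlow adds byte-wise, so a blend term that went "negative" (say -3,
// stored as 0xFD in a 16-bit lane with its high byte zeroed) still produces
// the right low byte: 0xFD + 0x0A = 0x107, and dropping the carry leaves
// 0x07 = 10 - 3.
UNUSED static inline uint16_t scalar_muldiv255(uint16_t x, uint16_t y) {
  return (x * y + x) >> 8;  // approximates (x * y) / 255
}
UNUSED static inline uint16_t scalar_addlow(uint16_t x, uint16_t y) {
  // Sum each byte modulo 256 so carries never cross byte boundaries,
  // mirroring the byte-wise vector add in addlow.
  return uint16_t(((x + y) & 0xFF) | ((((x >> 8) + (y >> 8)) & 0xFF) << 8));
}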
+template <typename C> +static inline PackedR8 pack_span(uint8_t*, C c) { + return pack(pack_pixels_R8(c)); +} + +static inline PackedR8 pack_span(uint8_t*) { return pack(pack_pixels_R8()); } + +static inline WideR8 blend_pixels_R8(WideR8 dst, WideR8 src) { + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf, WideR8 mask) { + WideR8 dst = unpack(unaligned_load<PackedR8>(buf)); + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(dst, r); + if (DISCARD) mask |= packR8(fragment_shader->isPixelDiscarded); + unaligned_store(buf, pack((mask & dst) | (~mask & r))); +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint8_t* buf) { + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), r); + unaligned_store(buf, pack(r)); +} + +static inline WideR8 span_mask_R8(int span) { + return bit_cast<WideR8>(WideR8(span) < WideR8{1, 2, 3, 4}); +} + +static inline WideR8 span_mask(uint8_t*, int span) { + return span_mask_R8(span); +} + +template <bool DISCARD, bool W, typename P, typename M> +static inline void commit_output(P* buf, M mask) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf, mask); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf, int span) { + commit_output<DISCARD, W>(buf, span_mask(buf, span)); +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf) { + ZMask4 zmask; + if (check_depth4<true, DISCARD>(z, zbuf, zmask)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } else { + fragment_shader->skip<W>(); + } +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf, int span) { + ZMask4 zmask; + if (check_depth4<false, DISCARD>(z, zbuf, zmask, span)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } +} + +static inline void commit_span(uint32_t* buf, PackedRGBA8 r) { + if (blend_key) + r = pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), unpack(r))); + unaligned_store(buf, r); +} + +UNUSED static inline void commit_solid_span(uint32_t* buf, PackedRGBA8 r, + int len) { + if (blend_key) { + auto src = unpack(r); + for (uint32_t* end = &buf[len]; buf < end; buf += 4) { + unaligned_store( + buf, pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), src))); + } + } else { + fill_n(buf, len, bit_cast<U32>(r).x); + } +} + +UNUSED static inline void commit_texture_span(uint32_t* buf, uint32_t* src, + int len) { + if (blend_key) { + for (uint32_t* end = &buf[len]; buf < end; buf += 4, src += 4) { + PackedRGBA8 r = unaligned_load<PackedRGBA8>(src); + unaligned_store(buf, pack(blend_pixels_RGBA8( + unaligned_load<PackedRGBA8>(buf), unpack(r)))); + } + } else { + memcpy(buf, src, len * sizeof(uint32_t)); + } +} + +static inline void 
commit_span(uint8_t* buf, PackedR8 r) { + if (blend_key) + r = pack(blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), unpack(r))); + unaligned_store(buf, r); +} + +UNUSED static inline void commit_solid_span(uint8_t* buf, PackedR8 r, int len) { + if (blend_key) { + auto src = unpack(r); + for (uint8_t* end = &buf[len]; buf < end; buf += 4) { + unaligned_store(buf, pack(blend_pixels_R8( + unpack(unaligned_load<PackedR8>(buf)), src))); + } + } else { + fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(r)); + } +} + +#define DISPATCH_DRAW_SPAN(self, buf, len) do { \ + int drawn = self->draw_span(buf, len); \ + if (drawn) self->step_interp_inputs(drawn >> 2); \ + for (buf += drawn; drawn < len; drawn += 4, buf += 4) { \ + run(self); \ + commit_span(buf, pack_span(buf)); \ + } \ +} while (0) + +#include "texture.h" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" @@ -2627,14 +2627,942 @@ void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, #pragma GCC diagnostic ignored "-Wunused-variable" #pragma GCC diagnostic ignored "-Wimplicit-fallthrough" #ifdef __clang__ -# pragma GCC diagnostic ignored "-Wunused-private-field" +#pragma GCC diagnostic ignored "-Wunused-private-field" #else -# pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" #endif #include "load_shader.h" #pragma GCC diagnostic pop -#include "rasterize.h" +typedef vec2_scalar Point2D; +typedef vec4_scalar Point3D; + +struct ClipRect { + float x0; + float y0; + float x1; + float y1; + + ClipRect(const IntRect& i) : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {} + ClipRect(Texture& t) : ClipRect(ctx->apply_scissor(t.bounds())) {} + + template <typename P> + bool overlaps(int nump, const P* p) const { + // Generate a mask of which side of the clip rect all of a polygon's points + // fall inside of. This is a cheap conservative estimate of whether the + // bounding box of the polygon might overlap the clip rect, rather than an + // exact test that would require multiple slower line intersections. + int sides = 0; + for (int i = 0; i < nump; i++) { + sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2; + sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8; + } + return sides == 0xF; + } +}; + +// Helper function for drawing 8-pixel wide chunks of a span with depth buffer. +// Using 8-pixel chunks maximizes use of 16-bit depth values in 128-bit wide +// SIMD register. However, since fragment shaders process only 4 pixels per +// invocation, we need to run fragment shader twice for every 8 pixel batch +// of results we get from the depth test. Perspective is not supported. +template <int FUNC, bool MASK, typename P> +static inline void draw_depth_span(uint16_t z, P* buf, uint16_t* depth, + int span) { + int skip = 0; + // Check if the fragment shader has an optimized draw specialization. + if (fragment_shader->has_draw_span(buf)) { + // The loop tries to accumulate runs of pixels that passed (len) and + // runs of pixels that failed (skip). This allows it to pass the largest + // possible span in between changes in depth pass or fail status to the + // fragment shader's draw specialer. + int len = 0; + do { + ZMask8 zmask; + // Process depth in 8-pixel chunks. + switch (check_depth8<FUNC, MASK>(z, depth, zmask)) { + case 0: // All pixels failed the depth test. + if (len) { + // Flush out passed pixels. + fragment_shader->draw_span(buf - len, len); + len = 0; + } + // Accumulate 2 skipped chunks. 
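// (Editorial note: skip counts 4-pixel fragment shader chunks, so each
// 8-pixel depth chunk that fails the test adds 2 to it.)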
+ skip += 2; + break; + case -1: // All pixels passed the depth test. + if (skip) { + // Flushed out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Accumulate 8 passed pixels. + len += 8; + break; + default: // Mixture of pass and fail results. + if (len) { + // Flush out any passed pixels. + fragment_shader->draw_span(buf - len, len); + len = 0; + } else if (skip) { + // Flush out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Run fragment shader on first 4 depth results. + commit_output<false, false>(buf, unpack(lowHalf(zmask), buf)); + // Run fragment shader on next 4 depth results. + commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf)); + break; + } + // Advance to next 8 pixels... + buf += 8; + depth += 8; + span -= 8; + } while (span >= 8); + // Flush out any remaining passed pixels. + if (len) { + fragment_shader->draw_span(buf - len, len); + } + } else { + // No draw specialization, so we can use a simpler loop here that just + // accumulates depth failures, but otherwise invokes fragment shader + // immediately on depth pass. + do { + ZMask8 zmask; + // Process depth in 8-pixel chunks. + switch (check_depth8<FUNC, MASK>(z, depth, zmask)) { + case 0: // All pixels failed the depth test. + // Accumulate 2 skipped chunks. + skip += 2; + break; + case -1: // All pixels passed the depth test. + if (skip) { + // Flush out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Run the fragment shader for two 4-pixel chunks. + commit_output<false, false>(buf); + commit_output<false, false>(buf + 4); + break; + default: // Mixture of pass and fail results. + if (skip) { + // Flush out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Run fragment shader on first 4 depth results. + commit_output<false, false>(buf, unpack(lowHalf(zmask), buf)); + // Run fragment shader on next 4 depth results. + commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf)); + break; + } + // Advance to next 8 pixels... + buf += 8; + depth += 8; + span -= 8; + } while (span >= 8); + } + // Flush out any remaining skipped chunks. + if (skip) { + fragment_shader->skip(skip); + } +} + +// Draw a simple span in 4-pixel wide chunks, optionally using depth. +template <bool DISCARD, bool W, typename P, typename Z> +static ALWAYS_INLINE void draw_span(P* buf, uint16_t* depth, int span, Z z) { + if (depth) { + // Depth testing is enabled. If perspective is used, Z values will vary + // across the span, we use packDepth to generate 16-bit Z values suitable + // for depth testing based on current values from gl_FragCoord.z. + // Otherwise, for the no-perspective case, we just use the provided Z. + // Process 4-pixel chunks first. + for (; span >= 4; span -= 4, buf += 4, depth += 4) { + commit_output<DISCARD, W>(buf, z(), depth); + } + // If there are any remaining pixels, do a partial chunk. + if (span > 0) { + commit_output<DISCARD, W>(buf, z(), depth, span); + } + } else { + // Process 4-pixel chunks first. + for (; span >= 4; span -= 4, buf += 4) { + commit_output<DISCARD, W>(buf); + } + // If there are any remaining pixels, do a partial chunk. + if (span > 0) { + commit_output<DISCARD, W>(buf, span); + } + } +} + +// Draw spans for each row of a given quad (or triangle) with a constant Z +// value. The quad is assumed convex. It is clipped to fall within the given +// clip rect. 
In short, this function rasterizes a quad by first finding a +// top most starting point and then from there tracing down the left and right +// sides of this quad until it hits the bottom, outputting a span between the +// current left and right positions at each row along the way. Points are +// assumed to be ordered in either CW or CCW to support this, but currently +// both orders (CW and CCW) are supported and equivalent. +template <typename P> +static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex, + const ClipRect& clipRect) { + // Only triangles and convex quads supported. + assert(nump == 3 || nump == 4); + Point2D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most (smallest Y) point from which + // rasterization can start. + int top = nump > 3 && p[3].y < p[2].y + ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) + : (p[1].y < p[3].y ? 1 : 3)) + : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) + : (p[1].y < p[2].y ? 1 : 2)); + // Helper to find next index in the points array, walking forward. +#define NEXT_POINT(idx) \ + ({ \ + int cur = (idx) + 1; \ + cur < nump ? cur : 0; \ + }) + // Helper to find the previous index in the points array, walking backward. +#define PREV_POINT(idx) \ + ({ \ + int cur = (idx)-1; \ + cur >= 0 ? cur : nump - 1; \ + }) + // Start looking for "left"-side and "right"-side descending edges starting + // from the determined top point. + int next = NEXT_POINT(top); + int prev = PREV_POINT(top); + if (p[top].y == p[next].y) { + // If the next point is on the same row as the top, then advance one more + // time to the next point and use that as the "left" descending edge. + l0i = next; + l1i = NEXT_POINT(next); + // Assume top and prev form a descending "right" edge, as otherwise this + // will be a collapsed polygon and harmlessly bail out down below. + r0i = top; + r1i = prev; + } else if (p[top].y == p[prev].y) { + // If the prev point is on the same row as the top, then advance to the + // prev again and use that as the "right" descending edge. + // Assume top and next form a non-empty descending "left" edge. + l0i = top; + l1i = next; + r0i = prev; + r1i = PREV_POINT(prev); + } else { + // Both next and prev are on distinct rows from top, so both "left" and + // "right" edges are non-empty/descending. + l0i = r0i = top; + l1i = next; + r1i = prev; + } + // Load the points from the indices. + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // End of left edge + l1 = p[l1i]; // Start of right edge + r1 = p[r1i]; // End of right edge + // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: + // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, + // r1.x, r1.y); + } + + struct Edge + { + float yScale; + float xSlope; + float x; + Interpolants interpSlope; + Interpolants interp; + + Edge(float y, const Point2D& p0, const Point2D& p1, + const Interpolants& i0, const Interpolants& i1) : + // Inverse Y scale for slope calculations. Avoid divide on 0-length edge. + // Later checks below ensure that Y <= p1.y, or otherwise we don't use + // this edge. We just need to guard against Y == p1.y == p0.y. In that + // case, Y - p0.y == 0 and will cancel out the slopes below, except if + // yScale is Inf for some reason (or worse, NaN), which 1/(p1.y-p0.y) + // might produce if we don't bound it. 
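// Editorial note (worked example, not part of this patch): an edge from
// p0 = (10, 2) to p1 = (16, 10) gets yScale = 1/8 and xSlope = 6 * 1/8 =
// 0.75, so starting at the first row center y = 2.5 the edge begins at
// x = 10 + (2.5 - 2) * 0.75 = 10.375 and then advances by 0.75 per row.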
+ yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), + // Calculate dX/dY slope + xSlope((p1.x - p0.x) * yScale), + // Initialize current X based on Y and slope + x(p0.x + (y - p0.y) * xSlope), + // Calculate change in interpolants per change in Y + interpSlope((i1 - i0) * yScale), + // Initialize current interpolants based on Y and slope + interp(i0 + (y - p0.y) * interpSlope) + {} + + void nextRow() { + // step current X and interpolants to next row from slope + x += xSlope; + interp += interpSlope; + } + }; + + // Vertex selection above should result in equal left and right start rows + assert(l0.y == r0.y); + // Find the start y, clip to within the clip rect, and round to row center. + float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f; + // Initialize left and right edges from end points and start Y + Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + // Get pointer to color buffer and depth buffer at current Y + P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P)); + uint16_t* fdepth = + (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t)); + // Loop along advancing Ys, rasterizing spans at each row + float checkY = min(min(l1.y, r1.y), clipRect.y1); + for (;;) { + // Check if we maybe passed edge ends or outside clip rect... + if (y > checkY) { + // If we're outside the clip rect, we're done. + if (y > clipRect.y1) break; + // Helper to find the next non-duplicate vertex that doesn't loop back. +#define STEP_EDGE(e0i, e0, e1i, e1, STEP_POINT, end) \ + for (;;) { \ + /* Set new start of edge to be end of old edge */ \ + e0i = e1i; \ + e0 = e1; \ + /* Set new end of edge to next point */ \ + e1i = STEP_POINT(e1i); \ + e1 = p[e1i]; \ + /* If the edge is descending, use it. */ \ + if (e1.y > e0.y) break; \ + /* If the edge is ascending or crossed the end, we're done. */ \ + if (e1.y < e0.y || e0i == end) return; \ + /* Otherwise, it's a duplicate, so keep searching. */ \ + } + // Check if Y advanced past the end of the left edge + if (y > l1.y) { + // Step to next left edge past Y and reset edge interpolants. + do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y); + left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + } + // Check if Y advanced past the end of the right edge + if (y > r1.y) { + // Step to next right edge past Y and reset edge interpolants. + do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y); + right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + } + // Reset check condition for next time around. + checkY = min(min(l1.y, r1.y), clipRect.y1); + } + // lx..rx form the bounds of the span. WR does not use backface culling, + // so we need to use min/max to support the span in either orientation. + // Clip the span to fall within the clip rect and then round to nearest + // column. + int startx = int(max(min(left.x, right.x), clipRect.x0) + 0.5f); + int endx = int(min(max(left.x, right.x), clipRect.x1) + 0.5f); + // Check if span is non-empty. + int span = endx - startx; + if (span > 0) { + ctx->shaded_rows++; + ctx->shaded_pixels += span; + // Advance color/depth buffer pointers to the start of the span. + P* buf = fbuf + startx; + // Check if the we will need to use depth-buffer or discard on this span. + uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr; + bool use_discard = fragment_shader->use_discard(); + if (depthtex.delay_clear) { + // Delayed clear is enabled for the depth buffer. 
Check if this row + // needs to be cleared. + int yi = int(y); + uint32_t& mask = depthtex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + // The depth buffer is unitialized on this row, but we know it will + // thus be cleared entirely to the clear value. This lets us quickly + // check the constant Z value of the quad against the clear Z to know + // if the entire span passes or fails the depth test all at once. + switch (ctx->depthfunc) { + case GL_LESS: + if (int16_t(z) < int16_t(depthtex.clear_val)) + break; + else + goto next_span; + case GL_LEQUAL: + if (int16_t(z) <= int16_t(depthtex.clear_val)) + break; + else + goto next_span; + } + // If we got here, we passed the depth test. + if (ctx->depthmask) { + // Depth writes are enabled, so we need to initialize depth. + mask |= 1 << (yi & 31); + depthtex.delay_clear--; + if (use_discard) { + // if discard is enabled, we don't know what pixels may be + // written to, so we have to clear the entire row. + force_clear_row<uint16_t>(depthtex, yi); + } else { + // Otherwise, we only need to clear the pixels that fall outside + // the current span on this row. + if (startx > 0 || endx < depthtex.width) { + force_clear_row<uint16_t>(depthtex, yi, startx, endx); + } + // Fill in the span's Z values with constant Z. + clear_buffer<uint16_t>(depthtex, z, 0, + IntRect{startx, yi, endx, yi + 1}); + // We already passed the depth test, so no need to test depth + // any more. + depth = nullptr; + } + } else { + // No depth writes, so don't clear anything, and no need to test. + depth = nullptr; + } + } + } + if (colortex.delay_clear) { + // Delayed clear is enabled for the color buffer. Check if needs clear. + int yi = int(y); + uint32_t& mask = colortex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + mask |= 1 << (yi & 31); + colortex.delay_clear--; + if (depth || blend_key || use_discard) { + // If depth test, blending, or discard is used, old color values + // might be sampled, so we need to clear the entire row to fill it. + force_clear_row<P>(colortex, yi); + } else if (startx > 0 || endx < colortex.width) { + // Otherwise, we only need to clear the row outside of the span. + // The fragment shader will fill the row within the span itself. + force_clear_row<P>(colortex, yi, startx, endx); + } + } + } + // Initialize fragment shader interpolants to current span position. + fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1); + fragment_shader->gl_FragCoord.y = y; + { + // Change in interpolants is difference between current right and left + // edges per the change in right and left X. + Interpolants step = + (right.interp - left.interp) * (1.0f / (right.x - left.x)); + // Advance current interpolants to X at start of span. + Interpolants o = left.interp + step * (startx + 0.5f - left.x); + fragment_shader->init_span(&o, &step, 4.0f); + } + if (!use_discard) { + // Fast paths for the case where fragment discard is not used. + if (depth) { + // If depth is used, we want to process spans in 8-pixel chunks to + // maximize sampling and testing 16-bit depth values within the 128- + // bit width of a SIMD register. + if (span >= 8) { + // Specializations for supported depth functions depending on + // whether depth writes are enabled. 
+ if (ctx->depthfunc == GL_LEQUAL) { + if (ctx->depthmask) + draw_depth_span<GL_LEQUAL, true>(z, buf, depth, span); + else + draw_depth_span<GL_LEQUAL, false>(z, buf, depth, span); + } else { + if (ctx->depthmask) + draw_depth_span<GL_LESS, true>(z, buf, depth, span); + else + draw_depth_span<GL_LESS, false>(z, buf, depth, span); + } + // Advance buffers past processed chunks. + buf += span & ~7; + depth += span & ~7; + span &= 7; + } + } else { + // Check if the fragment shader has an optimized draw specialization. + if (span >= 4 && fragment_shader->has_draw_span(buf)) { + // Draw specialization expects 4-pixel chunks. + int len = span & ~3; + fragment_shader->draw_span(buf, len); + buf += len; + span &= 3; + } + } + draw_span<false, false>(buf, depth, span, [=]{ return z; }); + } else { + // If discard is used, then use slower fallbacks. This should be rare. + // Just needs to work, doesn't need to be too fast yet... + draw_span<true, false>(buf, depth, span, [=]{ return z; }); + } + } + next_span: + // Advance Y and edge interpolants to next row. + y++; + left.nextRow(); + right.nextRow(); + // Advance buffers to next row. + fbuf += colortex.stride(sizeof(P)) / sizeof(P); + fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t); + } +} + +// Draw perspective-correct spans for a convex quad that has been clipped to +// the near and far Z planes, possibly producing a clipped convex polygon with +// more than 4 sides. This assumes the Z value will vary across the spans and +// requires interpolants to factor in W values. This tends to be slower than +// the simpler 2D draw_quad_spans above, especially since we can't optimize the +// depth test easily when Z values, and should be used only rarely if possible. +template <typename P> +static inline void draw_perspective_spans(int nump, Point3D* p, + Interpolants* interp_outs, + Texture& colortex, int layer, + Texture& depthtex, + const ClipRect& clipRect) { + Point3D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most point (smallest Y) from which + // rasterization can start. + int top = 0; + for (int i = 1; i < nump; i++) { + if (p[i].y < p[top].y) { + top = i; + } + } + // Find left-most top point, the start of the left descending edge. + // Advance forward in the points array, searching at most nump points + // in case the polygon is flat. + l0i = top; + for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) { + l0i = i; + } + if (l0i == nump - 1) { + for (int i = 0; i <= top && p[i].y == p[top].y; i++) { + l0i = i; + } + } + // Find right-most top point, the start of the right descending edge. + // Advance backward in the points array, searching at most nump points. + r0i = top; + for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) { + r0i = i; + } + if (r0i == 0) { + for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) { + r0i = i; + } + } + // End of left edge is next point after left edge start. + l1i = NEXT_POINT(l0i); + // End of right edge is prev point after right edge start. + r1i = PREV_POINT(r0i); + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // End of left edge + l1 = p[l1i]; // Start of right edge + r1 = p[r1i]; // End of right edge + } + + struct Edge + { + float yScale; + // Current coordinates for edge. Where in the 2D case of draw_quad_spans, + // it is enough to just track the X coordinate as we advance along the rows, + // for the perspective case we also need to keep track of Z and W. 
For + // simplicity, we just use the full 3D point to track all these coordinates. + Point3D pSlope; + Point3D p; + Interpolants interpSlope; + Interpolants interp; + + Edge(float y, const Point3D& p0, const Point3D& p1, + const Interpolants& i0, const Interpolants& i1) : + // Inverse Y scale for slope calculations. Avoid divide on 0-length edge. + yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), + // Calculate dX/dY slope + pSlope((p1 - p0) * yScale), + // Initialize current coords based on Y and slope + p(p0 + (y - p0.y) * pSlope), + // Crucially, these interpolants must be scaled by the point's 1/w value, + // which allows linear interpolation in a perspective-correct manner. + // This will be canceled out inside the fragment shader later. + // Calculate change in interpolants per change in Y + interpSlope((i1 * p1.w - i0 * p0.w) * yScale), + // Initialize current interpolants based on Y and slope + interp(i0 * p0.w + (y - p0.y) * interpSlope) + {} + + float x() const { return p.x; } + vec2_scalar zw() const { return {p.z, p.w}; } + + void nextRow() { + // step current coords and interpolants to next row from slope + p += pSlope; + interp += interpSlope; + } + }; + + // Vertex selection above should result in equal left and right start rows + assert(l0.y == r0.y); + // Find the start y, clip to within the clip rect, and round to row center. + float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f; + // Initialize left and right edges from end points and start Y + Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + // Get pointer to color buffer and depth buffer at current Y + P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P)); + uint16_t* fdepth = + (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t)); + // Loop along advancing Ys, rasterizing spans at each row + float checkY = min(min(l1.y, r1.y), clipRect.y1); + for (;;) { + // Check if we maybe passed edge ends or outside clip rect... + if (y > checkY) { + // If we're outside the clip rect, we're done. + if (y > clipRect.y1) break; + // Check if Y advanced past the end of the left edge + if (y > l1.y) { + // Step to next left edge past Y and reset edge interpolants. + do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y); + left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + } + // Check if Y advanced past the end of the right edge + if (y > r1.y) { + // Step to next right edge past Y and reset edge interpolants. + do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y); + right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + } + // Reset check condition for next time around. + checkY = min(min(l1.y, r1.y), clipRect.y1); + } + // lx..rx form the bounds of the span. WR does not use backface culling, + // so we need to use min/max to support the span in either orientation. + // Clip the span to fall within the clip rect and then round to nearest + // column. + int startx = int(max(min(left.x(), right.x()), clipRect.x0) + 0.5f); + int endx = int(min(max(left.x(), right.x()), clipRect.x1) + 0.5f); + // Check if span is non-empty. + int span = endx - startx; + if (span > 0) { + ctx->shaded_rows++; + ctx->shaded_pixels += span; + // Advance color/depth buffer pointers to the start of the span. + P* buf = fbuf + startx; + // Check if the we will need to use depth-buffer or discard on this span. + uint16_t* depth = depthtex.buf != nullptr ? 
fdepth + startx : nullptr; + bool use_discard = fragment_shader->use_discard(); + if (depthtex.delay_clear) { + // Delayed clear is enabled for the depth buffer. Check if this row + // needs to be cleared. + int yi = int(y); + uint32_t& mask = depthtex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + mask |= 1 << (yi & 31); + depthtex.delay_clear--; + // Since Z varies across the span, it's easier to just clear the + // row and rely on later depth testing. If necessary, this could be + // optimized to test against the start and end Z values of the span + // here. + force_clear_row<uint16_t>(depthtex, yi); + } + } + if (colortex.delay_clear) { + // Delayed clear is enabled for the color buffer. Check if needs clear. + int yi = int(y); + uint32_t& mask = colortex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + mask |= 1 << (yi & 31); + colortex.delay_clear--; + if (depth || blend_key || use_discard) { + // If depth test, blending, or discard is used, old color values + // might be sampled, so we need to clear the entire row to fill it. + force_clear_row<P>(colortex, yi); + } else if (startx > 0 || endx < colortex.width) { + // Otherwise, we only need to clear the row outside of the span. + // The fragment shader will fill the row within the span itself. + force_clear_row<P>(colortex, yi, startx, endx); + } + } + } + // Initialize fragment shader interpolants to current span position. + fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1); + fragment_shader->gl_FragCoord.y = y; + { + // Calculate the fragment Z and W change per change in fragment X step. + vec2_scalar stepZW = + (right.zw() - left.zw()) * (1.0f / (right.x() - left.x())); + // Calculate initial Z and W values for span start. + vec2_scalar zw = left.zw() + stepZW * (startx + 0.5f - left.x()); + // Set fragment shader's Z and W values so that it can use them to + // cancel out the 1/w baked into the interpolants. + fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x); + fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y); + fragment_shader->stepZW = stepZW * 4.0f; + // Change in interpolants is difference between current right and left + // edges per the change in right and left X. The left and right + // interpolant values were previously multipled by 1/w, so the step and + // initial span values take this into account. + Interpolants step = + (right.interp - left.interp) * (1.0f / (right.x() - left.x())); + // Advance current interpolants to X at start of span. + Interpolants o = left.interp + step * (startx + 0.5f - left.x()); + fragment_shader->init_span<true>(&o, &step, 4.0f); + } + if (!use_discard) { + // No discard is used. Common case. + draw_span<false, true>(buf, depth, span, packDepth); + } else { + // Discard is used. Rare. + draw_span<true, true>(buf, depth, span, packDepth); + } + } + // Advance Y and edge interpolants to next row. + y++; + left.nextRow(); + right.nextRow(); + // Advance buffers to next row. + fbuf += colortex.stride(sizeof(P)) / sizeof(P); + fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t); + } +} + +// Clip a primitive against both sides of a view-frustum axis, producing +// intermediate vertexes with interpolated attributes that will no longer +// intersect the selected axis planes. This assumes the primitive is convex +// and should produce at most N+2 vertexes for each invocation (only in the +// worst case where one point falls outside on each of the opposite sides +// with the rest of the points inside). 
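// Editorial note (worked example, not part of this patch), for the crossing
// offset k computed below: when clipping against the upper plane of an axis
// (side = +1, i.e. C <= W), a previous point with C = 3, W = 1 lies at
// distance 3 - 1 = 2 outside, while a current point with C = 0, W = 1 lies
// at distance 0 - 1 = -1 inside, so k = 2 / (2 - (-1)) = 2/3 and the emitted
// vertex sits two thirds of the way from prev to cur, exactly where C == W
// (here C = W = 1).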
+template <XYZW AXIS> +static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP, + Interpolants* outInterp) { + int numClip = 0; + Point3D prev = p[nump - 1]; + Interpolants prevInterp = interp[nump - 1]; + float prevCoord = prev.select(AXIS); + // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and + // if so, remember which side it is outside of. + int prevSide = prevCoord < -prev.w ? -1 : (prevCoord > prev.w ? 1 : 0); + // Loop through points, finding edges that cross the planes by evaluating + // the side at each point. + for (int i = 0; i < nump; i++) { + Point3D cur = p[i]; + Interpolants curInterp = interp[i]; + float curCoord = cur.select(AXIS); + int curSide = curCoord < -cur.w ? -1 : (curCoord > cur.w ? 1 : 0); + // Check if the previous and current end points are on different sides. + if (curSide != prevSide) { + // One of the edge's end points is outside the plane with the other + // inside the plane. Find the offset where it crosses the plane and + // adjust the point and interpolants to there. + if (prevSide) { + // Edge that was previously outside crosses inside. + // Evaluate plane equation for previous and current end-point + // based on previous side and calculate relative offset. + assert(numClip < nump + 2); + float prevDist = prevCoord - prevSide * prev.w; + float curDist = curCoord - prevSide * cur.w; + float k = prevDist / (prevDist - curDist); + outP[numClip] = prev + (cur - prev) * k; + outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; + numClip++; + } + if (curSide) { + // Edge that was previously inside crosses outside. + // Evaluate plane equation for previous and current end-point + // based on current side and calculate relative offset. + assert(numClip < nump + 2); + float prevDist = prevCoord - curSide * prev.w; + float curDist = curCoord - curSide * cur.w; + float k = prevDist / (prevDist - curDist); + outP[numClip] = prev + (cur - prev) * k; + outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; + numClip++; + } + } + if (!curSide) { + // The current end point is inside the plane, so output point unmodified. + assert(numClip < nump + 2); + outP[numClip] = cur; + outInterp[numClip] = curInterp; + numClip++; + } + prev = cur; + prevInterp = curInterp; + prevCoord = curCoord; + prevSide = curSide; + } + return numClip; +} + +// Helper function to dispatch to perspective span drawing with points that +// have already been transformed and clipped. +static inline void draw_perspective_clipped(int nump, Point3D* p_clip, + Interpolants* interp_clip, + Texture& colortex, int layer, + Texture& depthtex) { + // If polygon is ouside clip rect, nothing to draw. + ClipRect clipRect(colortex); + if (!clipRect.overlaps(nump, p_clip)) { + return; + } + + // Finally draw perspective-correct spans for the polygon. + if (colortex.internal_format == GL_RGBA8) { + draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex, + layer, depthtex, clipRect); + } else if (colortex.internal_format == GL_R8) { + draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex, + layer, depthtex, clipRect); + } else { + assert(false); + } +} + +// Draws a perspective-correct 3D primitive with varying Z value, as opposed +// to a simple 2D planar primitive with a constant Z value that could be +// trivially Z rejected. This requires clipping the primitive against the near +// and far planes to ensure it stays within the valid Z-buffer range. 
The Z +// and W of each fragment of the primitives are interpolated across the +// generated spans and then depth-tested as appropriate. +// Additionally, vertex attributes must be interpolated with perspective- +// correction by dividing by W before interpolation, and then later multiplied +// by W again to produce the final correct attribute value for each fragment. +// This process is expensive and should be avoided if possible for primitive +// batches that are known ahead of time to not need perspective-correction. +static void draw_perspective(int nump, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex) { + // Convert output of vertex shader to screen space. + vec4 pos = vertex_shader->gl_Position; + vec3_scalar scale = + vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; + vec3_scalar offset = + vec3_scalar(ctx->viewport.x0, ctx->viewport.y0, 0.0f) + scale; + if (test_none(pos.z <= -pos.w || pos.z >= pos.w)) { + // No points cross the near or far planes, so no clipping required. + // Just divide coords by W and convert to viewport. + Float w = 1.0f / pos.w; + vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; + Point3D p[4] = { + {screen.x.x, screen.y.x, screen.z.x, w.x}, + {screen.x.y, screen.y.y, screen.z.y, w.y}, + {screen.x.z, screen.y.z, screen.z.z, w.z}, + {screen.x.w, screen.y.w, screen.z.w, w.w} + }; + draw_perspective_clipped(nump, p, interp_outs, colortex, layer, depthtex); + } else { + // Points cross the near or far planes, so we need to clip. + // Start with the original 3 or 4 points... + Point3D p[4] = { + {pos.x.x, pos.y.x, pos.z.x, pos.w.x}, + {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, + {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, + {pos.x.w, pos.y.w, pos.z.w, pos.w.w} + }; + // Clipping can expand the points by 1 for each of 6 view frustum planes. + Point3D p_clip[4 + 6]; + Interpolants interp_clip[4 + 6]; + // Clip against near and far Z planes. + nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip); + // If no points are left inside the view frustum, there's nothing to draw. + if (nump < 3) { + return; + } + // After clipping against only the near and far planes, we might still + // produce points where W = 0, exactly at the camera plane. OpenGL specifies + // that for clip coordinates, points must satisfy: + // -W <= X <= W + // -W <= Y <= W + // -W <= Z <= W + // When Z = W = 0, this is trivially satisfied, but when we transform and + // divide by W below it will produce a divide by 0. Usually we want to only + // clip Z to avoid the extra work of clipping X and Y. We can still project + // points that fall outside the view frustum X and Y so long as Z is valid. + // The span drawing code will then ensure X and Y are clamped to viewport + // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, + // will push W further inside the view frustum so that it is no longer 0, + // allowing us to finally proceed to projecting the points to the screen. + for (int i = 0; i < nump; i++) { + // Found an invalid W, so need to clip against X and Y... + if (p_clip[i].w <= 0.0f) { + // Ping-pong p_clip -> p_tmp -> p_clip. + Point3D p_tmp[4 + 6]; + Interpolants interp_tmp[4 + 6]; + nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp); + if (nump < 3) return; + nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip); + if (nump < 3) return; + // After clipping against X and Y planes, there's still points left + // to draw, so proceed to trying projection now... 
+ break; + } + } + // Divide coords by W and convert to viewport. + for (int i = 0; i < nump; i++) { + float w = 1.0f / p_clip[i].w; + p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w); + } + draw_perspective_clipped(nump, p_clip, interp_clip, colortex, layer, + depthtex); + } +} + +static void draw_quad(int nump, Texture& colortex, int layer, + Texture& depthtex) { + // Run vertex shader once for the primitive's vertices. + // Reserve space for 6 sets of interpolants, in case we need to clip against + // near and far planes in the perspective case. + Interpolants interp_outs[4]; + vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants)); + vec4 pos = vertex_shader->gl_Position; + // Check if any vertex W is different from another. If so, use perspective. + if (test_any(pos.w != pos.w.x)) { + draw_perspective(nump, interp_outs, colortex, layer, depthtex); + return; + } + + // Convert output of vertex shader to screen space. + // Divide coords by W and convert to viewport. + float w = 1.0f / pos.w.x; + vec2 screen = + (pos.sel(X, Y) * w + 1) * 0.5f * + vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) + + vec2_scalar(ctx->viewport.x0, ctx->viewport.y0); + Point2D p[4] = {{screen.x.x, screen.y.x}, + {screen.x.y, screen.y.y}, + {screen.x.z, screen.y.z}, + {screen.x.w, screen.y.w}}; + + // If quad is ouside clip rect, nothing to draw. + ClipRect clipRect(colortex); + if (!clipRect.overlaps(nump, p)) { + return; + } + + // Since the quad is assumed 2D, Z is constant across the quad. + float screenZ = (pos.z.x * w + 1) * 0.5f; + if (screenZ < 0 || screenZ > 1) { + // Z values would cross the near or far plane, so just bail. + return; + } + // Since Z doesn't need to be interpolated, just set the fragment shader's + // Z and W values here, once and for all fragment shader invocations. + // SSE2 does not support unsigned comparison, so bias Z to be negative. + uint16_t z = uint16_t(0xFFFF * screenZ) - 0x8000; + fragment_shader->gl_FragCoord.z = screenZ; + fragment_shader->gl_FragCoord.w = w; + + // Finally draw 2D spans for the quad. Currently only supports drawing to + // RGBA8 and R8 color buffers. + if (colortex.internal_format == GL_RGBA8) { + draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, layer, + depthtex, clipRect); + } else if (colortex.internal_format == GL_R8) { + draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, layer, depthtex, + clipRect); + } else { + assert(false); + } +} void VertexArray::validate() { int last_enabled = -1; @@ -2653,32 +3581,78 @@ void VertexArray::validate() { max_attrib = last_enabled; } +template <typename INDEX> +static inline void draw_elements(GLsizei count, GLsizei instancecount, + Buffer& indices_buf, size_t offset, + VertexArray& v, Texture& colortex, int layer, + Texture& depthtex) { + assert((offset & (sizeof(INDEX) - 1)) == 0); + INDEX* indices = (INDEX*)(indices_buf.buf + offset); + count = min(count, + (GLsizei)((indices_buf.size - offset) / sizeof(INDEX))); + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + if (count == 6 && indices[1] == indices[0] + 1 && + indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) { + assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1); + // Fast path - since there is only a single quad, we only load per-vertex + // attribs once for all instances, as they won't change across instances + // or within an instance. 
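For reference, the index layout this single-quad fast path keys on is two triangles sharing an edge, emitted as base+0,1,2 followed by base+2,1,3; the check above only inspects indices 1, 2 and 5 and asserts the middle two. A minimal standalone version of the pattern test, assuming a plain uint16_t index array rather than the element buffer used here:

#include <cassert>
#include <cstdint>

// Returns true if the six indices starting at `idx` describe a single quad
// as two triangles sharing an edge: base+0,1,2 followed by base+2,1,3.
static bool is_quad_pattern(const uint16_t* idx) {
  uint16_t base = idx[0];
  return idx[1] == base + 1 && idx[2] == base + 2 &&
         idx[3] == base + 2 && idx[4] == base + 1 && idx[5] == base + 3;
}

int main() {
  const uint16_t quad[6] = {8, 9, 10, 10, 9, 11};  // hits the fast path
  const uint16_t fan[6] = {8, 9, 10, 8, 10, 11};   // does not
  assert(is_quad_pattern(quad));
  assert(!is_quad_pattern(fan));
  return 0;
}

An index stream in any other order falls through to the general per-instance loop below, which itself only draws runs whose indices follow the same sequential pattern.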
+ vertex_shader->load_attribs(v.attribs, indices[0], 0, 4); + draw_quad(4, colortex, layer, depthtex); + for (GLsizei instance = 1; instance < instancecount; instance++) { + vertex_shader->load_attribs(v.attribs, indices[0], instance, 0); + draw_quad(4, colortex, layer, depthtex); + } + } else { + for (GLsizei instance = 0; instance < instancecount; instance++) { + for (GLsizei i = 0; i + 3 <= count; i += 3) { + if (indices[i + 1] != indices[i] + 1 || + indices[i + 2] != indices[i] + 2) { + continue; + } + int nump = 3; + if (i + 6 <= count && indices[i + 5] == indices[i] + 3) { + assert(indices[i + 3] == indices[i] + 2 && + indices[i + 4] == indices[i] + 1); + nump = 4; + i += 3; + } + vertex_shader->load_attribs(v.attribs, indices[i], instance, nump); + draw_quad(nump, colortex, layer, depthtex); + } + } + } +} + extern "C" { void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, - GLintptr offset, GLsizei instancecount) { - if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader || - !fragment_shader) { + void* indicesptr, GLsizei instancecount) { + assert(mode == GL_TRIANGLES); + assert(type == GL_UNSIGNED_SHORT || type == GL_UNSIGNED_INT); + if (count <= 0 || instancecount <= 0) { return; } - Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); - if (!fb.color_attachment) { - return; - } + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER); Texture& colortex = ctx->textures[fb.color_attachment]; if (!colortex.buf) { return; } - assert(!colortex.locked); assert(colortex.internal_format == GL_RGBA8 || colortex.internal_format == GL_R8); Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0]; if (depthtex.buf) { - assert(depthtex.internal_format == GL_DEPTH_COMPONENT24); + assert(depthtex.internal_format == GL_DEPTH_COMPONENT16); assert(colortex.width == depthtex.width && colortex.height == depthtex.height); - assert(colortex.offset == depthtex.offset); + } + + Buffer& indices_buf = ctx->buffers[ctx->element_array_buffer_binding]; + size_t offset = (size_t)indicesptr; + if (!indices_buf.buf || offset >= indices_buf.size) { + return; } // debugf("current_vertex_array %d\n", ctx->current_vertex_array); @@ -2689,8 +3663,8 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, v.validate(); } -#ifdef PRINT_TIMINGS - uint64_t start = get_time_value(); +#ifndef NDEBUG + // uint64_t start = get_time_value(); #endif ctx->shaded_rows = 0; @@ -2698,43 +3672,14 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, vertex_shader->init_batch(); - switch (type) { - case GL_UNSIGNED_SHORT: - assert(mode == GL_TRIANGLES); - draw_elements<uint16_t>(count, instancecount, offset, v, colortex, - depthtex); - break; - case GL_UNSIGNED_INT: - assert(mode == GL_TRIANGLES); - draw_elements<uint32_t>(count, instancecount, offset, v, colortex, - depthtex); - break; - case GL_NONE: - // Non-standard GL extension - if element type is GL_NONE, then we don't - // use any element buffer and behave as if DrawArrays was called instead. 
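One detail worth calling out in the new DrawElementsInstanced path above: the indices argument is reinterpreted as a byte offset into the bound element-array buffer, so the draw has to re-derive a typed pointer, check alignment, and clamp the element count to what the buffer actually holds past that offset. A small sketch of that bookkeeping, using hypothetical names and a plain std::vector in place of the Buffer object:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Clamp an element count to the indices that actually fit in the buffer past
// `offset`, mirroring the alignment assert and min() in draw_elements above.
template <typename INDEX>
static const INDEX* resolve_indices(const std::vector<uint8_t>& buf,
                                    size_t offset, int& count) {
  assert((offset & (sizeof(INDEX) - 1)) == 0);  // byte offset must be aligned
  if (offset >= buf.size()) {
    count = 0;
    return nullptr;
  }
  size_t avail = (buf.size() - offset) / sizeof(INDEX);
  count = (int)std::min<size_t>((size_t)count, avail);
  return reinterpret_cast<const INDEX*>(buf.data() + offset);
}

int main() {
  std::vector<uint8_t> buf(10 * sizeof(uint16_t));  // room for 10 indices
  int count = 12;                                    // caller asks for more
  const uint16_t* idx = resolve_indices<uint16_t>(buf, 4, count);
  assert(idx != nullptr && count == 8);  // 4-byte offset leaves 8 indices
  return 0;
}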
- for (GLsizei instance = 0; instance < instancecount; instance++) { - switch (mode) { - case GL_LINES: - for (GLsizei i = 0; i + 2 <= count; i += 2) { - vertex_shader->load_attribs(v.attribs, offset + i, instance, 2); - draw_quad(2, colortex, depthtex); - } - break; - case GL_TRIANGLES: - for (GLsizei i = 0; i + 3 <= count; i += 3) { - vertex_shader->load_attribs(v.attribs, offset + i, instance, 3); - draw_quad(3, colortex, depthtex); - } - break; - default: - assert(false); - break; - } - } - break; - default: - assert(false); - break; + if (type == GL_UNSIGNED_SHORT) { + draw_elements<uint16_t>(count, instancecount, indices_buf, offset, v, + colortex, fb.layer, depthtex); + } else if (type == GL_UNSIGNED_INT) { + draw_elements<uint32_t>(count, instancecount, indices_buf, offset, v, + colortex, fb.layer, depthtex); + } else { + assert(false); } if (ctx->samples_passed_query) { @@ -2742,66 +3687,329 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, q.value += ctx->shaded_pixels; } -#ifdef PRINT_TIMINGS - uint64_t end = get_time_value(); - printf( - "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, " - "%fns/pixel)\n", - double(end - start) / (1000. * 1000.), - ctx->programs[ctx->current_program].impl->get_name(), instancecount, - ctx->shaded_pixels, ctx->shaded_rows, - double(ctx->shaded_pixels) / ctx->shaded_rows, - double(end - start) / max(ctx->shaded_pixels, 1)); +#ifndef NDEBUG + // uint64_t end = get_time_value(); + // debugf("draw(%d): %fms for %d pixels in %d rows (avg %f pixels/row, %f + // ns/pixel)\n", instancecount, double(end - start)/(1000.*1000.), + // ctx->shaded_pixels, ctx->shaded_rows, + // double(ctx->shaded_pixels)/ctx->shaded_rows, double(end - + // start)/max(ctx->shaded_pixels, 1)); #endif } -void Finish() { -#ifdef PRINT_TIMINGS - printf("Finish\n"); -#endif +} // extern "C" + +template <typename P> +static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth, + int span) { + int frac = 0; + for (P* end = dst + span; dst < end; dst++) { + *dst = *src; + // Step source according to width ratio. + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + } } -void MakeCurrent(Context* c) { - if (ctx == c) { +static void scale_blit(Texture& srctex, const IntRect& srcReq, int srcZ, + Texture& dsttex, const IntRect& dstReq, int dstZ, + bool invertY) { + // Cache scaling ratios + int srcWidth = srcReq.width(); + int srcHeight = srcReq.height(); + int dstWidth = dstReq.width(); + int dstHeight = dstReq.height(); + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY); + // Compute valid source bounds + // Scale source to dest, rounding inward to avoid sampling outside source + IntRect srcBounds = srctex.sample_bounds(srcReq) + .scale(srcWidth, srcHeight, dstWidth, dstHeight, true); + // Limit dest sampling bounds to overlap source bounds + dstBounds.intersect(srcBounds); + // Check if sampling bounds are empty + if (dstBounds.is_empty()) { return; } - ctx = c; - setup_program(ctx ? 
ctx->current_program : 0); + // Compute final source bounds from clamped dest sampling bounds + srcBounds = IntRect(dstBounds) + .scale(dstWidth, dstHeight, srcWidth, srcHeight); + // Calculate source and dest pointers from clamped offsets + int bpp = srctex.bpp(); + int srcStride = srctex.stride(bpp); + int destStride = dsttex.stride(bpp); + char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY); + char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ); + // Inverted Y must step downward along dest rows + if (invertY) { + destStride = -destStride; + } + int span = dstBounds.width(); + int frac = 0; + for (int rows = dstBounds.height(); rows > 0; rows--) { + if (srcWidth == dstWidth) { + // No scaling, so just do a fast copy. + memcpy(dest, src, span * bpp); + } else { + // Do scaling with different source and dest widths. + switch (bpp) { + case 1: + scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span); + break; + case 2: + scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span); + break; + case 4: + scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span); + break; + default: + assert(false); + break; + } + } + dest += destStride; + // Step source according to height ratio. + for (frac += srcHeight; frac >= dstHeight; frac -= dstHeight) { + src += srcStride; + } + } +} + +static void linear_row(uint32_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, int srcZOffset, sampler2DArray sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset); + unaligned_store(dest, srcpx); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset); + auto mask = span_mask_RGBA8(span); + auto dstpx = unaligned_load<PackedRGBA8>(dest); + unaligned_store(dest, (mask & dstpx) | (~mask & srcpx)); + } } -Context* CreateContext() { return new Context; } +static void linear_row(uint8_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, int srcZOffset, sampler2DArray sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset); + unaligned_store(dest, pack(srcpx)); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset); + auto mask = span_mask_R8(span); + auto dstpx = unpack(unaligned_load<PackedR8>(dest)); + unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx))); + } +} -void ReferenceContext(Context* c) { - if (!c) { +static void linear_blit(Texture& srctex, const IntRect& srcReq, int srcZ, + Texture& dsttex, const IntRect& dstReq, int dstZ, + bool invertY) { + assert(srctex.internal_format == GL_RGBA8 || + srctex.internal_format == GL_R8); + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY); + // Check if sampling bounds are empty + if (dstBounds.is_empty()) { return; } - ++c->references; + // Initialize sampler for source texture + sampler2DArray_impl sampler; + init_sampler(&sampler, srctex); + init_depth(&sampler, srctex); + sampler.filter = TextureFilter::LINEAR; + // Compute source UVs + int srcZOffset = srcZ * sampler.height_stride; + vec2_scalar srcUV(srcReq.x0, srcReq.y0); + vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), + float(srcReq.height()) / dstReq.height()); + // Skip to clamped source start + srcUV += srcDUV * 
vec2_scalar(dstBounds.x0, dstBounds.y0); + // Offset source UVs to texel centers and scale by lerp precision + srcUV = linearQuantize(srcUV + 0.5f, 128); + srcDUV *= 128.0f; + // Calculate dest pointer from clamped offsets + int bpp = dsttex.bpp(); + int destStride = dsttex.stride(bpp); + char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY); + // Inverted Y must step downward along dest rows + if (invertY) { + destStride = -destStride; + } + int span = dstBounds.width(); + for (int rows = dstBounds.height(); rows > 0; rows--) { + switch (bpp) { + case 1: + linear_row((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset, + &sampler); + break; + case 4: + linear_row((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset, + &sampler); + break; + default: + assert(false); + break; + } + dest += destStride; + srcUV.y += srcDUV.y; + } } -void DestroyContext(Context* c) { - if (!c) { +extern "C" { + +void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, + GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, + GLbitfield mask, GLenum filter) { + assert(mask == GL_COLOR_BUFFER_BIT); + Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!srcfb || srcfb->layer < 0) return; + Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER); + if (!dstfb || dstfb->layer < 0) return; + Texture& srctex = ctx->textures[srcfb->color_attachment]; + if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return; + Texture& dsttex = ctx->textures[dstfb->color_attachment]; + if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return; + if (srctex.internal_format != dsttex.internal_format) { + assert(false); return; } - assert(c->references > 0); - --c->references; - if (c->references > 0) { + // Force flipped Y onto dest coordinates + if (srcY1 < srcY0) { + swap(srcY0, srcY1); + swap(dstY0, dstY1); + } + bool invertY = dstY1 < dstY0; + if (invertY) { + swap(dstY0, dstY1); + } + IntRect srcReq = {srcX0, srcY0, srcX1, srcY1}; + IntRect dstReq = {dstX0, dstY0, dstX1, dstY1}; + if (srcReq.is_empty() || dstReq.is_empty()) { return; } - if (ctx == c) { - MakeCurrent(nullptr); + prepare_texture(srctex); + prepare_texture(dsttex, &dstReq); + if (!srcReq.same_size(dstReq) && filter == GL_LINEAR && + (srctex.internal_format == GL_RGBA8 || + srctex.internal_format == GL_R8)) { + linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer, + invertY); + } else { + scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer, + invertY); } - delete c; } -size_t ReportMemory(size_t (*size_of_op)(void*)) { - size_t size = 0; +void Finish() {} + +void MakeCurrent(void* ctx_ptr) { + ctx = (Context*)ctx_ptr; if (ctx) { - for (auto& t : ctx->textures) { - if (t && t->should_free()) { - size += size_of_op(t->buf); + setup_program(ctx->current_program); + blend_key = ctx->blend ? 
ctx->blend_key : BLEND_KEY_NONE; + } else { + setup_program(0); + blend_key = BLEND_KEY_NONE; + } +} + +void* CreateContext() { return new Context; } + +void DestroyContext(void* ctx_ptr) { + if (!ctx_ptr) { + return; + } + if (ctx == ctx_ptr) { + MakeCurrent(nullptr); + } + delete (Context*)ctx_ptr; +} + +void Composite(GLuint srcId, GLint srcX, GLint srcY, GLsizei srcWidth, + GLsizei srcHeight, GLint dstX, GLint dstY, GLboolean opaque, + GLboolean flip) { + Framebuffer& fb = ctx->framebuffers[0]; + if (!fb.color_attachment) { + return; + } + Texture& srctex = ctx->textures[srcId]; + if (!srctex.buf) return; + prepare_texture(srctex); + Texture& dsttex = ctx->textures[fb.color_attachment]; + if (!dsttex.buf) return; + assert(srctex.bpp() == 4); + const int bpp = 4; + size_t src_stride = srctex.stride(bpp); + size_t dest_stride = dsttex.stride(bpp); + if (srcY < 0) { + dstY -= srcY; + srcHeight += srcY; + srcY = 0; + } + if (dstY < 0) { + srcY -= dstY; + srcHeight += dstY; + dstY = 0; + } + if (srcY + srcHeight > srctex.height) { + srcHeight = srctex.height - srcY; + } + if (dstY + srcHeight > dsttex.height) { + srcHeight = dsttex.height - dstY; + } + IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; + prepare_texture(dsttex, &skip); + char* dest = dsttex.sample_ptr(dstX, flip ? dsttex.height - 1 - dstY : dstY, + fb.layer, bpp, dest_stride); + char* src = srctex.sample_ptr(srcX, srcY, 0, bpp, src_stride); + if (flip) { + dest_stride = -dest_stride; + } + if (opaque) { + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } + } else { + for (int y = 0; y < srcHeight; y++) { + char* end = src + srcWidth * bpp; + while (src + 4 * bpp <= end) { + WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); + WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); + PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + unaligned_store(dest, r); + src += 4 * bpp; + dest += 4 * bpp; } + if (src < end) { + WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); + WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); + U32 r = bit_cast<U32>( + pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)))); + unaligned_store(dest, r.x); + if (src + bpp < end) { + unaligned_store(dest + bpp, r.y); + if (src + 2 * bpp < end) { + unaligned_store(dest + 2 * bpp, r.z); + } + } + dest += end - src; + src = end; + } + dest += dest_stride - srcWidth * bpp; + src += src_stride - srcWidth * bpp; } } - return size; } + } // extern "C" diff --git a/third_party/webrender/swgl/src/gl_defs.h b/third_party/webrender/swgl/src/gl_defs.h index 22219366ecf..c7e87230a3d 100644 --- a/third_party/webrender/swgl/src/gl_defs.h +++ b/third_party/webrender/swgl/src/gl_defs.h @@ -15,27 +15,20 @@ typedef float GLfloat; typedef double GLdouble; typedef uint32_t GLenum; -typedef uint8_t GLboolean; +typedef int32_t GLboolean; typedef uint32_t GLbitfield; typedef int32_t GLsizei; typedef size_t GLsizeiptr; typedef intptr_t GLintptr; -#define GL_FALSE 0 -#define GL_TRUE 1 - -#define GL_NONE 0 - #define GL_NO_ERROR 0 #define GL_RGBA32F 0x8814 #define GL_RGBA8 0x8058 #define GL_R8 0x8229 -#define GL_R16 0x822A #define GL_RGBA32I 0x8D82 #define GL_BGRA8 0x93A1 -#define GL_RG8 0x822B #define GL_BYTE 0x1400 #define GL_UNSIGNED_BYTE 0x1401 @@ -44,7 +37,6 @@ typedef intptr_t GLintptr; #define GL_INT 0x1404 #define GL_UNSIGNED_INT 0x1405 #define GL_FLOAT 0x1406 -#define GL_DOUBLE 0x1408 #define GL_RED 0x1903 #define GL_GREEN 0x1904 
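Stepping back to the Composite() entry point added to gl.cc above: the non-opaque loop applies the premultiplied-alpha source-over operator, dst' = src + dst - dst * srcA / 255, four pixels at a time, where alphas() appears to broadcast the source alpha across the four channels. A per-channel scalar sketch of the same formula (hypothetical helpers, plain uint8_t channels instead of WideRGBA8 lanes):

#include <cassert>
#include <cstdint>

// Scalar stand-in for the muldiv255 helper called above: (x * y) / 255 with
// a cheap rounding correction so that 255 * 255 maps back to 255.
static uint8_t muldiv255(uint32_t x, uint32_t y) {
  uint32_t t = x * y + 128;
  return (uint8_t)((t + (t >> 8)) >> 8);
}

// Premultiplied source-over for one channel: dst' = src + dst - dst*srcA/255.
static uint8_t over_premul(uint8_t src, uint8_t dst, uint8_t srcA) {
  return (uint8_t)(src + dst - muldiv255(dst, srcA));
}

int main() {
  // A fully opaque source replaces the destination channel...
  assert(over_premul(200, 80, 255) == 200);
  // ...and a fully transparent premultiplied source (all zeroes) keeps it.
  assert(over_premul(0, 80, 0) == 80);
  return 0;
}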
@@ -54,7 +46,6 @@ typedef intptr_t GLintptr; #define GL_RGBA 0x1908 #define GL_RGBA_INTEGER 0x8D99 #define GL_BGRA 0x80E1 -#define GL_RG 0x8227 #define GL_DEPTH_COMPONENT 0x1902 #define GL_DEPTH_COMPONENT16 0x81A5 @@ -155,8 +146,6 @@ typedef intptr_t GLintptr; #define GL_ONE_MINUS_SRC1_ALPHA 0x88FB #define GL_FUNC_ADD 0x8006 -#define GL_MIN 0x8007 -#define GL_MAX 0x8008 #define GL_NEVER 0x0200 #define GL_LESS 0x0201 @@ -176,9 +165,6 @@ typedef intptr_t GLintptr; #define GL_VERSION 0x1F02 #define GL_EXTENSIONS 0x1F03 #define GL_NUM_EXTENSIONS 0x821D -#define GL_MINOR_VERSION 0x821C -#define GL_MAJOR_VERSION 0x821B -#define GL_SHADING_LANGUAGE_VERSION 0x8B8C #define GL_POINTS 0x0000 #define GL_LINES 0x0001 @@ -188,29 +174,3 @@ typedef intptr_t GLintptr; #define GL_TRIANGLE_STRIP 0x0005 #define GL_TRIANGLE_FAN 0x0006 #define GL_QUADS 0x0007 - -#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367 - -#define GL_RGB_422_APPLE 0x8A1F -#define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA -#define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB -#define GL_RGB_RAW_422_APPLE 0x8A51 - -#define GL_MULTIPLY_KHR 0x9294 -#define GL_SCREEN_KHR 0x9295 -#define GL_OVERLAY_KHR 0x9296 -#define GL_DARKEN_KHR 0x9297 -#define GL_LIGHTEN_KHR 0x9298 -#define GL_COLORDODGE_KHR 0x9299 -#define GL_COLORBURN_KHR 0x929A -#define GL_HARDLIGHT_KHR 0x929B -#define GL_SOFTLIGHT_KHR 0x929C -#define GL_DIFFERENCE_KHR 0x929E -#define GL_EXCLUSION_KHR 0x92A0 -#define GL_HSL_HUE_KHR 0x92AD -#define GL_HSL_SATURATION_KHR 0x92AE -#define GL_HSL_COLOR_KHR 0x92AF -#define GL_HSL_LUMINOSITY_KHR 0x92B0 - -#define SWGL_BLEND_DROP_SHADOW 0xB001 -#define SWGL_BLEND_SUBPIXEL_TEXT 0xB002 diff --git a/third_party/webrender/swgl/src/glsl.h b/third_party/webrender/swgl/src/glsl.h index bec63858b0d..cdedb43d567 100644 --- a/third_party/webrender/swgl/src/glsl.h +++ b/third_party/webrender/swgl/src/glsl.h @@ -2,45 +2,14 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +// Some of this is copied from Skia and is governed by a BSD-style license +// Every function in this file should be marked static and inline using SI. 
#define SI ALWAYS_INLINE static #include "vector_type.h" namespace glsl { -enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8, RG8, R16, YUV422 }; - -enum TextureFilter { NEAREST, LINEAR }; - -struct samplerCommon { - uint32_t* buf = nullptr; - uint32_t stride = 0; // in units of BPP if < 4, or dwords if BPP >= 4 - uint32_t height = 0; - uint32_t width = 0; - TextureFormat format = TextureFormat::RGBA8; -}; - -struct samplerFilter { - TextureFilter filter = TextureFilter::NEAREST; -}; - -struct sampler2D_impl : samplerCommon, samplerFilter {}; -typedef sampler2D_impl* sampler2D; - -typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8; -typedef struct sampler2DRG8_impl : sampler2D_impl{} * sampler2DRG8; -typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8; -typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F; - -struct isampler2D_impl : samplerCommon {}; -typedef isampler2D_impl* isampler2D; - -struct isampler2DRGBA32I_impl : isampler2D_impl {}; -typedef isampler2DRGBA32I_impl* isampler2DRGBA32I; - -struct sampler2DRect_impl : samplerCommon, samplerFilter {}; -typedef sampler2DRect_impl* sampler2DRect; - #if USE_SSE2 SI bool test_all(Bool cond) { return _mm_movemask_ps(cond) == 0xF; } SI bool test_any(Bool cond) { return _mm_movemask_ps(cond) != 0; } @@ -49,14 +18,9 @@ SI bool test_none(Bool cond) { return _mm_movemask_ps(cond) == 0; } SI bool test_all(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0xFFFFFFFFU; } -SI bool test_any(Bool cond) { - return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0; -} -SI bool test_none(Bool cond) { - return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0; -} +SI bool test_any(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0; } +SI bool test_none(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0; } #endif -SI bool test_equal(Bool cond) { return test_none(cond != cond.x); } float make_float(float n) { return n; } @@ -110,23 +74,17 @@ struct vec4; struct ivec2; SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; } -SI int32_t if_then_else(bool c, int32_t t, int32_t e) { return c ? t : e; } SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; } SI Float if_then_else(I32 c, float t, float e) { - return bit_cast<Float>((c & bit_cast<I32>(Float(t))) | - (~c & bit_cast<I32>(Float(e)))); + return bit_cast<Float>((c & bit_cast<I32>(Float(t))) | (~c & bit_cast<I32>(Float(e)))); } SI I32 if_then_else(I32 c, int32_t t, int32_t e) { return (c & I32(t)) | (~c & I32(e)); } -SI U32 if_then_else(I32 c, U32 t, U32 e) { - return bit_cast<U32>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e))); -} - SI Float if_then_else(I32 c, Float t, Float e) { return bit_cast<Float>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e))); } @@ -137,10 +95,7 @@ SI Bool if_then_else(I32 c, Bool t, Bool e) { return (c & t) | (~c & e); } SI Bool if_then_else(int32_t c, Bool t, Bool e) { return c ? t : e; } -SI I16 if_then_else(I16 c, I16 t, I16 e) { return (c & t) | (~c & e); } - -template <typename T> -SI void swap(T& a, T& b) { +template <typename T> SI void swap(T& a, T& b) { T t(a); a = b; b = t; @@ -201,37 +156,7 @@ SI Float sqrt(Float v) { #endif } -SI float recip(float x) { -#if USE_SSE2 - return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x))); -#else - return 1.0f / x; -#endif -} - -// Use a fast vector reciprocal approximation when available. 
This should only -// be used in cases where it is okay that the approximation is imprecise - -// essentially visually correct but numerically wrong. Otherwise just rely on -// however the compiler would implement slower division if the platform doesn't -// provide a convenient intrinsic. -SI Float recip(Float v) { -#if USE_SSE2 - return _mm_rcp_ps(v); -#elif USE_NEON - Float e = vrecpeq_f32(v); - return vrecpsq_f32(v, e) * e; -#else - return 1.0f / v; -#endif -} - -SI float inversesqrt(float x) { -#if USE_SSE2 - return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x))); -#else - return 1.0f / sqrtf(x); -#endif -} +SI float inversesqrt(float x) { return 1.0f / sqrtf(x); } SI Float inversesqrt(Float v) { #if USE_SSE2 @@ -269,45 +194,18 @@ enum XYZW { A = 3, }; -struct bvec4_scalar; - struct bvec2_scalar { bool x; bool y; bvec2_scalar() : bvec2_scalar(false) {} - IMPLICIT constexpr bvec2_scalar(bool a) : x(a), y(a) {} + constexpr bvec2_scalar(bool a) : x(a), y(a) {} constexpr bvec2_scalar(bool x, bool y) : x(x), y(y) {} - - bool& select(XYZW c) { - switch (c) { - case X: - return x; - case Y: - return y; - default: - UNREACHABLE; - } - } - bool sel(XYZW c1) { return select(c1); } - - bvec2_scalar sel(XYZW c1, XYZW c2) { - return bvec2_scalar(select(c1), select(c2)); - } - bvec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); -}; - -struct bvec2_scalar1 { - bool x; - - IMPLICIT constexpr bvec2_scalar1(bool a) : x(a) {} - - operator bvec2_scalar() const { return bvec2_scalar(x); } }; struct bvec2 { bvec2() : bvec2(0) {} - IMPLICIT bvec2(Bool a) : x(a), y(a) {} + bvec2(Bool a) : x(a), y(a) {} bvec2(Bool x, Bool y) : x(x), y(y) {} Bool& select(XYZW c) { switch (c) { @@ -321,15 +219,13 @@ struct bvec2 { } Bool sel(XYZW c1) { return select(c1); } - bvec2 sel(XYZW c1, XYZW c2) { return bvec2(select(c1), select(c2)); } - bvec2 operator~() { return bvec2(~x, ~y); } Bool x; Bool y; }; -bvec2_scalar1 make_bvec2(bool n) { return bvec2_scalar1(n); } +bvec2_scalar make_bvec2(bool n) { return bvec2_scalar{n, n}; } bvec2_scalar make_bvec2(bool x, bool y) { return bvec2_scalar{x, y}; } @@ -353,8 +249,8 @@ struct vec2_scalar { float y; constexpr vec2_scalar() : vec2_scalar(0.0f) {} - IMPLICIT constexpr vec2_scalar(float a) : x(a), y(a) {} - IMPLICIT constexpr vec2_scalar(int a) : x(a), y(a) {} + constexpr vec2_scalar(float a) : x(a), y(a) {} + constexpr vec2_scalar(int a) : x(a), y(a) {} constexpr vec2_scalar(float x, float y) : x(x), y(y) {} float& select(XYZW c) { @@ -390,9 +286,6 @@ struct vec2_scalar { friend vec2_scalar operator*(vec2_scalar a, vec2_scalar b) { return vec2_scalar(a.x * b.x, a.y * b.y); } - friend vec2_scalar operator/(vec2_scalar a, float b) { - return vec2_scalar(a.x / b, a.y / b); - } friend vec2_scalar operator/(vec2_scalar a, vec2_scalar b) { return vec2_scalar(a.x / b.x, a.y / b.y); } @@ -415,12 +308,6 @@ struct vec2_scalar { return *this; } - vec2_scalar operator/=(vec2_scalar a) { - x /= a.x; - y /= a.y; - return *this; - } - vec2_scalar operator+=(vec2_scalar a) { x += a.x; y += a.y; @@ -469,12 +356,12 @@ struct vec2 { typedef float element_type; constexpr vec2() : vec2(Float(0.0f)) {} - IMPLICIT constexpr vec2(Float a) : x(a), y(a) {} + constexpr vec2(Float a) : x(a), y(a) {} vec2(Float x, Float y) : x(x), y(y) {} - IMPLICIT constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {} + constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {} constexpr vec2(vec2_scalar s0, vec2_scalar s1, vec2_scalar s2, vec2_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}) {} - explicit 
vec2(ivec2 a); + vec2(ivec2 a); Float x; Float y; @@ -583,7 +470,6 @@ vec2 operator*(vec2_scalar a, Float b) { return vec2(a.x * b, a.y * b); } vec2 operator*(Float a, vec2_scalar b) { return vec2(a * b.x, a * b.y); } SI vec2 min(vec2 a, vec2 b) { return vec2(min(a.x, b.x), min(a.y, b.y)); } -SI vec2 min(vec2 a, Float b) { return vec2(min(a.x, b), min(a.y, b)); } SI vec2_scalar min(vec2_scalar a, vec2_scalar b) { return vec2_scalar{min(a.x, b.x), min(a.y, b.y)}; @@ -599,12 +485,8 @@ vec2 step(vec2 edge, vec2 x) { return vec2(step(edge.x, x.x), step(edge.y, x.y)); } -vec2_scalar step(vec2_scalar edge, vec2_scalar x) { - return vec2_scalar(step(edge.x, x.x), step(edge.y, x.y)); -} - -SI vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); } -SI vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); } +vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); } +vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); } SI vec2_scalar max(vec2_scalar a, vec2_scalar b) { return vec2_scalar{max(a.x, b.x), max(a.y, b.y)}; @@ -617,31 +499,9 @@ Float length(vec2 a) { return sqrt(a.x * a.x + a.y * a.y); } float length(vec2_scalar a) { return hypotf(a.x, a.y); } -template <typename A, typename B> -SI auto distance(A a, B b) { - return length(a - b); -} +SI Float distance(vec2 a, vec2 b) { return length(a - b); } -template <typename T> -SI T normalize(T a) { - return a / length(a); -} - -SI vec2 sqrt(vec2 a) { return vec2(sqrt(a.x), sqrt(a.y)); } - -SI vec2_scalar sqrt(vec2_scalar a) { return vec2_scalar(sqrt(a.x), sqrt(a.y)); } - -SI vec2 recip(vec2 a) { return vec2(recip(a.x), recip(a.y)); } - -SI vec2_scalar recip(vec2_scalar a) { - return vec2_scalar(recip(a.x), recip(a.y)); -} - -SI vec2 inversesqrt(vec2 a) { return vec2(inversesqrt(a.x), inversesqrt(a.y)); } - -SI vec2_scalar inversesqrt(vec2_scalar a) { - return vec2_scalar(inversesqrt(a.x), inversesqrt(a.y)); -} +SI vec2 normalize(vec2 a) { return a / length(a); } #define abs __glsl_abs @@ -657,13 +517,6 @@ Float abs(Float v) { #endif } -float sign(float a) { return copysignf(1.0f, a); } - -Float sign(Float v) { - return bit_cast<Float>((bit_cast<I32>(v) & 0x80000000) | - bit_cast<I32>(Float(1.0f))); -} - Float cast(U32 v) { return CONVERT((I32)v, Float); } Float cast(I32 v) { return CONVERT((I32)v, Float); } I32 cast(Float v) { return CONVERT(v, I32); } @@ -725,22 +578,17 @@ SI I32 roundfast(Float v, Float scale) { #endif } -template <typename T> -SI auto round_pixel(T v, float scale = 255.0f) { - return roundfast(v, scale); -} +template <typename T> SI auto round_pixel(T v) { return roundfast(v, 255.0f); } #define round __glsl_round float round(float a) { return roundf(a); } -Float round(Float v) { return floor(v + 0.5f); } - float fract(float a) { return a - floor(a); } -Float fract(Float v) { return v - floor(v); } +Float round(Float v) { return floor(v + 0.5f); } -vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); } +Float fract(Float v) { return v - floor(v); } // X derivatives can be approximated by dFdx(x) = x[1] - x[0]. // Y derivatives are not easily available since we operate in terms of X spans @@ -748,15 +596,11 @@ vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); } // uniform scaling, and thus abs(dFdx(p.x)) + abs(dFdy(p.x)) = abs(dFdx(p.x)) + // abs(dFdx(p.y)) which mirrors abs(dFdx(p.y)) + abs(dFdy(p.y)) = abs(dFdx(p.y)) // + abs(dFdx(p.x)). 
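To make the approximation described above concrete: with fragments shaded as a 4-wide horizontal span, the X derivative of an interpolant is simply the difference between adjacent lanes, and under the stated axis-aligned, uniformly scaled assumption fwidth(p) = |dFdx(p.x)| + |dFdy(p.x)| collapses to |dFdx(p.x)| + |dFdx(p.y)|. A scalar sketch over a hypothetical float[4] lane array, outside the SIMD Float type used in this file:

#include <cassert>
#include <cmath>

// One interpolated value sampled at the 4 fragments of a horizontal span.
struct Lanes {
  float v[4];
};

// dFdx approximated by the difference between the first two lanes, as in the
// comment above; every lane of a span shares the same derivative estimate.
static float dFdx_approx(const Lanes& x) { return x.v[1] - x.v[0]; }

// fwidth for a 2D interpolant p = (px, py), using the identity
// |dFdx(px)| + |dFdy(px)| == |dFdx(px)| + |dFdx(py)| assumed above.
static float fwidth_approx(const Lanes& px, const Lanes& py) {
  return std::fabs(dFdx_approx(px)) + std::fabs(dFdx_approx(py));
}

int main() {
  // p advances by (0.5, 0.25) per fragment across the span.
  Lanes px = {{1.0f, 1.5f, 2.0f, 2.5f}};
  Lanes py = {{3.0f, 3.25f, 3.5f, 3.75f}};
  assert(std::fabs(fwidth_approx(px, py) - 0.75f) < 1e-6f);
  return 0;
}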
-vec2_scalar fwidth(vec2 p) { +vec2 fwidth(vec2 p) { Float d = abs(SHUFFLE(p.x, p.y, 1, 1, 5, 5) - SHUFFLE(p.x, p.y, 0, 0, 4, 4)); - return vec2_scalar(d.x + d.z); + return vec2(d.xyxy + d.zwzw); } -float dFdx(Float x) { return x.y - x.x; } - -vec2_scalar dFdx(vec2 p) { return vec2_scalar(dFdx(p.x), dFdx(p.y)); } - // See // http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. Float approx_log2(Float x) { @@ -768,7 +612,6 @@ Float approx_log2(Float x) { return e - 124.225514990f - 1.498030302f * m - 1.725879990f / (0.3520887068f + m); } - Float approx_pow2(Float x) { Float f = fract(x); return bit_cast<Float>( @@ -776,41 +619,16 @@ Float approx_pow2(Float x) { 27.728023300f / (4.84252568f - f))); } -#define pow __glsl_pow - -SI float pow(float x, float y) { return powf(x, y); } - +// From skia Float pow(Float x, Float y) { return if_then_else((x == 0) | (x == 1), x, approx_pow2(approx_log2(x) * y)); } -#define exp __glsl_exp - -SI float exp(float x) { return expf(x); } - Float exp(Float y) { - float l2e = 1.4426950408889634074f; - return approx_pow2(l2e * y); + float x = 2.718281828459045235360287471352; + return approx_pow2(log2f(x) * y); } -#define exp2 __glsl_exp2 - -SI float exp2(float x) { return exp2f(x); } - -Float exp2(Float x) { return approx_pow2(x); } - -#define log __glsl_log - -SI float log(float x) { return logf(x); } - -Float log(Float x) { return approx_log2(x) * 0.69314718f; } - -#define log2 __glsl_log2 - -SI float log2(float x) { return log2f(x); } - -Float log2(Float x) { return approx_log2(x); } - struct ivec4; struct ivec2_scalar { @@ -820,7 +638,7 @@ struct ivec2_scalar { int32_t y; ivec2_scalar() : ivec2_scalar(0) {} - IMPLICIT constexpr ivec2_scalar(int32_t a) : x(a), y(a) {} + constexpr ivec2_scalar(int32_t a) : x(a), y(a) {} constexpr ivec2_scalar(int32_t x, int32_t y) : x(x), y(y) {} int32_t& select(XYZW c) { @@ -838,8 +656,6 @@ struct ivec2_scalar { return ivec2_scalar{select(c1), select(c2)}; } - ivec2_scalar operator-() const { return ivec2_scalar{-x, -y}; } - ivec2_scalar& operator+=(ivec2_scalar a) { x += a.x; y += a.y; @@ -864,25 +680,17 @@ struct ivec2_scalar { friend ivec2_scalar operator+(ivec2_scalar a, ivec2_scalar b) { return ivec2_scalar{a.x + b.x, a.y + b.y}; } - - friend ivec2_scalar operator-(ivec2_scalar a, ivec2_scalar b) { - return ivec2_scalar{a.x - b.x, a.y - b.y}; - } - - friend bool operator==(const ivec2_scalar& l, const ivec2_scalar& r) { - return l.x == r.x && l.y == r.y; - } }; struct ivec2 { typedef int32_t element_type; ivec2() : ivec2(I32(0)) {} - IMPLICIT ivec2(I32 a) : x(a), y(a) {} + ivec2(I32 a) : x(a), y(a) {} ivec2(I32 x, I32 y) : x(x), y(y) {} - IMPLICIT ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {} + ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {} ivec2(U32 x, U32 y) : x(CONVERT(x, I32)), y(CONVERT(y, I32)) {} - IMPLICIT constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {} + constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {} constexpr ivec2(ivec2_scalar s0, ivec2_scalar s1, ivec2_scalar s2, ivec2_scalar s3) : x(I32{s0.x, s1.x, s2.x, s3.x}), y(I32{s0.y, s1.y, s2.y, s3.y}) {} @@ -973,7 +781,7 @@ struct ivec3_scalar { int32_t z; ivec3_scalar() : ivec3_scalar(0) {} - IMPLICIT constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {} + constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {} constexpr ivec3_scalar(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {} int32_t& select(XYZW c) { @@ -996,7 +804,7 @@ struct ivec3_scalar { struct ivec3 { ivec3() : ivec3(0) {} - IMPLICIT ivec3(I32 a) : x(a), 
y(a), z(a) {} + ivec3(I32 a) : x(a), y(a), z(a) {} ivec3(I32 x, I32 y, I32 z) : x(x), y(y), z(z) {} ivec3(ivec2 a, I32 b) : x(a.x), y(a.y), z(b) {} ivec3(vec2 a, Float b) : x(cast(a.x)), y(cast(a.y)), z(cast(b)) {} @@ -1047,7 +855,7 @@ struct ivec4_scalar { int32_t w; ivec4_scalar() : ivec4_scalar(0) {} - IMPLICIT constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {} + constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {} constexpr ivec4_scalar(int32_t x, int32_t y, int32_t z, int32_t w) : x(x), y(y), z(z), w(w) {} @@ -1073,31 +881,16 @@ struct ivec4_scalar { friend ivec4_scalar operator&(int32_t a, ivec4_scalar b) { return ivec4_scalar{a & b.x, a & b.y, a & b.z, a & b.w}; } - - int32_t& operator[](int index) { - switch (index) { - case 0: - return x; - case 1: - return y; - case 2: - return z; - case 3: - return w; - default: - UNREACHABLE; - } - } }; struct ivec4 { typedef int32_t element_type; ivec4() : ivec4(I32(0)) {} - IMPLICIT ivec4(I32 a) : x(a), y(a), z(a), w(a) {} + ivec4(I32 a) : x(a), y(a), z(a), w(a) {} ivec4(I32 x, I32 y, I32 z, I32 w) : x(x), y(y), z(z), w(w) {} ivec4(ivec2 a, I32 b, I32 c) : x(a.x), y(a.y), z(b), w(c) {} - IMPLICIT constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} + constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} constexpr ivec4(ivec4_scalar s0, ivec4_scalar s1, ivec4_scalar s2, ivec4_scalar s3) : x(I32{s0.x, s1.x, s2.x, s3.x}), @@ -1190,21 +983,13 @@ struct bvec3_scalar { bool z; bvec3_scalar() : bvec3_scalar(false) {} - IMPLICIT constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {} + constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {} constexpr bvec3_scalar(bool x, bool y, bool z) : x(x), y(y), z(z) {} }; -struct bvec3_scalar1 { - bool x; - - IMPLICIT constexpr bvec3_scalar1(bool a) : x(a) {} - - operator bvec3_scalar() const { return bvec3_scalar(x); } -}; - struct bvec3 { bvec3() : bvec3(0) {} - IMPLICIT bvec3(Bool a) : x(a), y(a), z(a) {} + bvec3(Bool a) : x(a), y(a), z(a) {} bvec3(Bool x, Bool y, Bool z) : x(x), y(y), z(z) {} Bool& select(XYZW c) { switch (c) { @@ -1225,8 +1010,6 @@ struct bvec3 { Bool z; }; -bvec3_scalar1 make_bvec3(bool n) { return bvec3_scalar1(n); } - struct bvec4_scalar { bool x; bool y; @@ -1234,45 +1017,14 @@ struct bvec4_scalar { bool w; bvec4_scalar() : bvec4_scalar(false) {} - IMPLICIT constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {} + constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {} constexpr bvec4_scalar(bool x, bool y, bool z, bool w) : x(x), y(y), z(z), w(w) {} - - bool& select(XYZW c) { - switch (c) { - case X: - return x; - case Y: - return y; - case Z: - return z; - case W: - return w; - default: - UNREACHABLE; - } - } - bool sel(XYZW c1) { return select(c1); } - bvec2_scalar sel(XYZW c1, XYZW c2) { - return bvec2_scalar(select(c1), select(c2)); - } -}; - -bvec4_scalar bvec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return bvec4_scalar{select(c1), select(c2), select(c3), select(c4)}; -} - -struct bvec4_scalar1 { - bool x; - - IMPLICIT constexpr bvec4_scalar1(bool a) : x(a) {} - - operator bvec4_scalar() const { return bvec4_scalar(x); } }; struct bvec4 { bvec4() : bvec4(0) {} - IMPLICIT bvec4(Bool a) : x(a), y(a), z(a), w(a) {} + bvec4(Bool a) : x(a), y(a), z(a), w(a) {} bvec4(Bool x, Bool y, Bool z, Bool w) : x(x), y(y), z(z), w(w) {} bvec4(bvec2 x, bvec2 y) : x(x.x), y(x.y), z(y.x), w(y.y) {} Bool& select(XYZW c) { @@ -1285,8 +1037,6 @@ struct bvec4 { return z; case W: return w; - default: - UNREACHABLE; } } Bool sel(XYZW c1) { 
return select(c1); } @@ -1297,16 +1047,12 @@ struct bvec4 { Bool w; }; -bvec4_scalar1 make_bvec4(bool n) { return bvec4_scalar1(n); } +bvec4_scalar make_bvec4(bool n) { return bvec4_scalar{n, n, n, n}; } bvec4_scalar make_bvec4(bool x, bool y, bool z, bool w) { return bvec4_scalar{x, y, z, w}; } -bvec4_scalar make_bvec4(bvec2_scalar a, bvec2_scalar b) { - return bvec4_scalar{a.x, a.y, b.x, b.y}; -} - template <typename N> bvec4 make_bvec4(const N& n) { return bvec4(n); @@ -1383,7 +1129,7 @@ struct vec3_scalar { float z; constexpr vec3_scalar() : vec3_scalar(0.0f) {} - IMPLICIT constexpr vec3_scalar(float a) : x(a), y(a), z(a) {} + constexpr vec3_scalar(float a) : x(a), y(a), z(a) {} constexpr vec3_scalar(float x, float y, float z) : x(x), y(y), z(z) {} float& select(XYZW c) { @@ -1474,11 +1220,10 @@ struct vec3 { typedef float element_type; constexpr vec3() : vec3(Float(0.0f)) {} - IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {} + constexpr vec3(Float a) : x(a), y(a), z(a) {} constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {} vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {} - explicit vec3(vec4); - IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {} + constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {} constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}), @@ -1507,8 +1252,6 @@ struct vec3 { return vec3(select(c1), select(c2), select(c3)); } - vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); - vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); } friend vec3 operator*(vec3 a, Float b) { @@ -1605,26 +1348,13 @@ vec3 step(vec3 edge, vec3 x) { return vec3(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z)); } -vec3_scalar step(vec3_scalar edge, vec3_scalar x) { - return vec3_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z)); -} - SI vec3 min(vec3 a, vec3 b) { return vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } -SI vec3 min(vec3 a, Float b) { - return vec3(min(a.x, b), min(a.y, b), min(a.z, b)); -} -SI vec3_scalar min(vec3_scalar a, vec3_scalar b) { - return vec3_scalar{min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)}; -} - SI vec3 max(vec3 a, vec3 b) { return vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } -SI vec3 max(vec3 a, Float b) { - return vec3(max(a.x, b), max(a.y, b), max(a.z, b)); -} + SI vec3_scalar max(vec3_scalar a, vec3_scalar b) { return vec3_scalar{max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)}; } @@ -1670,15 +1400,11 @@ struct vec4_scalar { float w; constexpr vec4_scalar() : vec4_scalar(0.0f) {} - IMPLICIT constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {} + constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {} constexpr vec4_scalar(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} vec4_scalar(vec3_scalar xyz, float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} - static vec4_scalar load_from_ptr(const float* f) { - return vec4_scalar(f[0], f[1], f[2], f[3]); - } - ALWAYS_INLINE float& select(XYZW c) { switch (c) { case X: @@ -1700,9 +1426,6 @@ struct vec4_scalar { vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) { return vec3_scalar{select(c1), select(c2), select(c3)}; } - vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4_scalar{select(c1), select(c2), select(c3), select(c4)}; - } vec2_scalar_ref lsel(XYZW c1, XYZW c2) { return vec2_scalar_ref(select(c1), select(c2)); } @@ -1750,56 +1473,30 @@ struct vec4_scalar { w /= a.w; return *this; } - - vec4_scalar& 
operator*=(vec4_scalar a) { - x *= a.x; - y *= a.y; - z *= a.z; - w *= a.w; - return *this; - } - - friend bool operator==(const vec4_scalar& l, const vec4_scalar& r) { - return l.x == r.x && l.y == r.y && l.z == r.z && l.w == r.w; - } - - friend bool operator!=(const vec4_scalar& l, const vec4_scalar& r) { - return l.x != r.x || l.y != r.y || l.z != r.z || l.w != r.w; - } }; vec4_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { return vec4_scalar{select(c1), select(c2), select(c3), select(c4)}; } -struct vec4_ref { - vec4_ref(Float& x, Float& y, Float& z, Float& w) : x(x), y(y), z(z), w(w) {} - Float& x; - Float& y; - Float& z; - Float& w; - - vec4_ref& operator=(const vec4& a); -}; - struct vec4 { typedef struct vec4 vector_type; typedef float element_type; constexpr vec4() : vec4(Float(0.0f)) {} - IMPLICIT constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {} + constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {} vec4(Float x, Float y, Float z, Float w) : x(x), y(y), z(z), w(w) {} vec4(vec3 xyz, Float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} vec4(vec2 xy, vec2 zw) : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {} vec4(vec2 xy, Float z, Float w) : x(xy.x), y(xy.y), z(z), w(w) {} vec4(Float x, Float y, vec2 zw) : x(x), y(y), z(zw.x), w(zw.y) {} - IMPLICIT constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} + constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} constexpr vec4(vec4_scalar s0, vec4_scalar s1, vec4_scalar s2, vec4_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}), z(Float{s0.z, s1.z, s2.z, s3.z}), w(Float{s0.w, s1.w, s2.w, s3.w}) {} - ALWAYS_INLINE Float& select(XYZW c) { + Float& select(XYZW c) { switch (c) { case X: return x; @@ -1813,29 +1510,18 @@ struct vec4 { UNREACHABLE; } } - ALWAYS_INLINE Float& sel(XYZW c1) { return select(c1); } + Float& sel(XYZW c1) { return select(c1); } - ALWAYS_INLINE vec2 sel(XYZW c1, XYZW c2) { - return vec2(select(c1), select(c2)); - } + vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); } - ALWAYS_INLINE vec3 sel(XYZW c1, XYZW c2, XYZW c3) { + vec3 sel(XYZW c1, XYZW c2, XYZW c3) { return vec3(select(c1), select(c2), select(c3)); } - ALWAYS_INLINE vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) { + vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) { return vec3_ref(select(c1), select(c2), select(c3)); } - ALWAYS_INLINE vec2_ref lsel(XYZW c1, XYZW c2) { - return vec2_ref(select(c1), select(c2)); - } - - ALWAYS_INLINE vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4(select(c1), select(c2), select(c3), select(c4)); - } - ALWAYS_INLINE vec4_ref lsel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4_ref(select(c1), select(c2), select(c3), select(c4)); - } + vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); } Float& operator[](int index) { switch (index) { @@ -1957,13 +1643,6 @@ struct vec4 { w /= a.w; return *this; } - vec4& operator*=(vec4 a) { - x *= a.x; - y *= a.y; - z *= a.z; - w *= a.w; - return *this; - } vec4& operator*=(Float a) { x *= a; y *= a; @@ -1978,18 +1657,6 @@ struct vec4 { Float w; }; -inline vec4_ref& vec4_ref::operator=(const vec4& a) { - x = a.x; - y = a.y; - z = a.z; - w = a.w; - return *this; -} - -inline vec4 vec3::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4(select(c1), select(c2), select(c3), select(c4)); -} - vec4_scalar force_scalar(const vec4& v) { return vec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z), force_scalar(v.w)}; @@ -2017,10 +1684,6 @@ vec4_scalar make_vec4(float x, float y, const 
vec2_scalar& v) { return vec4_scalar{x, y, v.x, v.y}; } -ivec4_scalar make_ivec4(const vec4_scalar& v) { - return ivec4_scalar{int32_t(v.x), int32_t(v.y), int32_t(v.z), int32_t(v.w)}; -} - template <typename N> vec4 make_vec4(const N& n) { return vec4(n); @@ -2041,8 +1704,6 @@ vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) { return vec4(x, y, z, w); } -ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {} - SI ivec4 roundfast(vec4 v, Float scale) { return ivec4(roundfast(v.x, scale), roundfast(v.y, scale), roundfast(v.z, scale), roundfast(v.w, scale)); @@ -2059,14 +1720,6 @@ SI vec4 if_then_else(I32 c, vec4 t, vec4 e) { SI vec4 if_then_else(int32_t c, vec4 t, vec4 e) { return c ? t : e; } -SI vec4_scalar if_then_else(int32_t c, vec4_scalar t, vec4_scalar e) { - return c ? t : e; -} - -SI vec2 clamp(vec2 a, Float minVal, Float maxVal) { - return vec2(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)); -} - SI vec2 clamp(vec2 a, vec2 minVal, vec2 maxVal) { return vec2(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y)); } @@ -2076,56 +1729,20 @@ SI vec2_scalar clamp(vec2_scalar a, vec2_scalar minVal, vec2_scalar maxVal) { clamp(a.y, minVal.y, maxVal.y)}; } -SI vec2_scalar clamp(vec2_scalar a, float minVal, float maxVal) { - return vec2_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)}; -} - SI I32 clamp(I32 a, I32 minVal, I32 maxVal) { a = if_then_else(a < minVal, minVal, a); return if_then_else(a > maxVal, maxVal, a); } -SI vec3 clamp(vec3 a, Float minVal, Float maxVal) { - return vec3(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), - clamp(a.z, minVal, maxVal)); -} - SI vec3 clamp(vec3 a, vec3 minVal, vec3 maxVal) { return vec3(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), clamp(a.z, minVal.z, maxVal.z)); } -SI vec4 clamp(vec4 a, Float minVal, Float maxVal) { - return vec4(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), - clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)); -} - SI vec4 clamp(vec4 a, vec4 minVal, vec4 maxVal) { return vec4(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)); } - -SI vec4_scalar clamp(vec4_scalar a, vec4_scalar minVal, vec4_scalar maxVal) { - return vec4_scalar{ - clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), - clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)}; -} - -SI vec4_scalar clamp(vec4_scalar a, float minVal, float maxVal) { - return vec4_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), - clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)}; -} - -vec4 step(vec4 edge, vec4 x) { - return vec4(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z), - step(edge.w, x.w)); -} - -vec4_scalar step(vec4_scalar edge, vec4_scalar x) { - return vec4_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z), - step(edge.w, x.w)); -} - template <typename T> auto lessThanEqual(T x, T y) -> decltype(x <= y) { return x <= y; @@ -2163,20 +1780,6 @@ SI bvec2 lessThan(vec2 x, vec2 y) { return bvec2(lessThan(x.x, y.x), lessThan(x.y, y.y)); } -SI bvec2_scalar lessThan(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(lessThan(x.x, y.x), lessThan(x.y, y.y)); -} - -SI bvec4 lessThan(vec4 x, vec4 y) { - return bvec4(lessThan(x.x, y.x), lessThan(x.y, y.y), lessThan(x.z, y.z), - lessThan(x.w, y.w)); -} - -SI bvec4_scalar lessThan(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar{lessThan(x.x, y.x), lessThan(x.y, y.y), - lessThan(x.z, y.z), 
lessThan(x.w, y.w)}; -} - template <typename T> auto greaterThan(T x, T y) -> decltype(x > y) { return x > y; @@ -2186,20 +1789,6 @@ bvec2 greaterThan(vec2 x, vec2 y) { return bvec2(greaterThan(x.x, y.x), greaterThan(x.y, y.y)); } -bvec2_scalar greaterThan(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(greaterThan(x.x, y.x), greaterThan(x.y, y.y)); -} - -SI bvec4 greaterThan(vec4 x, vec4 y) { - return bvec4(greaterThan(x.x, y.x), greaterThan(x.y, y.y), - greaterThan(x.z, y.z), greaterThan(x.w, y.w)); -} - -SI bvec4_scalar greaterThan(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar{greaterThan(x.x, y.x), greaterThan(x.y, y.y), - greaterThan(x.z, y.z), greaterThan(x.w, y.w)}; -} - template <typename T> auto greaterThanEqual(T x, T y) -> decltype(x >= y) { return x >= y; @@ -2210,29 +1799,51 @@ bvec4 greaterThanEqual(vec4 x, vec4 y) { greaterThanEqual(x.z, y.z), greaterThanEqual(x.w, y.w)); } -template <typename T> -auto equal(T x, T y) -> decltype(x > y) { - return x == y; -} +enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8 }; -bvec2 equal(vec2 x, vec2 y) { return bvec2(equal(x.x, y.x), equal(x.y, y.y)); } +enum TextureFilter { NEAREST, LINEAR }; -bvec2_scalar equal(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(equal(x.x, y.x), equal(x.y, y.y)); -} +struct samplerCommon { + uint32_t* buf = nullptr; + uint32_t stride = 0; // in dwords + uint32_t height = 0; + uint32_t width = 0; + TextureFormat format = TextureFormat::RGBA8; +}; -template <typename T> -auto notEqual(T x, T y) -> decltype(x > y) { - return x != y; -} +struct samplerDepth { + int depth = 0; + uint32_t height_stride = 0; // in dwords +}; -bvec2 notEqual(vec2 x, vec2 y) { - return bvec2(notEqual(x.x, y.x), notEqual(x.y, y.y)); -} +struct samplerFilter { + TextureFilter filter = TextureFilter::NEAREST; +}; -bvec2_scalar notEqual(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y)); -} +struct sampler2DArray_impl : samplerCommon, samplerDepth, samplerFilter {}; +typedef sampler2DArray_impl* sampler2DArray; + +typedef struct sampler2DArrayR8_impl : sampler2DArray_impl{} * sampler2DArrayR8; +typedef struct sampler2DArrayRGBA8_impl : sampler2DArray_impl{} * + sampler2DArrayRGBA8; +typedef struct sampler2DArrayRGBA32F_impl : sampler2DArray_impl{} * + sampler2DArrayRGBA32F; + +struct sampler2D_impl : samplerCommon, samplerFilter {}; +typedef sampler2D_impl* sampler2D; + +typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8; +typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8; +typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F; + +struct isampler2D_impl : samplerCommon {}; +typedef isampler2D_impl* isampler2D; + +struct isampler2DRGBA32I_impl : isampler2D_impl {}; +typedef isampler2DRGBA32I_impl* isampler2DRGBA32I; + +struct sampler2DRect_impl : samplerCommon, samplerFilter {}; +typedef sampler2DRect_impl* sampler2DRect; struct mat4_scalar; @@ -2240,7 +1851,7 @@ struct mat2_scalar { vec2_scalar data[2]; mat2_scalar() = default; - IMPLICIT constexpr mat2_scalar(float a) { + constexpr mat2_scalar(float a) { data[0] = vec2_scalar(a); data[1] = vec2_scalar(a); } @@ -2248,7 +1859,7 @@ struct mat2_scalar { data[0] = a; data[1] = b; } - IMPLICIT mat2_scalar(const mat4_scalar& mat); + mat2_scalar(const mat4_scalar& mat); vec2_scalar& operator[](int index) { return data[index]; } const vec2_scalar& operator[](int index) const { return data[index]; } @@ -2286,7 +1897,7 @@ struct mat2 { const vec2& operator[](int index) const { 
return data[index]; } mat2() = default; - IMPLICIT mat2(Float a) { + mat2(Float a) { data[0] = vec2(a); data[1] = vec2(a); } @@ -2295,8 +1906,8 @@ struct mat2 { data[0] = a; data[1] = b; } - IMPLICIT mat2(const mat4& mat); - IMPLICIT constexpr mat2(mat2_scalar s) { + mat2(const mat4& mat); + constexpr mat2(mat2_scalar s) { data[0] = vec2(s.data[0]); data[1] = vec2(s.data[1]); } @@ -2350,7 +1961,7 @@ struct mat3_scalar { data[1] = b; data[2] = c; } - IMPLICIT mat3_scalar(const mat4_scalar& mat); + mat3_scalar(const mat4_scalar& mat); vec3_scalar& operator[](int index) { return data[index]; } const vec3_scalar& operator[](int index) const { return data[index]; } @@ -2384,7 +1995,7 @@ struct mat3 { data[2] = c; } - IMPLICIT constexpr mat3(mat3_scalar s) { + constexpr mat3(mat3_scalar s) { data[0] = vec3(s.data[0]); data[1] = vec3(s.data[1]); data[2] = vec3(s.data[2]); @@ -2403,7 +2014,7 @@ struct mat3 { data[2] = vec3(d7, d8, d9); } - IMPLICIT mat3(const mat4& mat); + mat3(const mat4& mat); friend vec3 operator*(mat3 m, vec3 v) { vec3 u; @@ -2490,7 +2101,7 @@ struct mat4 { vec4 data[4]; mat4() = default; - IMPLICIT constexpr mat4(mat4_scalar s) { + constexpr mat4(mat4_scalar s) { data[0] = vec4(s.data[0]); data[1] = vec4(s.data[1]); data[2] = vec4(s.data[2]); @@ -2522,15 +2133,15 @@ mat3::mat3(const mat4& mat) vec3(mat[1].x, mat[1].y, mat[1].z), vec3(mat[2].x, mat[2].y, mat[2].z)) {} -IMPLICIT mat3_scalar::mat3_scalar(const mat4_scalar& mat) +mat3_scalar::mat3_scalar(const mat4_scalar& mat) : mat3_scalar(vec3_scalar(mat[0].x, mat[0].y, mat[0].z), vec3_scalar(mat[1].x, mat[1].y, mat[1].z), vec3_scalar(mat[2].x, mat[2].y, mat[2].z)) {} -IMPLICIT mat2::mat2(const mat4& mat) +mat2::mat2(const mat4& mat) : mat2(vec2(mat[0].x, mat[0].y), vec2(mat[1].x, mat[1].y)) {} -IMPLICIT mat2_scalar::mat2_scalar(const mat4_scalar& mat) +mat2_scalar::mat2_scalar(const mat4_scalar& mat) : mat2_scalar(vec2_scalar(mat[0].x, mat[0].y), vec2_scalar(mat[1].x, mat[1].y)) {} @@ -2584,6 +2195,256 @@ SI mat4 if_then_else(I32 c, mat4 t, mat4 e) { SI mat4 if_then_else(int32_t c, mat4 t, mat4 e) { return c ? 
t : e; } +SI I32 clampCoord(I32 coord, int limit) { +#if USE_SSE2 + return _mm_min_epi16(_mm_max_epi16(coord, _mm_setzero_si128()), + _mm_set1_epi32(limit - 1)); +#else + return clamp(coord, 0, limit - 1); +#endif +} +SI int clampCoord(int coord, int limit) { + return min(max(coord, 0), limit - 1); +} +template <typename T, typename S> +SI T clamp2D(T P, S sampler) { + return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)}; +} +template <typename T> +SI T clamp2DArray(T P, sampler2DArray sampler) { + return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height), + clampCoord(P.z, sampler->depth)}; +} + +float to_float(uint32_t x) { return x * (1.f / 255.f); } + +vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + U32 pixels = {a, b, c, d}; + return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF), + cast(pixels & 0xFF), cast(pixels >> 24)) * + (1.0f / 255.0f); +} + +vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) { + return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y}, + Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w}); +} + +ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) { + return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y}, + I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w}); +} + +vec4_scalar pixel_to_vec4(uint32_t p) { + U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24}; + Float f = cast(i) * (1.0f / 255.0f); + return vec4_scalar(f.x, f.y, f.z, f.w); +} + +template <typename S> +SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) { + return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y], + sampler->buf[offset.z], sampler->buf[offset.w]); +} + +vec4 texelFetchRGBA8(sampler2D sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +vec4 texelFetchRGBA8(sampler2DArray sampler, ivec3 P) { + assert(test_all(P.z == P.z.x)); + I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +template <typename S> +SI Float fetchOffsetsR8(S sampler, I32 offset) { + U32 i = { + ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y], + ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]}; + return cast(i) * (1.0f / 255.0f); +} + +vec4 texelFetchR8(sampler2D sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); +} + +vec4 texelFetchR8(sampler2DArray sampler, ivec3 P) { + assert(test_all(P.z == P.z.x)); + I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride; + return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); +} + +template <typename S> +SI vec4 fetchOffsetsFloat(S sampler, I32 offset) { + return pixel_float_to_vec4( + *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y], + *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]); +} + +vec4 texelFetchFloat(sampler2D sampler, ivec2 P) { + I32 offset = P.x * 4 + P.y * sampler->stride; + return fetchOffsetsFloat(sampler, offset); +} + +SI vec4 texelFetchFloat(sampler2DArray sampler, ivec3 P) { + assert(test_all(P.z == P.z.x)); + I32 offset = P.x * 4 + P.y * sampler->stride + P.z.x * sampler->height_stride; + return fetchOffsetsFloat(sampler, offset); +} + +vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return 
texelFetchFloat(sampler, P); + } else if (sampler->format == TextureFormat::RGBA8) { + return texelFetchRGBA8(sampler, P); + } else { + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); + } +} + +vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return texelFetchFloat(sampler, P); +} + +vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); +} + +vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); +} + +vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; + } else { + assert(sampler->format == TextureFormat::RGBA8); + return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); + } +} + +vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); +} + +vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::R8); + return vec4_scalar{ + to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f, + 0.0f, 0.0f}; +} + +vec4 texelFetch(sampler2DRect sampler, ivec2 P) { + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +SI vec4 texelFetch(sampler2DArray sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return texelFetchFloat(sampler, P); + } else if (sampler->format == TextureFormat::R8) { + return texelFetchR8(sampler, P); + } else { + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); + } +} + +vec4 texelFetch(sampler2DArrayRGBA32F sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return texelFetchFloat(sampler, P); +} + +vec4 texelFetch(sampler2DArrayRGBA8 sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); +} + +vec4 texelFetch(sampler2DArrayR8 sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); +} + +template <typename S> +SI ivec4 fetchOffsetsInt(S sampler, I32 offset) { + return pixel_int_to_ivec4( + *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y], + *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]); +} + +ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) { + assert(lod 
== 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32I); + I32 offset = P.x * 4 + P.y * sampler->stride; + return fetchOffsetsInt(sampler, offset); +} + +ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32I); + return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x, + int max_x, int min_y, int max_y) { + P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); + P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); + assert(sampler->format == TextureFormat::RGBA32F); + return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x, + int max_x, int min_y, int max_y) { + P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); + P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); + assert(sampler->format == TextureFormat::RGBA32I); + return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +#define texelFetchOffset(sampler, P, lod, offset) \ + texelFetch(sampler, (P) + (offset), lod) + template <typename T, typename U, typename A, typename R = typename T::vector_type> SI R mix(T x, U y, A a) { @@ -2598,19 +2459,416 @@ SI T mix(T x, T y, float a) { } template <typename T> -SI T mix(T x, T y, vec2_scalar a) { - return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y)}; +SI T mix(T x, T y, vec4_scalar a) { + return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z), + mix(x.w, y.w, a.w)}; } +// Scale texture coords for quantization, subtract offset for filtering +// (assuming coords already offset to texel centers), and round to nearest +// 1/scale increment template <typename T> -SI T mix(T x, T y, vec3_scalar a) { - return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z)}; +SI T linearQuantize(T P, float scale) { + return P * scale + (0.5f - 0.5f * scale); } -template <typename T> -SI T mix(T x, T y, vec4_scalar a) { - return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z), - mix(x.w, y.w, a.w)}; +// Helper version that also scales normalized texture coords for sampler +template <typename T, typename S> +SI T linearQuantize(T P, float scale, S sampler) { + P.x *= sampler->width; + P.y *= sampler->height; + return linearQuantize(P, scale); +} + +template <typename S> +vec4 textureLinearRGBA8(S sampler, vec2 P, int32_t zoffset = 0) { + assert(sampler->format == TextureFormat::RGBA8); + +#if USE_SSE2 + ivec2 i(linearQuantize(P, 256, sampler)); + ivec2 frac = i & (I32)0xFF; + i >>= 8; + + // Pack coords so they get clamped into range, and also for later bounding + // of fractional coords. Store Y as low-bits for easier access, X as high. + __m128i yx = _mm_packs_epi32(i.y, i.x); + __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1), + _mm_set1_epi32(sampler->width - 1)); + // Clamp coords to valid range to prevent sampling outside texture. + __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw); + // Multiply clamped Y by stride and add X offset. 
+ __m128i row0 = _mm_madd_epi16( + _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()), + _mm_set1_epi16(sampler->stride)); + row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128())); + // Add in layer offset if available + row0 = _mm_add_epi32(row0, _mm_set1_epi32(zoffset)); + + // Check if fractional coords are all zero, in which case skip filtering. + __m128i fracyx = _mm_packs_epi32(frac.y, frac.x); + if (!_mm_movemask_epi8(_mm_cmpgt_epi16(fracyx, _mm_setzero_si128()))) { + return fetchOffsetsRGBA8(sampler, row0); + } + + // Check if coords were clamped at all above. If so, need to adjust fractions + // to avoid sampling outside the texture on the edges. + __m128i yxinside = _mm_andnot_si128( + _mm_cmplt_epi16(yx, _mm_setzero_si128()), + _mm_cmplt_epi16(yx, hw)); + // Set fraction to zero when outside. + fracyx = _mm_and_si128(fracyx, yxinside); + // Store two side-by-side copies of X fraction, as below each pixel value + // will be interleaved to be next to the pixel value for the next row. + __m128i fracx = _mm_unpackhi_epi16(fracyx, fracyx); + // For Y fraction, we need to store 1-fraction before each fraction, as a + // madd will be used to weight and collapse all results as last step. + __m128i fracy = _mm_unpacklo_epi16( + _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx); + + // Ensure we don't sample row off end of texture from added stride. + __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride)); + + // Load two adjacent pixels on each row and interleave them. + // r0,g0,b0,a0,r1,g1,b1,a1 \/ R0,G0,B0,A0,R1,G1,B1,A1 + // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1 +# define LOAD_LANE(out, idx) \ + { \ + uint32_t* buf = &sampler->buf[_mm_cvtsi128_si32( \ + _mm_shuffle_epi32(row0, _MM_SHUFFLE(idx, idx, idx, idx)))]; \ + out = _mm_unpacklo_epi8( \ + _mm_loadl_epi64((__m128i*)buf), \ + _mm_loadl_epi64((__m128i*)(buf + _mm_extract_epi16(row1, idx)))); \ + } + __m128i x, y, z, w; + LOAD_LANE(x, 0) + LOAD_LANE(y, 1) + LOAD_LANE(z, 2) + LOAD_LANE(w, 3) +# undef LOAD_LANE + + // Need to transpose the data from AoS to SoA format. Best to do this here + // while the data is still packed into 8-bit components, requiring fewer + // insns. + // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1 \/ + // r2,R2,g2,G2,b2,B2,a2,A2,r3,R3,g3,G3,b3,B3,a3,A3 + // ... r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2 + // ... r1,R1,r3,R3,g1,G1,g3,G3,b1,B1,b3,B3,a1,A1,a3,A3 + __m128i xy0 = _mm_unpacklo_epi16(x, y); + __m128i xy1 = _mm_unpackhi_epi16(x, y); + __m128i zw0 = _mm_unpacklo_epi16(z, w); + __m128i zw1 = _mm_unpackhi_epi16(z, w); + // r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2 \/ + // r4,R4,r6,R6,g4,G4,g6,G6,b4,B4,b6,B6,a4,A4,a6,A6 + // ... r0,R0,r2,R2,r4,R4,r6,R6,g0,G0,g2,G2,g4,G4,g6,G6 + // ... b0,B0,b2,B2,b4,B4,b6,B6,a0,A0,a2,A2,a4,A4,a6,A6 + __m128i rg0 = _mm_unpacklo_epi32(xy0, zw0); + __m128i ba0 = _mm_unpackhi_epi32(xy0, zw0); + __m128i rg1 = _mm_unpacklo_epi32(xy1, zw1); + __m128i ba1 = _mm_unpackhi_epi32(xy1, zw1); + + // Expand packed SoA pixels for each column. Multiply then add columns with + // 8-bit precision so we don't carry to high byte of word accidentally. Use + // final madd insn to blend interleaved rows and expand result to 32 bits. 
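The packed filtering above is easier to follow against a plain scalar equivalent. The sketch below is illustrative only and not part of the patch: sample_r8_linear and clampi are invented names, it handles a single 8-bit channel rather than packed RGBA8, and it omits the edge-fraction zeroing the real code performs. It mirrors the SSE2 scheme of quantizing coordinates to a texel index plus a 1/256 fraction, lerping across X at 8-bit precision, then weighting the two rows by the Y fraction so the result spans 0..0xFF00 and is normalized by 1/0xFF00.

static int clampi(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); }

static float sample_r8_linear(const unsigned char* buf, int stride, int width,
                              int height, float u, float v) {
  // Quantize: scale by 256 and offset by (0.5 - 0.5 * 256) so a coordinate at
  // a texel center lands exactly on that texel with a zero fraction.
  int iu = (int)(u * width * 256.0f + (0.5f - 0.5f * 256.0f));
  int iv = (int)(v * height * 256.0f + (0.5f - 0.5f * 256.0f));
  int fx = iu & 0xFF, fy = iv & 0xFF;
  int x0 = clampi(iu >> 8, 0, width - 1), x1 = clampi(x0 + 1, 0, width - 1);
  int y0 = clampi(iv >> 8, 0, height - 1), y1 = clampi(y0 + 1, 0, height - 1);
  // Lerp across X on both rows, keeping 8-bit precision.
  int r0 = buf[y0 * stride + x0];
  r0 += ((buf[y0 * stride + x1] - r0) * fx) >> 8;
  int r1 = buf[y1 * stride + x0];
  r1 += ((buf[y1 * stride + x1] - r1) * fx) >> 8;
  // Weight the rows by the Y fraction; the result spans 0..0xFF00.
  int c = r0 * (256 - fy) + r1 * fy;
  return c * (1.0f / 0xFF00);
}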
+# define FILTER_COMPONENT(out, unpack, src0, src1) \ + { \ + __m128i cc0 = unpack(src0, _mm_setzero_si128()); \ + __m128i cc1 = unpack(src1, _mm_setzero_si128()); \ + cc0 = _mm_add_epi8( \ + cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracx), \ + 8)); \ + out = _mm_cvtepi32_ps(_mm_madd_epi16(cc0, fracy)); \ + } + __m128 fr, fg, fb, fa; + FILTER_COMPONENT(fr, _mm_unpacklo_epi8, rg0, rg1); + FILTER_COMPONENT(fg, _mm_unpackhi_epi8, rg0, rg1); + FILTER_COMPONENT(fb, _mm_unpacklo_epi8, ba0, ba1); + FILTER_COMPONENT(fa, _mm_unpackhi_epi8, ba0, ba1); +# undef FILTER_COMPONENT + + return vec4(fb, fg, fr, fa) * (1.0f / 0xFF00); +#else + ivec2 i(linearQuantize(P, 128, sampler)); + ivec2 frac = i & (I32)0x7F; + i >>= 7; + + I32 row0 = clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset; + I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); + I16 fracx = + CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16); + I16 fracy = CONVERT(frac.y, I16); + + auto a0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.x]), V8<int16_t>); + auto a1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.x]), V8<int16_t>); + a0 += ((a1 - a0) * fracy.x) >> 7; + + auto b0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.y]), V8<int16_t>); + auto b1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.y]), V8<int16_t>); + b0 += ((b1 - b0) * fracy.y) >> 7; + + auto abl = zipLow(a0, b0); + auto abh = zipHigh(a0, b0); + abl += ((abh - abl) * fracx.xyxyxyxy) >> 7; + + auto c0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.z]), V8<int16_t>); + auto c1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.z]), V8<int16_t>); + c0 += ((c1 - c0) * fracy.z) >> 7; + + auto d0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.w]), V8<int16_t>); + auto d1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.w]), V8<int16_t>); + d0 += ((d1 - d0) * fracy.w) >> 7; + + auto cdl = zipLow(c0, d0); + auto cdh = zipHigh(c0, d0); + cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7; + + auto rg = CONVERT(V8<uint16_t>(zip2Low(abl, cdl)), V8<float>); + auto ba = CONVERT(V8<uint16_t>(zip2High(abl, cdl)), V8<float>); + + auto r = lowHalf(rg); + auto g = highHalf(rg); + auto b = lowHalf(ba); + auto a = highHalf(ba); + return vec4(b, g, r, a) * (1.0f / 255.0f); +#endif +} + +template <typename S> +static U16 textureLinearPackedR8(S sampler, ivec2 i, int32_t zoffset) { + assert(sampler->format == TextureFormat::R8); + ivec2 frac = i & (I32)0x7F; + i >>= 7; + + I32 row0 = clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset; + I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); + I16 fracx = + CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16); + I16 fracy = CONVERT(frac.y, I16); + + uint8_t* buf = (uint8_t*)sampler->buf; + auto a0 = unaligned_load<V2<uint8_t> >(&buf[row0.x]); + auto b0 = unaligned_load<V2<uint8_t> >(&buf[row0.y]); + auto c0 = unaligned_load<V2<uint8_t> >(&buf[row0.z]); + auto d0 = unaligned_load<V2<uint8_t> >(&buf[row0.w]); + auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>); + + auto a1 = unaligned_load<V2<uint8_t> >(&buf[row1.x]); + auto b1 = unaligned_load<V2<uint8_t> >(&buf[row1.y]); + auto c1 = unaligned_load<V2<uint8_t> >(&buf[row1.z]); + auto d1 = unaligned_load<V2<uint8_t> >(&buf[row1.w]); + auto abcd1 = 
CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>); + + abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7; + + abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); + auto abcdl = lowHalf(abcd0); + auto abcdh = highHalf(abcd0); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + return U16(abcdl); +} + +template <typename S> +vec4 textureLinearR8(S sampler, vec2 P, int32_t zoffset = 0) { + assert(sampler->format == TextureFormat::R8); + +#if USE_SSE2 + ivec2 i(linearQuantize(P, 256, sampler)); + ivec2 frac = i & (I32)0xFF; + i >>= 8; + + // Pack coords so they get clamped into range, and also for later bounding + // of fractional coords. Store Y as low-bits for easier access, X as high. + __m128i yx = _mm_packs_epi32(i.y, i.x); + __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1), + _mm_set1_epi32(sampler->width - 1)); + // Clamp coords to valid range to prevent sampling outside texture. + __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw); + // Multiply clamped Y by stride and add X offset. + __m128i row0 = _mm_madd_epi16( + _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()), + _mm_set1_epi16(sampler->stride)); + row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128())); + // Add in layer offset if available + row0 = _mm_add_epi32(row0, _mm_set1_epi32(zoffset)); + + __m128i fracyx = _mm_packs_epi32(frac.y, frac.x); + + // Check if coords were clamped at all above. If so, need to adjust fractions + // to avoid sampling outside the texture on the edges. + __m128i yxinside = _mm_andnot_si128( + _mm_cmplt_epi16(yx, _mm_setzero_si128()), + _mm_cmplt_epi16(yx, hw)); + // Set fraction to zero when outside. + fracyx = _mm_and_si128(fracyx, yxinside); + // For X fraction, we need to store 1-fraction before each fraction, as a + // madd will be used to weight and collapse all results as last step. + __m128i fracx = _mm_unpackhi_epi16( + _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx); + // Store two side-by-side copies of Y fraction, as below each pixel value + // will be interleaved to be next to the pixel value for the next column. + __m128i fracy = _mm_unpacklo_epi16(fracyx, fracyx); + + // Ensure we don't sample row off end of texture from added stride. + __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride)); + + // Calculate pointers for first row in each lane + uint8_t* buf = (uint8_t*)sampler->buf; + uint8_t* buf0 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(0, 0, 0, 0))); + uint8_t* buf1 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(1, 1, 1, 1))); + uint8_t* buf2 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(2, 2, 2, 2))); + uint8_t* buf3 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 3, 3, 3))); + // Load adjacent columns from first row, pack into register, then expand. + __m128i cc0 = _mm_unpacklo_epi8( + _mm_setr_epi16(*(uint16_t*)buf0, *(uint16_t*)buf1, *(uint16_t*)buf2, + *(uint16_t*)buf3, 0, 0, 0, 0), + _mm_setzero_si128()); + // Load adjacent columns from next row, pack into register, then expand. + __m128i cc1 = _mm_unpacklo_epi8( + _mm_setr_epi16(*(uint16_t*)(buf0 + _mm_extract_epi16(row1, 0)), + *(uint16_t*)(buf1 + _mm_extract_epi16(row1, 1)), + *(uint16_t*)(buf2 + _mm_extract_epi16(row1, 2)), + *(uint16_t*)(buf3 + _mm_extract_epi16(row1, 3)), + 0, 0, 0, 0), + _mm_setzero_si128()); + // Multiply then add rows with 8-bit precision so we don't carry to high byte + // of word accidentally. 
Use final madd insn to blend interleaved columns and + // expand result to 32 bits. + __m128i cc = _mm_add_epi8( + cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracy), 8)); + __m128 r = _mm_cvtepi32_ps(_mm_madd_epi16(cc, fracx)); + return vec4((Float)r * (1.0f / 0xFF00), 0.0f, 0.0f, 1.0f); +#else + ivec2 i(linearQuantize(P, 128, sampler)); + Float r = CONVERT(textureLinearPackedR8(sampler, i, zoffset), Float); + return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f); +#endif +} + +template <typename S> +vec4 textureLinearRGBA32F(S sampler, vec2 P, int32_t zoffset = 0) { + assert(sampler->format == TextureFormat::RGBA32F); + P.x *= sampler->width; + P.y *= sampler->height; + P -= 0.5f; + vec2 f = floor(P); + vec2 r = P - f; + ivec2 i(f); + ivec2 c = clamp2D(i, sampler); + r.x = if_then_else(i.x >= 0 && i.x < sampler->width - 1, r.x, 0.0f); + I32 offset0 = c.x * 4 + c.y * sampler->stride + zoffset; + I32 offset1 = offset0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); + + Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x], + *(Float*)&sampler->buf[offset0.x + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.x], + *(Float*)&sampler->buf[offset1.x + 4], r.x), + r.y); + Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y], + *(Float*)&sampler->buf[offset0.y + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.y], + *(Float*)&sampler->buf[offset1.y + 4], r.x), + r.y); + Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z], + *(Float*)&sampler->buf[offset0.z + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.z], + *(Float*)&sampler->buf[offset1.z + 4], r.x), + r.y); + Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w], + *(Float*)&sampler->buf[offset0.w + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.w], + *(Float*)&sampler->buf[offset1.w + 4], r.x), + r.y); + return pixel_float_to_vec4(c0, c1, c2, c3); +} + +SI vec4 texture(sampler2D sampler, vec2 P) { + if (sampler->filter == TextureFilter::LINEAR) { + if (sampler->format == TextureFormat::RGBA8) { + return textureLinearRGBA8(sampler, P); + } else if (sampler->format == TextureFormat::R8) { + return textureLinearR8(sampler, P); + } else { + assert(sampler->format == TextureFormat::RGBA32F); + return textureLinearRGBA32F(sampler, P); + } + } else { + ivec2 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height)); + return texelFetch(sampler, coord, 0); + } +} + +vec4 texture(sampler2DRect sampler, vec2 P) { + assert(sampler->format == TextureFormat::RGBA8); + if (sampler->filter == TextureFilter::LINEAR) { + return textureLinearRGBA8(sampler, + P * vec2_scalar{1.0f / sampler->width, 1.0f / sampler->height}); + } else { + ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f)); + return texelFetch(sampler, coord); + } +} + +SI vec4 texture(sampler2DArray sampler, vec3 P) { + if (sampler->filter == TextureFilter::LINEAR) { + // SSE2 can generate slow code for 32-bit multiply, and we never actually sample + // from different layers in one chunk, so do cheaper scalar multiplication instead. 
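For comparison with the packed fixed-point paths, textureLinearRGBA32F above is the textbook bilinear blend written with nested mixes. A minimal scalar sketch for one channel (not part of the patch; bilerp is an invented name), where t00/t10/t01/t11 are the four neighboring texels and fx/fy the fractional position between them:

static float bilerp(float t00, float t10, float t01, float t11,
                    float fx, float fy) {
  float top = t00 + (t10 - t00) * fx;     // mix along X on the first row
  float bottom = t01 + (t11 - t01) * fx;  // mix along X on the second row
  return top + (bottom - top) * fy;       // mix the two rows along Y
}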
+ assert(test_all(P.z == P.z.x)); + int32_t zoffset = + clampCoord(roundeven(P.z.x, 1.0f), sampler->depth) * sampler->height_stride; + if (sampler->format == TextureFormat::RGBA8) { + return textureLinearRGBA8(sampler, vec2(P.x, P.y), zoffset); + } else if (sampler->format == TextureFormat::R8) { + return textureLinearR8(sampler, vec2(P.x, P.y), zoffset); + } else { + assert(sampler->format == TextureFormat::RGBA32F); + return textureLinearRGBA32F(sampler, vec2(P.x, P.y), zoffset); + } + } else { + // just do nearest for now + ivec3 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height), + roundeven(P.z, 1.0f)); + return texelFetch(sampler, coord, 0); + } +} + +vec4 texture(sampler2DArray sampler, vec3 P, float bias) { + assert(bias == 0.0f); + return texture(sampler, P); +} + +vec4 textureLod(sampler2DArray sampler, vec3 P, float lod) { + assert(lod == 0.0f); + return texture(sampler, P); +} + +ivec3_scalar textureSize(sampler2DArray sampler, int) { + return ivec3_scalar{int32_t(sampler->width), int32_t(sampler->height), + int32_t(sampler->depth)}; +} + +ivec2_scalar textureSize(sampler2D sampler, int) { + return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; +} + +ivec2_scalar textureSize(sampler2DRect sampler) { + return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; } ivec4 ivec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { @@ -2675,30 +2933,15 @@ SI T mix(T x, T y, bvec4_scalar a) { } template <typename T> -SI T mix(T x, T y, bvec4_scalar1 a) { - return a.x ? y : x; -} - -template <typename T> SI T mix(T x, T y, bvec3_scalar a) { return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z}; } template <typename T> -SI T mix(T x, T y, bvec3_scalar1 a) { - return a.x ? y : x; -} - -template <typename T> SI T mix(T x, T y, bvec2_scalar a) { return T{a.x ? y.x : x.x, a.y ? y.y : x.y}; } -template <typename T> -SI T mix(T x, T y, bvec2_scalar1 a) { - return a.x ? 
y : x; -} - float dot(vec3_scalar a, vec3_scalar b) { return a.x * b.x + a.y * b.y + a.z * b.z; } @@ -2736,28 +2979,7 @@ Float atan(Float v) { return {atanf(v.x), atanf(v.y), atanf(v.z), atanf(v.w)}; } float atan(float a, float b) { return atan2f(a, b); } Float atan(Float a, Float b) { - return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z), - atan2f(a.w, b.w)}; -} - -bvec4 equal(vec4 x, vec4 y) { - return bvec4(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z), - equal(x.w, y.w)); -} - -bvec4_scalar equal(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z), - equal(x.w, y.w)); -} - -bvec4 notEqual(vec4 x, vec4 y) { - return bvec4(notEqual(x.x, y.x), notEqual(x.y, y.y), notEqual(x.z, y.z), - notEqual(x.w, y.w)); -} - -bvec4_scalar notEqual(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y), - notEqual(x.z, y.z), notEqual(x.w, y.w)); + return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z), atan2f(a.w, b.w)}; } bvec4 notEqual(ivec4 a, ivec4 b) { @@ -2783,18 +3005,12 @@ vec2 abs(vec2 v) { return vec2(abs(v.x), abs(v.y)); } vec2_scalar abs(vec2_scalar v) { return vec2_scalar{fabsf(v.x), fabsf(v.y)}; } -vec2 sign(vec2 v) { return vec2(sign(v.x), sign(v.y)); } - -vec2_scalar sign(vec2_scalar v) { return vec2_scalar{sign(v.x), sign(v.y)}; } - Float mod(Float a, Float b) { return a - b * floor(a / b); } vec2 mod(vec2 a, vec2 b) { return vec2(mod(a.x, b.x), mod(a.y, b.y)); } vec3 abs(vec3 v) { return vec3(abs(v.x), abs(v.y), abs(v.z)); } -vec3 sign(vec3 v) { return vec3(sign(v.x), sign(v.y), sign(v.z)); } - mat2 inverse(mat2 v) { Float det = v[0].x * v[1].y - v[0].y * v[1].x; return mat2(vec2(v[1].y, -v[0].y), vec2(-v[1].x, v[0].x)) * (1. / det); diff --git a/third_party/webrender/swgl/src/lib.rs b/third_party/webrender/swgl/src/lib.rs index e8fc030e0c9..e19e85fd512 100644 --- a/third_party/webrender/swgl/src/lib.rs +++ b/third_party/webrender/swgl/src/lib.rs @@ -5,7 +5,7 @@ #![crate_name = "swgl"] #![crate_type = "lib"] -extern crate gleam; +use gleam; mod swgl_fns; diff --git a/third_party/webrender/swgl/src/program.h b/third_party/webrender/swgl/src/program.h index 9ea7c6dd6eb..80e5a5b68f7 100644 --- a/third_party/webrender/swgl/src/program.h +++ b/third_party/webrender/swgl/src/program.h @@ -12,12 +12,6 @@ namespace glsl { // to operate in Float-sized chunks. typedef vec3 Interpolants; -// Clip distances, if enabled, are always stored in the first SIMD chunk of the -// interpolants. -static ALWAYS_INLINE Float get_clip_distances(const Interpolants& interp) { - return interp.x; -} - struct VertexShaderImpl; struct FragmentShaderImpl; @@ -29,14 +23,10 @@ struct ProgramImpl { virtual size_t interpolants_size() const = 0; virtual VertexShaderImpl* get_vertex_shader() = 0; virtual FragmentShaderImpl* get_fragment_shader() = 0; - virtual const char* get_name() const = 0; }; typedef ProgramImpl* (*ProgramLoader)(); -// The maximum size of the gl_ClipDistance array. 
-constexpr int32_t gl_MaxClipDistances = 4; - struct VertexShaderImpl { typedef void (*SetUniform1iFunc)(VertexShaderImpl*, int index, int value); typedef void (*SetUniform4fvFunc)(VertexShaderImpl*, int index, @@ -56,17 +46,7 @@ struct VertexShaderImpl { LoadAttribsFunc load_attribs_func = nullptr; RunPrimitiveFunc run_primitive_func = nullptr; - enum FLAGS { - CLIP_DISTANCE = 1 << 0, - }; - int flags = 0; - void enable_clip_distance() { flags |= CLIP_DISTANCE; } - ALWAYS_INLINE bool use_clip_distance() const { - return (flags & CLIP_DISTANCE) != 0; - } - vec4 gl_Position; - Float gl_ClipDistance[gl_MaxClipDistances]; void set_uniform_1i(int index, int value) { (*set_uniform_1i_func)(this, index, value); @@ -92,20 +72,18 @@ struct VertexShaderImpl { } }; -// The number of pixels in a step. -constexpr int32_t swgl_StepSize = 4; - struct FragmentShaderImpl { typedef void (*InitSpanFunc)(FragmentShaderImpl*, const void* interps, - const void* step); + const void* step, float step_width); typedef void (*RunFunc)(FragmentShaderImpl*); - typedef void (*SkipFunc)(FragmentShaderImpl*, int steps); + typedef void (*SkipFunc)(FragmentShaderImpl*, int chunks); typedef void (*InitSpanWFunc)(FragmentShaderImpl*, const void* interps, - const void* step); + const void* step, float step_width); typedef void (*RunWFunc)(FragmentShaderImpl*); - typedef void (*SkipWFunc)(FragmentShaderImpl*, int steps); - typedef int (*DrawSpanRGBA8Func)(FragmentShaderImpl*); - typedef int (*DrawSpanR8Func)(FragmentShaderImpl*); + typedef void (*SkipWFunc)(FragmentShaderImpl*, int chunks); + typedef void (*DrawSpanRGBA8Func)(FragmentShaderImpl*, uint32_t* buf, + int len); + typedef void (*DrawSpanR8Func)(FragmentShaderImpl*, uint8_t* buf, int len); InitSpanFunc init_span_func = nullptr; RunFunc run_func = nullptr; @@ -129,27 +107,31 @@ struct FragmentShaderImpl { } vec4 gl_FragCoord; + vec2_scalar stepZW; + Bool isPixelDiscarded = false; vec4 gl_FragColor; vec4 gl_SecondaryFragColor; - vec2_scalar swgl_StepZW; - Bool swgl_IsPixelDiscarded = false; - // The current buffer position for committing span output. - uint32_t* swgl_OutRGBA8 = nullptr; - uint8_t* swgl_OutR8 = nullptr; - // The remaining number of pixels in the span. - int32_t swgl_SpanLength = 0; + ALWAYS_INLINE void step_fragcoord() { gl_FragCoord.x += 4; } - ALWAYS_INLINE void step_fragcoord(int steps = 4) { gl_FragCoord.x += steps; } + ALWAYS_INLINE void step_fragcoord(int chunks) { + gl_FragCoord.x += 4 * chunks; + } + + ALWAYS_INLINE void step_perspective() { + gl_FragCoord.z += stepZW.x; + gl_FragCoord.w += stepZW.y; + } - ALWAYS_INLINE void step_perspective(int steps = 4) { - gl_FragCoord.z += swgl_StepZW.x * steps; - gl_FragCoord.w += swgl_StepZW.y * steps; + ALWAYS_INLINE void step_perspective(int chunks) { + gl_FragCoord.z += stepZW.x * chunks; + gl_FragCoord.w += stepZW.y * chunks; } template <bool W = false> - ALWAYS_INLINE void init_span(const void* interps, const void* step) { - (*(W ? init_span_w_func : init_span_func))(this, interps, step); + ALWAYS_INLINE void init_span(const void* interps, const void* step, + float step_width) { + (*(W ? init_span_w_func : init_span_func))(this, interps, step, step_width); } template <bool W = false> @@ -158,24 +140,20 @@ struct FragmentShaderImpl { } template <bool W = false> - ALWAYS_INLINE void skip(int steps = 4) { - (*(W ? skip_w_func : skip_func))(this, steps); + ALWAYS_INLINE void skip(int chunks = 1) { + (*(W ? 
skip_w_func : skip_func))(this, chunks); } - ALWAYS_INLINE int draw_span(uint32_t* buf, int len) { - swgl_OutRGBA8 = buf; - swgl_SpanLength = len; - return (*draw_span_RGBA8_func)(this); + ALWAYS_INLINE void draw_span(uint32_t* buf, int len) { + (*draw_span_RGBA8_func)(this, buf, len); } ALWAYS_INLINE bool has_draw_span(uint32_t*) { return draw_span_RGBA8_func != nullptr; } - ALWAYS_INLINE int draw_span(uint8_t* buf, int len) { - swgl_OutR8 = buf; - swgl_SpanLength = len; - return (*draw_span_R8_func)(this); + ALWAYS_INLINE void draw_span(uint8_t* buf, int len) { + (*draw_span_R8_func)(this, buf, len); } ALWAYS_INLINE bool has_draw_span(uint8_t*) { diff --git a/third_party/webrender/swgl/src/rasterize.h b/third_party/webrender/swgl/src/rasterize.h deleted file mode 100644 index 48f3b9e5898..00000000000 --- a/third_party/webrender/swgl/src/rasterize.h +++ /dev/null @@ -1,1670 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// The SWGL depth buffer is roughly organized as a span buffer where each row -// of the depth buffer is a list of spans, and each span has a constant depth -// and a run length (represented by DepthRun). The span from start..start+count -// is placed directly at that start index in the row's array of runs, so that -// there is no need to explicitly record the start index at all. This also -// avoids the need to move items around in the run array to manage insertions -// since space is implicitly always available for a run between any two -// pre-existing runs. Linkage from one run to the next is implicitly defined by -// the count, so if a run exists from start..start+count, the next run will -// implicitly pick up right at index start+count where that preceding run left -// off. All of the DepthRun items that are after the head of the run can remain -// uninitialized until the run needs to be split and a new run needs to start -// somewhere in between. -// For uses like perspective-correct rasterization or with a discard mask, a -// run is not an efficient representation, and it is more beneficial to have -// a flattened array of individual depth samples that can be masked off easily. -// To support this case, the first run in a given row's run array may have a -// zero count, signaling that this entire row is flattened. Critically, the -// depth and count fields in DepthRun are ordered (endian-dependently) so that -// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It -// is then possible to just treat the entire row as an array of int32_t depth -// samples that can be processed with SIMD comparisons, since the count field -// behaves as just the sign-extension of the depth field. The count field is -// limited to 8 bits so that we can support depth values up to 24 bits. -// When a depth buffer is cleared, each row is initialized to a maximal runs -// spanning the entire row. In the normal case, the depth buffer will continue -// to manage itself as a list of runs. If perspective or discard is used for -// a given row, the row will be converted to the flattened representation to -// support it, after which it will only ever revert back to runs if the depth -// buffer is cleared. - -// The largest 24-bit depth value supported. -constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF; -// The longest 8-bit depth run that is supported, aligned to SIMD chunk size. 
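The run encoding described above can be summarized with a small standalone sketch (not part of the patch; Run and walk_runs are invented names, and the field order shown matches the little-endian case). Each run is stored at its own start index, the next run begins count entries later, and a zero count on the first entry marks a row that has been flattened into plain depth samples:

struct Run {
  unsigned depth : 24;  // depth value for every sample covered by the run
  unsigned count : 8;   // run length; 0 on the first entry means "flattened"
};

static void walk_runs(const Run* runs, int width) {
  if (runs->count == 0) {
    return;  // flattened row: the entries are really just int32 depth samples
  }
  for (int x = 0; x < width;) {
    const Run& r = runs[x];
    // r covers samples x .. x + r.count - 1 at constant depth r.depth.
    x += r.count;  // implicit linkage: the next run starts where this one ends
  }
}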
-constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3; - -struct DepthRun { - // Ensure that depth always occupies the LSB and count the MSB so that we - // can sign-extend depth just by setting count to zero, marking it flat. - // When count is non-zero, then this is interpreted as an actual run and - // depth is read in isolation. -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - uint32_t depth : 24; - uint32_t count : 8; -#else - uint32_t count : 8; - uint32_t depth : 24; -#endif - - DepthRun() = default; - DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {} - - // If count is zero, this is actually a flat depth sample rather than a run. - bool is_flat() const { return !count; } - - // Compare a source depth from rasterization with a stored depth value. - template <int FUNC> - ALWAYS_INLINE bool compare(uint32_t src) const { - switch (FUNC) { - case GL_LEQUAL: - return src <= depth; - case GL_LESS: - return src < depth; - case GL_ALWAYS: - return true; - default: - assert(false); - return false; - } - } -}; - -// Fills runs at the given position with the given depth up to the span width. -static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth, - uint32_t width) { - // If the width exceeds the maximum run size, then we need to output clamped - // runs first. - for (; width >= MAX_DEPTH_RUN; - runs += MAX_DEPTH_RUN, width -= MAX_DEPTH_RUN) { - *runs = DepthRun(depth, MAX_DEPTH_RUN); - } - // If there are still any left over samples to fill under the maximum run - // size, then output one last run for them. - if (width > 0) { - *runs = DepthRun(depth, width); - } -} - -// A cursor for reading and modifying a row's depth run array. It locates -// and iterates through a desired span within all the runs, testing if -// the depth of this span passes or fails the depth test against existing -// runs. If desired, new runs may be inserted to represent depth occlusion -// from this span in the run array. -struct DepthCursor { - // Current position of run the cursor has advanced to. - DepthRun* cur = nullptr; - // The start of the remaining potential samples in the desired span. - DepthRun* start = nullptr; - // The end of the potential samples in the desired span. - DepthRun* end = nullptr; - - DepthCursor() = default; - - // Construct a cursor with runs for a given row's run array and the bounds - // of the span we wish to iterate within it. - DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count) - : cur(runs), start(&runs[span_offset]), end(start + span_count) { - // This cursor should never iterate over flat runs - assert(!runs->is_flat()); - DepthRun* end_runs = &runs[num_runs]; - // Clamp end of span to end of row - if (end > end_runs) { - end = end_runs; - } - // If the span starts past the end of the row, just advance immediately - // to it to signal that we're done. - if (start >= end_runs) { - cur = end_runs; - start = end_runs; - return; - } - // Otherwise, find the first depth run that contains the start of the span. - // If the span starts after the given run, then we need to keep searching - // through the row to find an appropriate run. The check above already - // guaranteed that the span starts within the row's runs, and the search - // won't fall off the end. - for (;;) { - assert(cur < end); - DepthRun* next = cur + cur->count; - if (start < next) { - break; - } - cur = next; - } - } - - // The cursor is valid if the current position is at the end or if the run - // contains the start position. 
- bool valid() const { - return cur >= end || (cur <= start && start < cur + cur->count); - } - - // Skip past any initial runs that fail the depth test. If we find a run that - // would pass, then return the accumulated length between where we started - // and that position. Otherwise, if we fall off the end, return -1 to signal - // that there are no more passed runs at the end of this failed region and - // so it is safe for the caller to stop processing any more regions in this - // row. - template <int FUNC> - int skip_failed(uint32_t val) { - assert(valid()); - DepthRun* prev = start; - while (cur < end) { - if (cur->compare<FUNC>(val)) { - return start - prev; - } - cur += cur->count; - start = cur; - } - return -1; - } - - // Helper to convert function parameters into template parameters to hoist - // some checks out of inner loops. - ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) { - switch (func) { - case GL_LEQUAL: - return skip_failed<GL_LEQUAL>(val); - case GL_LESS: - return skip_failed<GL_LESS>(val); - default: - assert(false); - return -1; - } - } - - // Find a region of runs that passes the depth test. It is assumed the caller - // has called skip_failed first to skip past any runs that failed the depth - // test. This stops when it finds a run that fails the depth test or we fall - // off the end of the row. If the write mask is enabled, this will insert runs - // to represent this new region that passed the depth test. The length of the - // region is returned. - template <int FUNC, bool MASK> - int check_passed(uint32_t val) { - assert(valid()); - DepthRun* prev = cur; - while (cur < end) { - if (!cur->compare<FUNC>(val)) { - break; - } - DepthRun* next = cur + cur->count; - if (next > end) { - if (MASK) { - // Chop the current run where the end of the span falls, making a new - // run from the end of the span till the next run. The beginning of - // the current run will be folded into the run from the start of the - // passed region before returning below. - *end = DepthRun(cur->depth, next - end); - } - // If the next run starts past the end, then just advance the current - // run to the end to signal that we're now at the end of the row. - next = end; - } - cur = next; - } - // If we haven't advanced past the start of the span region, then we found - // nothing that passed. - if (cur <= start) { - return 0; - } - // If 'end' fell within the middle of a passing run, then 'cur' will end up - // pointing at the new partial run created at 'end' where the passing run - // was split to accommodate starting in the middle. The preceding runs will - // be fixed below to properly join with this new split. - int passed = cur - start; - if (MASK) { - // If the search started from a run before the start of the span, then - // edit that run to meet up with the start. - if (prev < start) { - prev->count = start - prev; - } - // Create a new run for the entirety of the passed samples. - set_depth_runs(start, val, passed); - } - start = cur; - return passed; - } - - // Helper to convert function parameters into template parameters to hoist - // some checks out of inner loops. - template <bool MASK> - ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) { - switch (func) { - case GL_LEQUAL: - return check_passed<GL_LEQUAL, MASK>(val); - case GL_LESS: - return check_passed<GL_LESS, MASK>(val); - default: - assert(false); - return 0; - } - } - - ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) { - return mask ? 
check_passed<true>(val, func) - : check_passed<false>(val, func); - } - - // Fill a region of runs with a given depth value, bypassing any depth test. - ALWAYS_INLINE void fill(uint32_t depth) { - check_passed<GL_ALWAYS, true>(depth); - } -}; - -// Initialize a depth texture by setting the first run in each row to encompass -// the entire row. -void Texture::init_depth_runs(uint32_t depth) { - if (!buf) return; - DepthRun* runs = (DepthRun*)buf; - for (int y = 0; y < height; y++) { - set_depth_runs(runs, depth, width); - runs += stride() / sizeof(DepthRun); - } - set_cleared(true); -} - -// Fill a portion of the run array with flattened depth samples. -static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n, - uint32_t depth) { - fill_n((uint32_t*)dst, n, depth); -} - -// Fills a scissored region of a depth texture with a given depth. -void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) { - if (!buf) return; - assert(cleared()); - IntRect bb = bounds().intersection(scissor - offset); - DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0); - for (int rows = bb.height(); rows > 0; rows--) { - if (bb.width() >= width) { - // If the scissor region encompasses the entire row, reset the row to a - // single run encompassing the entire row. - set_depth_runs(runs, depth, width); - } else if (runs->is_flat()) { - // If the row is flattened, just directly fill the portion of the row. - fill_flat_depth(&runs[bb.x0], bb.width(), depth); - } else { - // Otherwise, if we are still using runs, then set up a cursor to fill - // it with depth runs. - DepthCursor(runs, width, bb.x0, bb.width()).fill(depth); - } - runs += stride() / sizeof(DepthRun); - } -} - -using ZMask = I32; - -#if USE_SSE2 -# define ZMASK_NONE_PASSED 0xFFFF -# define ZMASK_ALL_PASSED 0 -static inline uint32_t zmask_code(ZMask mask) { - return _mm_movemask_epi8(mask); -} -#else -# define ZMASK_NONE_PASSED 0xFFFFFFFFU -# define ZMASK_ALL_PASSED 0 -static inline uint32_t zmask_code(ZMask mask) { - return bit_cast<uint32_t>(CONVERT(mask, U8)); -} -#endif - -// Interprets items in the depth buffer as sign-extended 32-bit depth values -// instead of as runs. Returns a mask that signals which samples in the given -// chunk passed or failed the depth test with given Z value. -template <bool DISCARD> -static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask, - int span = 4) { - // SSE2 does not support unsigned comparison. So ensure Z value is - // sign-extended to int32_t. - I32 dest = unaligned_load<I32>(zbuf); - // Invert the depth test to check which pixels failed and should be discarded. - ZMask mask = ctx->depthfunc == GL_LEQUAL - ? - // GL_LEQUAL: Not(LessEqual) = Greater - ZMask(src > dest) - : - // GL_LESS: Not(Less) = GreaterEqual - ZMask(src >= dest); - // Mask off any unused lanes in the span. 
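The lane masking that follows is compact enough to be easy to misread; in scalar terms it amounts to the sketch below (illustrative only, not part of the patch; mask_unused_lanes is an invented name). For a partial chunk of span pixels (span < 4), every SIMD lane at or past the span is forced to "failed" so the masked store later leaves its destination sample untouched:

static void mask_unused_lanes(int lane_mask[4], int span) {
  for (int i = 0; i < 4; i++) {
    if (span < i + 1) {   // same comparison as ZMask(span) < ZMask{1, 2, 3, 4}
      lane_mask[i] = -1;  // all bits set: treat the lane as failing the test
    }
  }
}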
- mask |= ZMask(span) < ZMask{1, 2, 3, 4}; - if (zmask_code(mask) == ZMASK_NONE_PASSED) { - return false; - } - if (!DISCARD && ctx->depthmask) { - unaligned_store(zbuf, (mask & dest) | (~mask & src)); - } - outmask = mask; - return true; -} - -static ALWAYS_INLINE I32 packDepth() { - return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE); -} - -static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) { - if (ctx->depthmask) { - I32 dest = unaligned_load<I32>(zbuf); - mask |= fragment_shader->swgl_IsPixelDiscarded; - unaligned_store(zbuf, (mask & dest) | (~mask & src)); - } -} - -static ALWAYS_INLINE void mask_output(uint32_t* buf, ZMask zmask, - int span = 4) { - WideRGBA8 r = pack_pixels_RGBA8(); - PackedRGBA8 dst = load_span<PackedRGBA8>(buf, span); - if (blend_key) r = blend_pixels(buf, dst, r, span); - PackedRGBA8 mask = bit_cast<PackedRGBA8>(zmask); - store_span(buf, (mask & dst) | (~mask & pack(r)), span); -} - -template <bool DISCARD> -static ALWAYS_INLINE void discard_output(uint32_t* buf, int span = 4) { - mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span); -} - -template <> -ALWAYS_INLINE void discard_output<false>(uint32_t* buf, int span) { - WideRGBA8 r = pack_pixels_RGBA8(); - if (blend_key) - r = blend_pixels(buf, load_span<PackedRGBA8>(buf, span), r, span); - store_span(buf, pack(r), span); -} - -static ALWAYS_INLINE void mask_output(uint8_t* buf, ZMask zmask, int span = 4) { - WideR8 r = pack_pixels_R8(); - WideR8 dst = unpack(load_span<PackedR8>(buf, span)); - if (blend_key) r = blend_pixels(buf, dst, r, span); - WideR8 mask = packR8(zmask); - store_span(buf, pack((mask & dst) | (~mask & r)), span); -} - -template <bool DISCARD> -static ALWAYS_INLINE void discard_output(uint8_t* buf, int span = 4) { - mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span); -} - -template <> -ALWAYS_INLINE void discard_output<false>(uint8_t* buf, int span) { - WideR8 r = pack_pixels_R8(); - if (blend_key) - r = blend_pixels(buf, unpack(load_span<PackedR8>(buf, span)), r, span); - store_span(buf, pack(r), span); -} - -struct ClipRect { - float x0; - float y0; - float x1; - float y1; - - explicit ClipRect(const IntRect& i) - : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {} - explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) { - // If blending is enabled, set blend_key to reflect the resolved blend - // state for the currently drawn primitive. - if (ctx->blend) { - blend_key = ctx->blend_key; - if (swgl_ClipFlags) { - // If there is a blend override set, replace the blend key with it. - if (swgl_ClipFlags & SWGL_CLIP_FLAG_BLEND_OVERRIDE) { - blend_key = swgl_BlendOverride; - } - // If a clip mask is available, set up blending state to use the clip - // mask. - if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { - assert(swgl_ClipMask->format == TextureFormat::R8); - // Constrain the clip mask bounds to always fall within the clip mask. - swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width), - int(swgl_ClipMask->height)}); - // The clip mask offset is relative to the viewport. - swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset; - // The clip mask bounds are relative to the clip mask offset. - swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset); - // Finally, constrain the clip rectangle by the clip mask bounds. - intersect(swgl_ClipMaskBounds); - // Modify the blend key so that it will use the clip mask while - // blending. 
- restore_clip_mask(); - } - if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { - // Modify the blend key so that it will use AA while blending. - restore_aa(); - } - } - } else { - blend_key = BLEND_KEY_NONE; - swgl_ClipFlags = 0; - } - } - - FloatRange x_range() const { return {x0, x1}; } - - void intersect(const IntRect& c) { - x0 = max(x0, float(c.x0)); - y0 = max(y0, float(c.y0)); - x1 = min(x1, float(c.x1)); - y1 = min(y1, float(c.y1)); - } - - template <typename P> - void set_clip_mask(int x, int y, P* buf) const { - if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { - swgl_SpanBuf = buf; - swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf + - (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride + - (x - swgl_ClipMaskOffset.x); - } - } - - template <typename P> - bool overlaps(int nump, const P* p) const { - // Generate a mask of which side of the clip rect all of a polygon's points - // fall inside of. This is a cheap conservative estimate of whether the - // bounding box of the polygon might overlap the clip rect, rather than an - // exact test that would require multiple slower line intersections. - int sides = 0; - for (int i = 0; i < nump; i++) { - sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2; - sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8; - } - return sides == 0xF; - } -}; - -// Given a current X position at the center Y position of a row, return the X -// position of the left and right intercepts of the row top and bottom. -template <typename E> -static ALWAYS_INLINE FloatRange x_intercepts(const E& e) { - float rad = 0.5f * abs(e.x_slope()); - return {e.cur_x() - rad, e.cur_x() + rad}; -} - -// Return the AA sub-span corresponding to a given edge. If AA is requested, -// then this finds the X intercepts with the row clipped into range of the -// edge and finally conservatively rounds them out. If there is no AA, then -// it just returns the current rounded X position clipped within bounds. -template <typename E> -static ALWAYS_INLINE IntRange aa_edge(const E& e, const FloatRange& bounds) { - return e.edgeMask ? bounds.clip(x_intercepts(e)).round_out() - : bounds.clip({e.cur_x(), e.cur_x()}).round(); -} - -// Calculate the initial AA coverage as an approximation of the distance from -// the center of the pixel in the direction of the edge slope. Given an edge -// (x,y)..(x+dx,y+dy), then the normalized tangent vector along the edge is -// (dx,dy)/sqrt(dx^2+dy^2). We know that for dy=1 then dx=e.x_slope. We rotate -// the tangent vector either -90 or +90 degrees to get the edge normal vector, -// where 'dx=-dy and 'dy=dx. Once normalized by 1/sqrt(dx^2+dy^2), scale into -// the range of 0..256 so that we can cheaply convert to a fixed-point scale -// factor. It is assumed that at exactly the pixel center the opacity is half -// (128) and linearly decreases along the normal vector at 1:1 scale with the -// slope. While not entirely accurate, this gives a reasonably agreeable looking -// approximation of AA. For edges on which there is no AA, just force the -// opacity to maximum (256) with no slope, relying on the span clipping to trim -// pixels outside the span. 
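The coverage approximation described above can be written out in scalar form (an illustrative sketch, not part of the patch; aa_coverage is an invented name, and the final clamp is only for illustration since the real code folds saturation into later blending). Here dir is -1 for the left edge and +1 for the right edge, edge_x is the edge's X intercept at the row center, and x is a pixel position along the row:

#include <cmath>

static float aa_coverage(float x, float edge_x, float x_slope, float dir) {
  // Edge normal scaled into the 0..256 coverage range.
  float dx = dir * 256.0f / sqrtf(1.0f + x_slope * x_slope);
  // 128 (half coverage) at the pixel center on the edge, falling off at 1:1
  // scale along the normal; written as a start value plus a per-pixel slope,
  // matching the start/slope pair that the span setup steps incrementally.
  float c = (128.0f + dx * (edge_x - 0.5f)) - dx * x;
  return c < 0.0f ? 0.0f : (c > 256.0f ? 256.0f : c);
}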
-template <typename E> -static ALWAYS_INLINE FloatRange aa_dist(const E& e, float dir) { - if (e.edgeMask) { - float dx = (dir * 256.0f) * inversesqrt(1.0f + e.x_slope() * e.x_slope()); - return {128.0f + dx * (e.cur_x() - 0.5f), -dx}; - } else { - return {256.0f, 0.0f}; - } -} - -template <typename P, typename E> -static ALWAYS_INLINE IntRange aa_span(P* buf, const E& left, const E& right, - const FloatRange& bounds) { - // If there is no AA, just return the span from the rounded left edge X - // position to the rounded right edge X position. Clip the span to be within - // the valid bounds. - if (!(swgl_ClipFlags & SWGL_CLIP_FLAG_AA)) { - return bounds.clip({left.cur_x(), right.cur_x()}).round(); - } - - // Calculate the left and right AA spans along with the coverage distances - // and slopes necessary to do blending. - IntRange leftAA = aa_edge(left, bounds); - FloatRange leftDist = aa_dist(left, -1.0f); - IntRange rightAA = aa_edge(right, bounds); - FloatRange rightDist = aa_dist(right, 1.0f); - - // Use the pointer into the destination buffer as a status indicator of the - // coverage offset. The pointer is calculated so that subtracting it with - // the current destination pointer will yield a negative value if the span - // is outside the opaque area and otherwise will yield a positive value - // above the opaque size. This pointer is stored as a uint8 pointer so that - // there are no hidden multiplication instructions and will just return a - // 1:1 linear memory address. Thus the size of the opaque region must also - // be scaled by the pixel size in bytes. - swgl_OpaqueStart = (const uint8_t*)(buf + leftAA.end); - swgl_OpaqueSize = max(rightAA.start - leftAA.end - 3, 0) * sizeof(P); - - // Offset the coverage distances by the end of the left AA span, which - // corresponds to the opaque start pointer, so that pixels become opaque - // immediately after. The distances are also offset for each lane in the - // chunk. - Float offset = cast(leftAA.end + (I32){0, 1, 2, 3}); - swgl_LeftAADist = leftDist.start + offset * leftDist.end; - swgl_RightAADist = rightDist.start + offset * rightDist.end; - swgl_AASlope = - (Float){leftDist.end, rightDist.end, 0.0f, 0.0f} / float(sizeof(P)); - - // Return the full span width from the start of the left span to the end of - // the right span. - return {leftAA.start, rightAA.end}; -} - -// Calculate the span the user clip distances occupy from the left and right -// edges at the current row. -template <typename E> -static ALWAYS_INLINE IntRange clip_distance_range(const E& left, - const E& right) { - Float leftClip = get_clip_distances(left.interp); - Float rightClip = get_clip_distances(right.interp); - // Get the change in clip dist per X step. - Float clipStep = (rightClip - leftClip) / (right.cur_x() - left.cur_x()); - // Find the zero intercepts starting from the left edge. - Float clipDist = left.cur_x() - leftClip * recip(clipStep); - // Find the distance to the start of the span for any clip distances that - // are increasing in value. If the clip distance is constant or decreasing - // in value, then check if it starts outside the clip volume. - Float start = if_then_else(clipStep > 0.0f, clipDist, - if_then_else(leftClip < 0.0f, 1.0e6f, 0.0f)); - // Find the distance to the end of the span for any clip distances that are - // decreasing in value. If the clip distance is constant or increasing in - // value, then check if it ends inside the clip volume. 
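The zero-intercept computation in this function is ordinary linear interpolation and may be easier to read in scalar form (an illustrative sketch, not part of the patch; clip_zero_crossing is an invented name, and the constant-distance case handled by the surrounding if_then_else selections is ignored here):

static float clip_zero_crossing(float left_x, float left_clip,
                                float right_x, float right_clip) {
  // The clip distance changes linearly across the row between the two edges.
  float step = (right_clip - left_clip) / (right_x - left_x);
  // Solve left_clip + step * (x - left_x) == 0 for x.
  return left_x - left_clip / step;
}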
- Float end = if_then_else(clipStep < 0.0f, clipDist, - if_then_else(rightClip >= 0.0f, 1.0e6f, 0.0f)); - // Find the furthest start offset. - start = max(start, start.zwxy); - // Find the closest end offset. - end = min(end, end.zwxy); - // Finally, round the offsets to an integer span that can be used to bound - // the current span. - return FloatRange{max(start.x, start.y), min(end.x, end.y)}.round(); -} - -// Converts a run array into a flattened array of depth samples. This just -// walks through every run and fills the samples with the depth value from -// the run. -static void flatten_depth_runs(DepthRun* runs, size_t width) { - if (runs->is_flat()) { - return; - } - while (width > 0) { - size_t n = runs->count; - fill_flat_depth(runs, n, runs->depth); - runs += n; - width -= n; - } -} - -// Helper function for drawing passed depth runs within the depth buffer. -// Flattened depth (perspective or discard) is not supported. -template <typename P> -static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf, - DepthCursor& cursor) { - for (;;) { - // Get the span that passes the depth test. Assume on entry that - // any failed runs have already been skipped. - int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask); - // If nothing passed, since we already skipped passed failed runs - // previously, we must have hit the end of the row. Bail out. - if (span <= 0) { - break; - } - if (span >= 4) { - // If we have a draw specialization, try to process as many 4-pixel - // chunks as possible using it. - if (fragment_shader->has_draw_span(buf)) { - int drawn = fragment_shader->draw_span(buf, span & ~3); - buf += drawn; - span -= drawn; - } - // Otherwise, just process each chunk individually. - while (span >= 4) { - fragment_shader->run(); - discard_output<false>(buf); - buf += 4; - span -= 4; - } - } - // If we have a partial chunk left over, we still have to process it as if - // it were a full chunk. Mask off only the part of the chunk we want to - // use. - if (span > 0) { - fragment_shader->run(); - discard_output<false>(buf, span); - buf += span; - } - // Skip past any runs that fail the depth test. - int skip = cursor.skip_failed(z, ctx->depthfunc); - // If there aren't any, that means we won't encounter any more passing runs - // and so it's safe to bail out. - if (skip <= 0) { - break; - } - // Advance interpolants for the fragment shader past the skipped region. - // If we processed a partial chunk above, we actually advanced the - // interpolants a full chunk in the fragment shader's run function. Thus, - // we need to first subtract off that 4-pixel chunk and only partially - // advance them to that partial chunk before we can add on the rest of the - // skips. This is combined with the skip here for efficiency's sake. - fragment_shader->skip(skip - (span > 0 ? 4 - span : 0)); - buf += skip; - } -} - -// Draw a simple span in 4-pixel wide chunks, optionally using depth. -template <bool DISCARD, bool W, typename P, typename Z> -static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) { - if (depth) { - // Depth testing is enabled. If perspective is used, Z values will vary - // across the span, we use packDepth to generate packed Z values suitable - // for depth testing based on current values from gl_FragCoord.z. - // Otherwise, for the no-perspective case, we just use the provided Z. - // Process 4-pixel chunks first. 
- for (; span >= 4; span -= 4, buf += 4, depth += 4) { - I32 zsrc = z(); - ZMask zmask; - if (check_depth<DISCARD>(zsrc, depth, zmask)) { - fragment_shader->run<W>(); - mask_output(buf, zmask); - if (DISCARD) discard_depth(zsrc, depth, zmask); - } else { - fragment_shader->skip<W>(); - } - } - // If there are any remaining pixels, do a partial chunk. - if (span > 0) { - I32 zsrc = z(); - ZMask zmask; - if (check_depth<DISCARD>(zsrc, depth, zmask, span)) { - fragment_shader->run<W>(); - mask_output(buf, zmask, span); - if (DISCARD) discard_depth(zsrc, depth, zmask); - } - } - } else { - // Process 4-pixel chunks first. - for (; span >= 4; span -= 4, buf += 4) { - fragment_shader->run<W>(); - discard_output<DISCARD>(buf); - } - // If there are any remaining pixels, do a partial chunk. - if (span > 0) { - fragment_shader->run<W>(); - discard_output<DISCARD>(buf, span); - } - } -} - -// Called during rasterization to forcefully clear a row on which delayed clear -// has been enabled. If we know that we are going to completely overwrite a part -// of the row, then we only need to clear the row outside of that part. However, -// if blending or discard is enabled, the values of that underlying part of the -// row may be used regardless to produce the final rasterization result, so we -// have to then clear the entire underlying row to prepare it. -template <typename P> -static inline void prepare_row(Texture& colortex, int y, int startx, int endx, - bool use_discard, DepthRun* depth, - uint32_t z = 0, DepthCursor* cursor = nullptr) { - assert(colortex.delay_clear > 0); - // Delayed clear is enabled for the color buffer. Check if needs clear. - uint32_t& mask = colortex.cleared_rows[y / 32]; - if ((mask & (1 << (y & 31))) == 0) { - mask |= 1 << (y & 31); - colortex.delay_clear--; - if (blend_key || use_discard) { - // If depth test, blending, or discard is used, old color values - // might be sampled, so we need to clear the entire row to fill it. - force_clear_row<P>(colortex, y); - } else if (depth) { - if (depth->is_flat() || !cursor) { - // If flat depth is used, we can't cheaply predict if which samples will - // pass. - force_clear_row<P>(colortex, y); - } else { - // Otherwise if depth runs are used, see how many samples initially pass - // the depth test and only fill the row outside those. The fragment - // shader will fill the row within the passed samples. - int passed = - DepthCursor(*cursor).check_passed<false>(z, ctx->depthfunc); - if (startx > 0 || startx + passed < colortex.width) { - force_clear_row<P>(colortex, y, startx, startx + passed); - } - } - } else if (startx > 0 || endx < colortex.width) { - // Otherwise, we only need to clear the row outside of the span. - // The fragment shader will fill the row within the span itself. - force_clear_row<P>(colortex, y, startx, endx); - } - } -} - -// Perpendicular dot-product is the dot-product of a vector with the -// perpendicular vector of the other, i.e. dot(a, {-b.y, b.x}) -template <typename T> -static ALWAYS_INLINE auto perpDot(T a, T b) { - return a.x * b.y - a.y * b.x; -} - -// Check if the winding of the initial edges is flipped, requiring us to swap -// the edges to avoid spans having negative lengths. Assume that l0.y == r0.y -// due to the initial edge scan in draw_quad/perspective_spans. 
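For a concrete feel of the check that follows, here is a standalone scalar version with example points (Pt, perp_dot and edges_flipped are names made up for this sketch):

struct Pt { float x, y; };

static float perp_dot(Pt a, Pt b) { return a.x * b.y - a.y * b.x; }

static bool edges_flipped(Pt l0, Pt l1, Pt r0, Pt r1) {
  return l0.x > r0.x ||
         (l0.x == r0.x && perp_dot({l1.x - l0.x, l1.y - l0.y},
                                   {r1.x - r0.x, r1.y - r0.y}) > 0.0f);
}

// Example: both edges start at (0,0); the "left" edge heads toward (2,4)
// while the "right" edge heads toward (-2,4), so the edges cross and spans
// would have negative length unless swapped:
//   perp_dot({2,4}, {-2,4}) = 2*4 - 4*(-2) = 16 > 0
//   edges_flipped({0,0}, {2,4}, {0,0}, {-2,4}) == true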
-template <typename T> -static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) { - // If the starting point of the left edge is to the right of the starting - // point of the right edge, then just assume the edges are flipped. If the - // left and right starting points are the same, then check the sign of the - // cross-product of the edges to see if the edges are flipped. Otherwise, - // if the left starting point is actually just to the left of the right - // starting point, then assume no edge flip. - return l0.x > r0.x || (l0.x == r0.x && perpDot(l1 - l0, r1 - r0) > 0.0f); -} - -// Draw spans for each row of a given quad (or triangle) with a constant Z -// value. The quad is assumed convex. It is clipped to fall within the given -// clip rect. In short, this function rasterizes a quad by first finding a -// top most starting point and then from there tracing down the left and right -// sides of this quad until it hits the bottom, outputting a span between the -// current left and right positions at each row along the way. Points are -// assumed to be ordered in either CW or CCW to support this, but currently -// both orders (CW and CCW) are supported and equivalent. -template <typename P> -static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z, - Interpolants interp_outs[4], - Texture& colortex, Texture& depthtex, - const ClipRect& clipRect) { - // Only triangles and convex quads supported. - assert(nump == 3 || nump == 4); - - Point2D l0, r0, l1, r1; - int l0i, r0i, l1i, r1i; - { - // Find the index of the top-most (smallest Y) point from which - // rasterization can start. - int top = nump > 3 && p[3].y < p[2].y - ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) - : (p[1].y < p[3].y ? 1 : 3)) - : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) - : (p[1].y < p[2].y ? 1 : 2)); - // Helper to find next index in the points array, walking forward. -#define NEXT_POINT(idx) \ - ({ \ - int cur = (idx) + 1; \ - cur < nump ? cur : 0; \ - }) - // Helper to find the previous index in the points array, walking backward. -#define PREV_POINT(idx) \ - ({ \ - int cur = (idx)-1; \ - cur >= 0 ? cur : nump - 1; \ - }) - // Start looking for "left"-side and "right"-side descending edges starting - // from the determined top point. - int next = NEXT_POINT(top); - int prev = PREV_POINT(top); - if (p[top].y == p[next].y) { - // If the next point is on the same row as the top, then advance one more - // time to the next point and use that as the "left" descending edge. - l0i = next; - l1i = NEXT_POINT(next); - // Assume top and prev form a descending "right" edge, as otherwise this - // will be a collapsed polygon and harmlessly bail out down below. - r0i = top; - r1i = prev; - } else if (p[top].y == p[prev].y) { - // If the prev point is on the same row as the top, then advance to the - // prev again and use that as the "right" descending edge. - // Assume top and next form a non-empty descending "left" edge. - l0i = top; - l1i = next; - r0i = prev; - r1i = PREV_POINT(prev); - } else { - // Both next and prev are on distinct rows from top, so both "left" and - // "right" edges are non-empty/descending. - l0i = r0i = top; - l1i = next; - r1i = prev; - } - // Load the points from the indices. 
- l0 = p[l0i]; // Start of left edge - r0 = p[r0i]; // End of left edge - l1 = p[l1i]; // Start of right edge - r1 = p[r1i]; // End of right edge - // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: - // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, - // r1.x, r1.y); - } - - struct Edge { - float yScale; - float xSlope; - float x; - Interpolants interpSlope; - Interpolants interp; - bool edgeMask; - - Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0, - const Interpolants& i1, int edgeIndex) - : // Inverse Y scale for slope calculations. Avoid divide on 0-length - // edge. Later checks below ensure that Y <= p1.y, or otherwise we - // don't use this edge. We just need to guard against Y == p1.y == - // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes - // below, except if yScale is Inf for some reason (or worse, NaN), - // which 1/(p1.y-p0.y) might produce if we don't bound it. - yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), - // Calculate dX/dY slope - xSlope((p1.x - p0.x) * yScale), - // Initialize current X based on Y and slope - x(p0.x + (y - p0.y) * xSlope), - // Calculate change in interpolants per change in Y - interpSlope((i1 - i0) * yScale), - // Initialize current interpolants based on Y and slope - interp(i0 + (y - p0.y) * interpSlope), - // Extract the edge mask status for this edge - edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {} - - void nextRow() { - // step current X and interpolants to next row from slope - x += xSlope; - interp += interpSlope; - } - - float cur_x() const { return x; } - float x_slope() const { return xSlope; } - }; - - // Vertex selection above should result in equal left and right start rows - assert(l0.y == r0.y); - // Find the start y, clip to within the clip rect, and round to row center. - // If AA is enabled, round out conservatively rather than round to nearest. - float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f; - float y = floor(max(l0.y, clipRect.y0) + aaRound) + 0.5f; - // Initialize left and right edges from end points and start Y - Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - // WR does not use backface culling, so check if edges are flipped. - bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1); - if (flipped) swap(left, right); - // Get pointer to color buffer and depth buffer at current Y - P* fbuf = (P*)colortex.sample_ptr(0, int(y)); - DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y)); - // Loop along advancing Ys, rasterizing spans at each row - float checkY = min(min(l1.y, r1.y), clipRect.y1); - // Ensure we don't rasterize out edge bounds - FloatRange clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - for (;;) { - // Check if we maybe passed edge ends or outside clip rect... - if (y > checkY) { - // If we're outside the clip rect, we're done. - if (y > clipRect.y1) break; - // Helper to find the next non-duplicate vertex that doesn't loop back. -#define STEP_EDGE(y, e0i, e0, e1i, e1, STEP_POINT, end) \ - do { \ - /* Set new start of edge to be end of old edge */ \ - e0i = e1i; \ - e0 = e1; \ - /* Set new end of edge to next point */ \ - e1i = STEP_POINT(e1i); \ - e1 = p[e1i]; \ - /* If the edge crossed the end, we're done. */ \ - if (e0i == end) return; \ - /* Otherwise, it doesn't advance, so keep searching. 
*/ \ - } while (y > e1.y) - // Check if Y advanced past the end of the left edge - if (y > l1.y) { - // Step to next left edge past Y and reset edge interpolants. - STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i); - (flipped ? right : left) = - Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - } - // Check if Y advanced past the end of the right edge - if (y > r1.y) { - // Step to next right edge past Y and reset edge interpolants. - STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i); - (flipped ? left : right) = - Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - } - // Reset the clip bounds for the new edges - clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - // Reset check condition for next time around. - checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1); - } - - // Calculate a potentially AA'd span and check if it is non-empty. - IntRange span = aa_span(fbuf, left, right, clipSpan); - if (span.len() > 0) { - // If user clip planes are enabled, use them to bound the current span. - if (vertex_shader->use_clip_distance()) { - span = span.intersect(clip_distance_range(left, right)); - if (span.len() <= 0) goto next_span; - } - ctx->shaded_rows++; - ctx->shaded_pixels += span.len(); - // Advance color/depth buffer pointers to the start of the span. - P* buf = fbuf + span.start; - // Check if we will need to use depth-buffer or discard on this span. - DepthRun* depth = - depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr; - DepthCursor cursor; - bool use_discard = fragment_shader->use_discard(); - if (use_discard) { - if (depth) { - // If we're using discard, we may have to unpredictably drop out some - // samples. Flatten the depth run array here to allow this. - if (!depth->is_flat()) { - flatten_depth_runs(depth, depthtex.width); - } - // Advance to the depth sample at the start of the span. - depth += span.start; - } - } else if (depth) { - if (!depth->is_flat()) { - // We're not using discard and the depth row is still organized into - // runs. Skip past any runs that would fail the depth test so we - // don't have to do any extra work to process them with the rest of - // the span. - cursor = DepthCursor(depth, depthtex.width, span.start, span.len()); - int skipped = cursor.skip_failed(z, ctx->depthfunc); - // If we fell off the row, that means we couldn't find any passing - // runs. We can just skip the entire span. - if (skipped < 0) { - goto next_span; - } - buf += skipped; - span.start += skipped; - } else { - // The row is already flattened, so just advance to the span start. - depth += span.start; - } - } - - if (colortex.delay_clear) { - // Delayed clear is enabled for the color buffer. Check if needs clear. - prepare_row<P>(colortex, int(y), span.start, span.end, use_discard, - depth, z, &cursor); - } - - // Initialize fragment shader interpolants to current span position. - fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1); - fragment_shader->gl_FragCoord.y = y; - { - // Change in interpolants is difference between current right and left - // edges per the change in right and left X. - Interpolants step = - (right.interp - left.interp) * (1.0f / (right.x - left.x)); - // Advance current interpolants to X at start of span. - Interpolants o = left.interp + step * (span.start + 0.5f - left.x); - fragment_shader->init_span(&o, &step); - } - clipRect.set_clip_mask(span.start, y, buf); - if (!use_discard) { - // Fast paths for the case where fragment discard is not used. 
- if (depth) { - // If depth is used, we want to process entire depth runs if depth is - // not flattened. - if (!depth->is_flat()) { - draw_depth_span(z, buf, cursor); - goto next_span; - } - // Otherwise, flattened depth must fall back to the slightly slower - // per-chunk depth test path in draw_span below. - } else { - // Check if the fragment shader has an optimized draw specialization. - if (span.len() >= 4 && fragment_shader->has_draw_span(buf)) { - // Draw specialization expects 4-pixel chunks. - int drawn = fragment_shader->draw_span(buf, span.len() & ~3); - buf += drawn; - span.start += drawn; - } - } - draw_span<false, false>(buf, depth, span.len(), [=] { return z; }); - } else { - // If discard is used, then use slower fallbacks. This should be rare. - // Just needs to work, doesn't need to be too fast yet... - draw_span<true, false>(buf, depth, span.len(), [=] { return z; }); - } - } - next_span: - // Advance Y and edge interpolants to next row. - y++; - left.nextRow(); - right.nextRow(); - // Advance buffers to next row. - fbuf += colortex.stride() / sizeof(P); - fdepth += depthtex.stride() / sizeof(DepthRun); - } -} - -// Draw perspective-correct spans for a convex quad that has been clipped to -// the near and far Z planes, possibly producing a clipped convex polygon with -// more than 4 sides. This assumes the Z value will vary across the spans and -// requires interpolants to factor in W values. This tends to be slower than -// the simpler 2D draw_quad_spans above, especially since we can't optimize the -// depth test easily when Z values, and should be used only rarely if possible. -template <typename P> -static inline void draw_perspective_spans(int nump, Point3D* p, - Interpolants* interp_outs, - Texture& colortex, Texture& depthtex, - const ClipRect& clipRect) { - Point3D l0, r0, l1, r1; - int l0i, r0i, l1i, r1i; - { - // Find the index of the top-most point (smallest Y) from which - // rasterization can start. - int top = 0; - for (int i = 1; i < nump; i++) { - if (p[i].y < p[top].y) { - top = i; - } - } - // Find left-most top point, the start of the left descending edge. - // Advance forward in the points array, searching at most nump points - // in case the polygon is flat. - l0i = top; - for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) { - l0i = i; - } - if (l0i == nump - 1) { - for (int i = 0; i <= top && p[i].y == p[top].y; i++) { - l0i = i; - } - } - // Find right-most top point, the start of the right descending edge. - // Advance backward in the points array, searching at most nump points. - r0i = top; - for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) { - r0i = i; - } - if (r0i == 0) { - for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) { - r0i = i; - } - } - // End of left edge is next point after left edge start. - l1i = NEXT_POINT(l0i); - // End of right edge is prev point after right edge start. - r1i = PREV_POINT(r0i); - l0 = p[l0i]; // Start of left edge - r0 = p[r0i]; // End of left edge - l1 = p[l1i]; // Start of right edge - r1 = p[r1i]; // End of right edge - } - - struct Edge { - float yScale; - // Current coordinates for edge. Where in the 2D case of draw_quad_spans, - // it is enough to just track the X coordinate as we advance along the rows, - // for the perspective case we also need to keep track of Z and W. For - // simplicity, we just use the full 3D point to track all these coordinates. 
- Point3D pSlope; - Point3D p; - Interpolants interpSlope; - Interpolants interp; - bool edgeMask; - - Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0, - const Interpolants& i1, int edgeIndex) - : // Inverse Y scale for slope calculations. Avoid divide on 0-length - // edge. - yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), - // Calculate dX/dY slope - pSlope((p1 - p0) * yScale), - // Initialize current coords based on Y and slope - p(p0 + (y - p0.y) * pSlope), - // Crucially, these interpolants must be scaled by the point's 1/w - // value, which allows linear interpolation in a perspective-correct - // manner. This will be canceled out inside the fragment shader later. - // Calculate change in interpolants per change in Y - interpSlope((i1 * p1.w - i0 * p0.w) * yScale), - // Initialize current interpolants based on Y and slope - interp(i0 * p0.w + (y - p0.y) * interpSlope), - // Extract the edge mask status for this edge - edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {} - - float x() const { return p.x; } - vec2_scalar zw() const { return {p.z, p.w}; } - - void nextRow() { - // step current coords and interpolants to next row from slope - p += pSlope; - interp += interpSlope; - } - - float cur_x() const { return p.x; } - float x_slope() const { return pSlope.x; } - }; - - // Vertex selection above should result in equal left and right start rows - assert(l0.y == r0.y); - // Find the start y, clip to within the clip rect, and round to row center. - // If AA is enabled, round out conservatively rather than round to nearest. - float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f; - float y = floor(max(l0.y, clipRect.y0) + aaRound) + 0.5f; - // Initialize left and right edges from end points and start Y - Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - // WR does not use backface culling, so check if edges are flipped. - bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1); - if (flipped) swap(left, right); - // Get pointer to color buffer and depth buffer at current Y - P* fbuf = (P*)colortex.sample_ptr(0, int(y)); - DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y)); - // Loop along advancing Ys, rasterizing spans at each row - float checkY = min(min(l1.y, r1.y), clipRect.y1); - // Ensure we don't rasterize out edge bounds - FloatRange clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - for (;;) { - // Check if we maybe passed edge ends or outside clip rect... - if (y > checkY) { - // If we're outside the clip rect, we're done. - if (y > clipRect.y1) break; - // Check if Y advanced past the end of the left edge - if (y > l1.y) { - // Step to next left edge past Y and reset edge interpolants. - STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i); - (flipped ? right : left) = - Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - } - // Check if Y advanced past the end of the right edge - if (y > r1.y) { - // Step to next right edge past Y and reset edge interpolants. - STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i); - (flipped ? left : right) = - Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - } - // Reset the clip bounds for the new edges - clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - // Reset check condition for next time around. - checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1); - } - - // Calculate a potentially AA'd span and check if it is non-empty. 
- IntRange span = aa_span(fbuf, left, right, clipSpan); - if (span.len() > 0) { - // If user clip planes are enabled, use them to bound the current span. - if (vertex_shader->use_clip_distance()) { - span = span.intersect(clip_distance_range(left, right)); - if (span.len() <= 0) goto next_span; - } - ctx->shaded_rows++; - ctx->shaded_pixels += span.len(); - // Advance color/depth buffer pointers to the start of the span. - P* buf = fbuf + span.start; - // Check if the we will need to use depth-buffer or discard on this span. - DepthRun* depth = - depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr; - bool use_discard = fragment_shader->use_discard(); - if (depth) { - // Perspective may cause the depth value to vary on a per sample basis. - // Ensure the depth row is flattened to allow testing of individual - // samples - if (!depth->is_flat()) { - flatten_depth_runs(depth, depthtex.width); - } - // Advance to the depth sample at the start of the span. - depth += span.start; - } - if (colortex.delay_clear) { - // Delayed clear is enabled for the color buffer. Check if needs clear. - prepare_row<P>(colortex, int(y), span.start, span.end, use_discard, - depth); - } - // Initialize fragment shader interpolants to current span position. - fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1); - fragment_shader->gl_FragCoord.y = y; - { - // Calculate the fragment Z and W change per change in fragment X step. - vec2_scalar stepZW = - (right.zw() - left.zw()) * (1.0f / (right.x() - left.x())); - // Calculate initial Z and W values for span start. - vec2_scalar zw = left.zw() + stepZW * (span.start + 0.5f - left.x()); - // Set fragment shader's Z and W values so that it can use them to - // cancel out the 1/w baked into the interpolants. - fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x); - fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y); - fragment_shader->swgl_StepZW = stepZW; - // Change in interpolants is difference between current right and left - // edges per the change in right and left X. The left and right - // interpolant values were previously multipled by 1/w, so the step and - // initial span values take this into account. - Interpolants step = - (right.interp - left.interp) * (1.0f / (right.x() - left.x())); - // Advance current interpolants to X at start of span. - Interpolants o = left.interp + step * (span.start + 0.5f - left.x()); - fragment_shader->init_span<true>(&o, &step); - } - clipRect.set_clip_mask(span.start, y, buf); - if (!use_discard) { - // No discard is used. Common case. - draw_span<false, true>(buf, depth, span.len(), packDepth); - } else { - // Discard is used. Rare. - draw_span<true, true>(buf, depth, span.len(), packDepth); - } - } - next_span: - // Advance Y and edge interpolants to next row. - y++; - left.nextRow(); - right.nextRow(); - // Advance buffers to next row. - fbuf += colortex.stride() / sizeof(P); - fdepth += depthtex.stride() / sizeof(DepthRun); - } -} - -// Clip a primitive against both sides of a view-frustum axis, producing -// intermediate vertexes with interpolated attributes that will no longer -// intersect the selected axis planes. This assumes the primitive is convex -// and should produce at most N+2 vertexes for each invocation (only in the -// worst case where one point falls outside on each of the opposite sides -// with the rest of the points inside). The supplied AA edge mask will be -// modified such that it corresponds to the clipped polygon edges. 
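As a simplified reference for the structure of this clipping pass, here is a minimal Sutherland-Hodgman clip against just the single plane x <= w; the types and names are made up for the sketch, and the real clip_side below additionally handles the -w side, negative w, attribute interpolation, and the AA edge mask:

struct Vtx { float x, y, z, w; };

// Clips a convex polygon against x <= w, emitting an interpolated vertex
// wherever an edge crosses the plane. For a single plane the caller must
// provide room for up to nump + 1 output vertices.
static int clip_x_le_w(int nump, const Vtx* in, Vtx* out) {
  int numClip = 0;
  Vtx prev = in[nump - 1];
  bool prevInside = prev.x <= prev.w;
  for (int i = 0; i < nump; i++) {
    Vtx cur = in[i];
    bool curInside = cur.x <= cur.w;
    if (curInside != prevInside) {
      // Edge crosses the plane: solve (prev + k*(cur - prev)).x == (...).w.
      float prevDist = prev.x - prev.w;
      float curDist = cur.x - cur.w;
      float k = prevDist / (prevDist - curDist);
      out[numClip++] = {prev.x + k * (cur.x - prev.x),
                        prev.y + k * (cur.y - prev.y),
                        prev.z + k * (cur.z - prev.z),
                        prev.w + k * (cur.w - prev.w)};
    }
    if (curInside) {
      out[numClip++] = cur;
    }
    prev = cur;
    prevInside = curInside;
  }
  return numClip;
}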
-template <XYZW AXIS> -static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP, - Interpolants* outInterp, int& outEdgeMask) { - // Potential mask bits of which side of a plane a coordinate falls on. - enum SIDE { POSITIVE = 1, NEGATIVE = 2 }; - int numClip = 0; - int edgeMask = outEdgeMask; - Point3D prev = p[nump - 1]; - Interpolants prevInterp = interp[nump - 1]; - float prevCoord = prev.select(AXIS); - // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and - // if so, remember which side it is outside of. In the special case that W is - // negative and |C| < |W|, both -W <= C and C <= W will be false, such that - // we must consider the coordinate as falling outside of both plane sides - // simultaneously. We test each condition separately and combine them to form - // a mask of which plane sides we exceeded. If we neglect to consider both - // sides simultaneously, points can erroneously oscillate from one plane side - // to the other and exceed the supported maximum number of clip outputs. - int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) | - (prevCoord > prev.w ? POSITIVE : 0); - // Loop through points, finding edges that cross the planes by evaluating - // the side at each point. - outEdgeMask = 0; - for (int i = 0; i < nump; i++, edgeMask >>= 1) { - Point3D cur = p[i]; - Interpolants curInterp = interp[i]; - float curCoord = cur.select(AXIS); - int curMask = - (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0); - // Check if the previous and current end points are on different sides. If - // the masks of sides intersect, then we consider them to be on the same - // side. So in the case the masks do not intersect, we then consider them - // to fall on different sides. - if (!(curMask & prevMask)) { - // One of the edge's end points is outside the plane with the other - // inside the plane. Find the offset where it crosses the plane and - // adjust the point and interpolants to there. - if (prevMask) { - // Edge that was previously outside crosses inside. - // Evaluate plane equation for previous and current end-point - // based on previous side and calculate relative offset. - if (numClip >= nump + 2) { - // If for some reason we produced more vertexes than we support, just - // bail out. - assert(false); - return 0; - } - // The positive plane is assigned the sign 1, and the negative plane is - // assigned -1. If the point falls outside both planes, that means W is - // negative. To compensate for this, we must interpolate the coordinate - // till W=0, at which point we can choose a single plane side for the - // coordinate to fall on since W will no longer be negative. To compute - // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and - // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be - // the side of the plane we need to consider. Substituting K into the - // comparison C < 0, we can then avoid the division in K with a - // cross-multiplication. - float prevSide = - (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) || - prevCoord * (cur.w - prev.w) < - prev.w * (curCoord - prevCoord)) - ? -1 - : 1; - float prevDist = prevCoord - prevSide * prev.w; - float curDist = curCoord - prevSide * cur.w; - // It may happen that after we interpolate by the weight k that due to - // floating point rounding we've underestimated the value necessary to - // push it over the clipping boundary. 
Just in case, nudge the mantissa - // by a single increment so that we essentially round it up and move it - // further inside the clipping boundary. We use nextafter to do this in - // a portable fashion. - float k = prevDist / (prevDist - curDist); - Point3D clipped = prev + (cur - prev) * k; - if (prevSide * clipped.select(AXIS) > clipped.w) { - k = nextafterf(k, 1.0f); - clipped = prev + (cur - prev) * k; - } - outP[numClip] = clipped; - outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; - // Don't output the current edge mask since start point was outside. - numClip++; - } - if (curMask) { - // Edge that was previously inside crosses outside. - // Evaluate plane equation for previous and current end-point - // based on current side and calculate relative offset. - if (numClip >= nump + 2) { - assert(false); - return 0; - } - // In the case the coordinate falls on both plane sides, the computation - // here is much the same as for prevSide, but since we are going from a - // previous W that is positive to current W that is negative, then the - // sign of cur.w - prev.w will flip in the equation. The resulting sign - // is negated to compensate for this. - float curSide = - (curMask & POSITIVE) && (!(curMask & NEGATIVE) || - prevCoord * (cur.w - prev.w) < - prev.w * (curCoord - prevCoord)) - ? 1 - : -1; - float prevDist = prevCoord - curSide * prev.w; - float curDist = curCoord - curSide * cur.w; - // Calculate interpolation weight k and the nudge it inside clipping - // boundary with nextafter. Note that since we were previously inside - // and now crossing outside, we have to flip the nudge direction for - // the weight towards 0 instead of 1. - float k = prevDist / (prevDist - curDist); - Point3D clipped = prev + (cur - prev) * k; - if (curSide * clipped.select(AXIS) > clipped.w) { - k = nextafterf(k, 0.0f); - clipped = prev + (cur - prev) * k; - } - outP[numClip] = clipped; - outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; - // Output the current edge mask since the end point is inside. - outEdgeMask |= (edgeMask & 1) << numClip; - numClip++; - } - } - if (!curMask) { - // The current end point is inside the plane, so output point unmodified. - if (numClip >= nump + 2) { - assert(false); - return 0; - } - outP[numClip] = cur; - outInterp[numClip] = curInterp; - // Output the current edge mask since the end point is inside. - outEdgeMask |= (edgeMask & 1) << numClip; - numClip++; - } - prev = cur; - prevInterp = curInterp; - prevCoord = curCoord; - prevMask = curMask; - } - return numClip; -} - -// Helper function to dispatch to perspective span drawing with points that -// have already been transformed and clipped. -static inline void draw_perspective_clipped(int nump, Point3D* p_clip, - Interpolants* interp_clip, - Texture& colortex, - Texture& depthtex) { - // If polygon is ouside clip rect, nothing to draw. - ClipRect clipRect(colortex); - if (!clipRect.overlaps(nump, p_clip)) { - return; - } - - // Finally draw perspective-correct spans for the polygon. - if (colortex.internal_format == GL_RGBA8) { - draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex, - depthtex, clipRect); - } else if (colortex.internal_format == GL_R8) { - draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex, - depthtex, clipRect); - } else { - assert(false); - } -} - -// Draws a perspective-correct 3D primitive with varying Z value, as opposed -// to a simple 2D planar primitive with a constant Z value that could be -// trivially Z rejected. 
This requires clipping the primitive against the near -// and far planes to ensure it stays within the valid Z-buffer range. The Z -// and W of each fragment of the primitives are interpolated across the -// generated spans and then depth-tested as appropriate. -// Additionally, vertex attributes must be interpolated with perspective- -// correction by dividing by W before interpolation, and then later multiplied -// by W again to produce the final correct attribute value for each fragment. -// This process is expensive and should be avoided if possible for primitive -// batches that are known ahead of time to not need perspective-correction. -static void draw_perspective(int nump, Interpolants interp_outs[4], - Texture& colortex, Texture& depthtex) { - // Lines are not supported with perspective. - assert(nump >= 3); - // Convert output of vertex shader to screen space. - vec4 pos = vertex_shader->gl_Position; - vec3_scalar scale = - vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; - vec3_scalar offset = - make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) + - scale; - // Verify if point is between near and far planes, rejecting NaN. - if (test_all(pos.z > -pos.w && pos.z < pos.w)) { - // No points cross the near or far planes, so no clipping required. - // Just divide coords by W and convert to viewport. We assume the W - // coordinate is non-zero and the reciprocal is finite since it would - // otherwise fail the test_none condition. - Float w = 1.0f / pos.w; - vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; - Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x}, - {screen.x.y, screen.y.y, screen.z.y, w.y}, - {screen.x.z, screen.y.z, screen.z.z, w.z}, - {screen.x.w, screen.y.w, screen.z.w, w.w}}; - draw_perspective_clipped(nump, p, interp_outs, colortex, depthtex); - } else { - // Points cross the near or far planes, so we need to clip. - // Start with the original 3 or 4 points... - Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x}, - {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, - {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, - {pos.x.w, pos.y.w, pos.z.w, pos.w.w}}; - // Clipping can expand the points by 1 for each of 6 view frustum planes. - Point3D p_clip[4 + 6]; - Interpolants interp_clip[4 + 6]; - // Clip against near and far Z planes. - nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip, - swgl_AAEdgeMask); - // If no points are left inside the view frustum, there's nothing to draw. - if (nump < 3) { - return; - } - // After clipping against only the near and far planes, we might still - // produce points where W = 0, exactly at the camera plane. OpenGL specifies - // that for clip coordinates, points must satisfy: - // -W <= X <= W - // -W <= Y <= W - // -W <= Z <= W - // When Z = W = 0, this is trivially satisfied, but when we transform and - // divide by W below it will produce a divide by 0. Usually we want to only - // clip Z to avoid the extra work of clipping X and Y. We can still project - // points that fall outside the view frustum X and Y so long as Z is valid. - // The span drawing code will then ensure X and Y are clamped to viewport - // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, - // will push W further inside the view frustum so that it is no longer 0, - // allowing us to finally proceed to projecting the points to the screen. - for (int i = 0; i < nump; i++) { - // Found an invalid W, so need to clip against X and Y... - if (p_clip[i].w <= 0.0f) { - // Ping-pong p_clip -> p_tmp -> p_clip. 
- Point3D p_tmp[4 + 6]; - Interpolants interp_tmp[4 + 6]; - nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp, - swgl_AAEdgeMask); - if (nump < 3) return; - nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip, - swgl_AAEdgeMask); - if (nump < 3) return; - // After clipping against X and Y planes, there's still points left - // to draw, so proceed to trying projection now... - break; - } - } - // Divide coords by W and convert to viewport. - for (int i = 0; i < nump; i++) { - float w = 1.0f / p_clip[i].w; - // If the W coord is essentially zero, small enough that division would - // result in Inf/NaN, then just set the reciprocal itself to zero so that - // the coordinates becomes zeroed out, as the only valid point that - // satisfies -W <= X/Y/Z <= W is all zeroes. - if (!isfinite(w)) w = 0.0f; - p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w); - } - draw_perspective_clipped(nump, p_clip, interp_clip, colortex, depthtex); - } -} - -static void draw_quad(int nump, Texture& colortex, Texture& depthtex) { - // Run vertex shader once for the primitive's vertices. - // Reserve space for 6 sets of interpolants, in case we need to clip against - // near and far planes in the perspective case. - Interpolants interp_outs[4]; - swgl_ClipFlags = 0; - vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants)); - vec4 pos = vertex_shader->gl_Position; - // Check if any vertex W is different from another. If so, use perspective. - if (test_any(pos.w != pos.w.x)) { - draw_perspective(nump, interp_outs, colortex, depthtex); - return; - } - - // Convert output of vertex shader to screen space. - // Divide coords by W and convert to viewport. - float w = 1.0f / pos.w.x; - // If the W coord is essentially zero, small enough that division would - // result in Inf/NaN, then just set the reciprocal itself to zero so that - // the coordinates becomes zeroed out, as the only valid point that - // satisfies -W <= X/Y/Z <= W is all zeroes. - if (!isfinite(w)) w = 0.0f; - vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f * - vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) + - make_vec2(ctx->viewport.origin() - colortex.offset); - Point2D p[4] = {{screen.x.x, screen.y.x}, - {screen.x.y, screen.y.y}, - {screen.x.z, screen.y.z}, - {screen.x.w, screen.y.w}}; - - // If quad is ouside clip rect, nothing to draw. - ClipRect clipRect(colortex); - if (!clipRect.overlaps(nump, p)) { - return; - } - - // Since the quad is assumed 2D, Z is constant across the quad. - float screenZ = (pos.z.x * w + 1) * 0.5f; - if (screenZ < 0 || screenZ > 1) { - // Z values would cross the near or far plane, so just bail. - return; - } - // Since Z doesn't need to be interpolated, just set the fragment shader's - // Z and W values here, once and for all fragment shader invocations. - uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ); - fragment_shader->gl_FragCoord.z = screenZ; - fragment_shader->gl_FragCoord.w = w; - - // If supplied a line, adjust it so that it is a quad at least 1 pixel thick. - // Assume that for a line that all 4 SIMD lanes were actually filled with - // vertexes 0, 1, 1, 0. - if (nump == 2) { - // Nudge Y height to span at least 1 pixel by advancing to next pixel - // boundary so that we step at least 1 row when drawing spans. - if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) { - p[2].y = 1 + int(p[1].y + 0.5f); - p[3].y = p[2].y; - // Nudge X width to span at least 1 pixel so that rounded coords fall on - // separate pixels. 
- if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) { - p[1].x += 1.0f; - p[2].x += 1.0f; - } - } else { - // If the line already spans at least 1 row, then assume line is vertical - // or diagonal and just needs to be dilated horizontally. - p[2].x += 1.0f; - p[3].x += 1.0f; - } - // Pretend that it's a quad now... - nump = 4; - } - - // Finally draw 2D spans for the quad. Currently only supports drawing to - // RGBA8 and R8 color buffers. - if (colortex.internal_format == GL_RGBA8) { - draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, depthtex, - clipRect); - } else if (colortex.internal_format == GL_R8) { - draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, depthtex, - clipRect); - } else { - assert(false); - } -} - -template <typename INDEX> -static inline void draw_elements(GLsizei count, GLsizei instancecount, - size_t offset, VertexArray& v, - Texture& colortex, Texture& depthtex) { - Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding]; - if (!indices_buf.buf || offset >= indices_buf.size) { - return; - } - assert((offset & (sizeof(INDEX) - 1)) == 0); - INDEX* indices = (INDEX*)(indices_buf.buf + offset); - count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX))); - // Triangles must be indexed at offsets 0, 1, 2. - // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. - if (count == 6 && indices[1] == indices[0] + 1 && - indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) { - assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1); - // Fast path - since there is only a single quad, we only load per-vertex - // attribs once for all instances, as they won't change across instances - // or within an instance. - vertex_shader->load_attribs(v.attribs, indices[0], 0, 4); - draw_quad(4, colortex, depthtex); - for (GLsizei instance = 1; instance < instancecount; instance++) { - vertex_shader->load_attribs(v.attribs, indices[0], instance, 0); - draw_quad(4, colortex, depthtex); - } - } else { - for (GLsizei instance = 0; instance < instancecount; instance++) { - for (GLsizei i = 0; i + 3 <= count; i += 3) { - if (indices[i + 1] != indices[i] + 1 || - indices[i + 2] != indices[i] + 2) { - continue; - } - if (i + 6 <= count && indices[i + 5] == indices[i] + 3) { - assert(indices[i + 3] == indices[i] + 2 && - indices[i + 4] == indices[i] + 1); - vertex_shader->load_attribs(v.attribs, indices[i], instance, 4); - draw_quad(4, colortex, depthtex); - i += 3; - } else { - vertex_shader->load_attribs(v.attribs, indices[i], instance, 3); - draw_quad(3, colortex, depthtex); - } - } - } - } -} diff --git a/third_party/webrender/swgl/src/swgl_ext.h b/third_party/webrender/swgl/src/swgl_ext.h deleted file mode 100644 index 52d240e0818..00000000000 --- a/third_party/webrender/swgl/src/swgl_ext.h +++ /dev/null @@ -1,1826 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// When using a solid color with clip masking, the cost of loading the clip mask -// in the blend stage exceeds the cost of processing the color. Here we handle -// the entire span of clip mask texture before the blend stage to more -// efficiently process it and modulate it with color without incurring blend -// stage overheads. 
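A scalar sketch of the modulation step this describes, assuming an 8-bit clip mask weighting a single-channel solid color (names are illustrative; the real routine below works on 4-pixel chunks and still feeds the masked color through the normal blend_span/commit_span path):

#include <stdint.h>

// Weight a solid R8 color by per-pixel mask coverage (0..255 -> 0..1).
static void masked_solid_span_r8(uint8_t* buf, const uint8_t* mask,
                                 uint8_t color, int len) {
  for (int i = 0; i < len; i++) {
    buf[i] = (uint8_t)((color * mask[i] + 127) / 255);
  }
}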
-template <typename P, typename C> -static void commit_masked_solid_span(P* buf, C color, int len) { - override_clip_mask(); - uint8_t* mask = get_clip_mask(buf); - for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) { - commit_span( - buf, - blend_span( - buf, - applyColor(expand_mask(buf, unpack(unaligned_load<PackedR8>(mask))), - color))); - } - restore_clip_mask(); -} - -// When using a solid color with anti-aliasing, most of the solid span will not -// benefit from anti-aliasing in the opaque region. We only want to apply the AA -// blend stage in the non-opaque start and end of the span where AA is needed. -template <typename P, typename R> -static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) { - if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) { - commit_solid_span<true>(buf, r, start); - buf += start; - len -= start; - } - if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) { - override_aa(); - commit_solid_span<true>(buf, r, opaque); - restore_aa(); - buf += opaque; - len -= opaque; - } - if (len > 0) { - commit_solid_span<true>(buf, r, len); - } -} - -// Forces a value with vector run-class to have scalar run-class. -template <typename T> -static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) { - return force_scalar(v); -} - -// Advance all varying inperpolants by a single chunk -#define swgl_stepInterp() step_interp_inputs() - -// Pseudo-intrinsic that accesses the interpolation step for a given varying -#define swgl_interpStep(v) (interp_step.v) - -// Commit an entire span of a solid color. This dispatches to clip-masked and -// anti-aliased fast-paths as appropriate. -#define swgl_commitSolid(format, v, n) \ - do { \ - int len = (n); \ - if (blend_key) { \ - if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \ - commit_masked_solid_span(swgl_Out##format, \ - packColor(swgl_Out##format, (v)), len); \ - } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \ - commit_aa_solid_span(swgl_Out##format, \ - pack_span(swgl_Out##format, (v)), len); \ - } else { \ - commit_solid_span<true>(swgl_Out##format, \ - pack_span(swgl_Out##format, (v)), len); \ - } \ - } else { \ - commit_solid_span<false>(swgl_Out##format, \ - pack_span(swgl_Out##format, (v)), len); \ - } \ - swgl_Out##format += len; \ - swgl_SpanLength -= len; \ - } while (0) -#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength) -#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength) -#define swgl_commitPartialSolidRGBA8(len, v) \ - swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength)) -#define swgl_commitPartialSolidR8(len, v) \ - swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength)) - -#define swgl_commitChunk(format, chunk) \ - do { \ - auto r = chunk; \ - if (blend_key) r = blend_span(swgl_Out##format, r); \ - commit_span(swgl_Out##format, r); \ - swgl_Out##format += swgl_StepSize; \ - swgl_SpanLength -= swgl_StepSize; \ - } while (0) - -// Commit a single chunk of a color -#define swgl_commitColor(format, color) \ - swgl_commitChunk(format, pack_pixels_##format(color)) -#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color) -#define swgl_commitColorR8(color) swgl_commitColor(R8, color) - -template <typename S> -static ALWAYS_INLINE bool swgl_isTextureLinear(S s) { - return s->filter == TextureFilter::LINEAR; -} - -template <typename S> -static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) { - return s->format == TextureFormat::RGBA8; -} - -template <typename S> -static ALWAYS_INLINE bool swgl_isTextureR8(S 
s) { - return s->format == TextureFormat::R8; -} - -// Use the default linear quantization scale of 128. This gives 7 bits of -// fractional precision, which when multiplied with a signed 9 bit value -// still fits in a 16 bit integer. -const int swgl_LinearQuantizeScale = 128; - -// Quantizes UVs for access into a linear texture. -template <typename S, typename T> -static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) { - return linearQuantize(p, swgl_LinearQuantizeScale, s); -} - -// Quantizes an interpolation step for UVs for access into a linear texture. -template <typename S, typename T> -static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) { - return samplerScale(s, p) * swgl_LinearQuantizeScale; -} - -template <typename S> -static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf, - S sampler, ivec2 i) { - return textureLinearUnpackedRGBA8(sampler, i); -} - -template <typename S> -static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf, - S sampler, ivec2 i) { - return textureLinearUnpackedR8(sampler, i); -} - -template <typename S> -static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) { - return swgl_isTextureRGBA8(s); -} - -template <typename S> -static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) { - return swgl_isTextureR8(s); -} - -// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets -// for linear sampling. -#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \ - uv = swgl_linearQuantize(sampler, uv); \ - vec2_scalar uv_step = \ - float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \ - vec2_scalar min_uv = max( \ - swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \ - vec2_scalar max_uv = \ - max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \ - min_uv); - -// Implements the fallback linear filter that can deal with clamping and -// arbitrary scales. -template <bool BLEND, typename S, typename C, typename P> -static P* blendTextureLinearFallback(S sampler, vec2 uv, int span, - vec2_scalar uv_step, vec2_scalar min_uv, - vec2_scalar max_uv, C color, P* buf) { - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - commit_blend_span<BLEND>( - buf, applyColor(textureLinearUnpacked(buf, sampler, - ivec2(clamp(uv, min_uv, max_uv))), - color)); - } - return buf; -} - -static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) { - return bit_cast<U64>(r); -} -static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) { - return bit_cast<U16>(r); -} - -static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) { - return r * fracx.xxxxyyyyzzzzwwww; -} -static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) { - return r * fracx; -} - -// Implements a faster linear filter that works with axis-aligned constant Y but -// scales less than 1, i.e. upscaling. In this case we can optimize for the -// constant Y fraction as well as load all chunks from memory in a single tap -// for each row. 
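The fixed-point bilinear step shared by these row filters can be sketched in scalar form as follows, assuming the 7-bit fractions implied by swgl_LinearQuantizeScale = 128 (lerp7 and sample_bilinear_r8 are illustrative names, not SWGL functions):

#include <stdint.h>

static inline int16_t lerp7(int16_t a, int16_t b, int16_t frac) {
  // frac is in [0, 128); 0 returns a, 127 lands just shy of b.
  return a + (((b - a) * frac) >> 7);
}

// One destination sample at integer texel x with fractions (fracx, fracy):
// vertically blend the two rows first (fracy is constant across the row),
// then horizontally blend the two neighboring columns.
static inline int16_t sample_bilinear_r8(const uint8_t* row0,
                                         const uint8_t* row1, int x,
                                         int16_t fracx, int16_t fracy) {
  int16_t c0 = lerp7(row0[x], row1[x], fracy);
  int16_t c1 = lerp7(row0[x + 1], row1[x + 1], fracy);
  return lerp7(c0, c1, fracx);
}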
-template <bool BLEND, typename S, typename C, typename P> -static void blendTextureLinearUpscale(S sampler, vec2 uv, int span, - vec2_scalar uv_step, vec2_scalar min_uv, - vec2_scalar max_uv, C color, P* buf) { - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type; - - ivec2 i(clamp(uv, min_uv, max_uv)); - ivec2 frac = i; - i >>= 7; - P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x)); - P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x)); - I16 fracx = computeFracX(sampler, i, frac); - int16_t fracy = computeFracY(frac).x; - auto src0 = - CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type); - auto src1 = - CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type); - auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); - - // We attempt to sample ahead by one chunk and interpolate it with the current - // one. However, due to the complication of upscaling, we may not necessarily - // shift in all the next set of samples. - for (P* end = buf + span; buf < end; buf += 4) { - uv.x += uv_step.x; - I32 ixn = cast(uv.x); - I16 fracn = computeFracNoClamp(ixn); - ixn >>= 7; - auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]), - signed_unpacked_type); - auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]), - signed_unpacked_type); - auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); - - // Since we're upscaling, we know that a source pixel has a larger footprint - // than the destination pixel, and thus all the source pixels needed for - // this chunk will fall within a single chunk of texture data. However, - // since the source pixels don't map 1:1 with destination pixels, we need to - // shift the source pixels over based on their offset from the start of the - // chunk. This could conceivably be optimized better with usage of PSHUFB or - // VTBL instructions However, since PSHUFB requires SSSE3, instead we resort - // to masking in the correct pixels to avoid having to index into memory. - // For the last sample to interpolate with, we need to potentially shift in - // a sample from the next chunk over in the case the samples fill out an - // entire chunk. - auto shuf = src; - auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4); - if (i.x.y == i.x.x) { - shuf = shuf.xxyz; - shufn = shufn.xxyz; - } - if (i.x.z == i.x.y) { - shuf = shuf.xyyz; - shufn = shufn.xyyz; - } - if (i.x.w == i.x.z) { - shuf = shuf.xyzz; - shufn = shufn.xyzz; - } - - // Convert back to a signed unpacked type so that we can interpolate the - // final result. - auto interp = bit_cast<signed_unpacked_type>(shuf); - auto interpn = bit_cast<signed_unpacked_type>(shufn); - interp += applyFracX(interpn - interp, fracx) >> 7; - - commit_blend_span<BLEND>( - buf, applyColor(bit_cast<unpacked_type>(interp), color)); - - i.x = ixn; - fracx = fracn; - src = srcn; - } -} - -// This is the fastest variant of the linear filter that still provides -// filtering. In cases where there is no scaling required, but we have a -// subpixel offset that forces us to blend in neighboring pixels, we can -// optimize away most of the memory loads and shuffling that is required by the -// fallback filter. 
-template <bool BLEND, typename S, typename C, typename P> -static void blendTextureLinearFast(S sampler, vec2 uv, int span, - vec2_scalar min_uv, vec2_scalar max_uv, - C color, P* buf) { - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type; - - ivec2 i(clamp(uv, min_uv, max_uv)); - ivec2 frac = i; - i >>= 7; - P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); - P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); - int16_t fracx = computeFracX(sampler, i, frac).x; - int16_t fracy = computeFracY(frac).x; - auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); - - // Since there is no scaling, we sample ahead by one chunk and interpolate it - // with the current one. We can then reuse this value on the next iteration. - for (P* end = buf + span; buf < end; buf += 4) { - row0 += 4; - row1 += 4; - auto src0n = - CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1n = - CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); - - // For the last sample to interpolate with, we need to potentially shift in - // a sample from the next chunk over since the samples fill out an entire - // chunk. - auto interp = bit_cast<signed_unpacked_type>(src); - auto interpn = - bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4)); - interp += ((interpn - interp) * fracx) >> 7; - - commit_blend_span<BLEND>( - buf, applyColor(bit_cast<unpacked_type>(interp), color)); - - src = srcn; - } -} - -// Implements a faster linear filter that works with axis-aligned constant Y but -// downscaling the texture by half. In this case we can optimize for the -// constant X/Y fractions and reduction factor while minimizing shuffling. 
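In scalar terms, the 2x downscale path amounts to the following per-pixel work, again with 7-bit fixed-point fractions (illustrative names only; the real code below does the same across SIMD chunks with shuffles):

#include <stdint.h>

// Each destination pixel consumes two adjacent source texels (even/odd),
// vertically blended by fracy first, then horizontally blended by fracx.
static void linear_downscale2x_row_r8(uint8_t* dst, const uint8_t* row0,
                                      const uint8_t* row1, int len,
                                      int16_t fracx, int16_t fracy) {
  for (int i = 0; i < len; i++) {
    int16_t even = row0[2 * i] + (((row1[2 * i] - row0[2 * i]) * fracy) >> 7);
    int16_t odd = row0[2 * i + 1] +
                  (((row1[2 * i + 1] - row0[2 * i + 1]) * fracy) >> 7);
    dst[i] = (uint8_t)(even + (((odd - even) * fracx) >> 7));
  }
}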
-template <bool BLEND, typename S, typename C, typename P> -static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span, - vec2_scalar min_uv, - vec2_scalar max_uv, C color, - P* buf) { - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type; - - ivec2 i(clamp(uv, min_uv, max_uv)); - ivec2 frac = i; - i >>= 7; - P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); - P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); - int16_t fracx = computeFracX(sampler, i, frac).x; - int16_t fracy = computeFracY(frac).x; - - for (P* end = buf + span; buf < end; buf += 4) { - auto src0 = - CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1 = - CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); - row0 += 4; - row1 += 4; - auto src0n = - CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1n = - CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); - row0 += 4; - row1 += 4; - - auto interp = - bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6)); - auto interpn = - bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7)); - interp += ((interpn - interp) * fracx) >> 7; - - commit_blend_span<BLEND>( - buf, applyColor(bit_cast<unpacked_type>(interp), color)); - } -} - -enum LinearFilter { - // No linear filter is needed. - LINEAR_FILTER_NEAREST = 0, - // The most general linear filter that handles clamping and varying scales. - LINEAR_FILTER_FALLBACK, - // A linear filter optimized for axis-aligned upscaling. - LINEAR_FILTER_UPSCALE, - // A linear filter with no scaling but with subpixel offset. - LINEAR_FILTER_FAST, - // A linear filter optimized for 2x axis-aligned downscaling. - LINEAR_FILTER_DOWNSCALE -}; - -// Dispatches to an appropriate linear filter depending on the selected filter. -template <bool BLEND, typename S, typename C, typename P> -static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span, - vec2_scalar uv_step, vec2_scalar min_uv, - vec2_scalar max_uv, C color, P* buf, - LinearFilter filter) { - P* end = buf + span; - if (filter != LINEAR_FILTER_FALLBACK) { - // If we're not using the fallback, then Y is constant across the entire - // row. We just need to ensure that we handle any samples that might pull - // data from before the start of the row and require clamping. - float beforeDist = max(0.0f, min_uv.x) - uv.x.x; - if (beforeDist > 0) { - int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0, - int(end - buf)); - buf = blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step, - min_uv, max_uv, color, buf); - uv.x += (before / swgl_StepSize) * uv_step.x; - } - // We need to check how many samples we can take from inside the row without - // requiring clamping. In case the filter oversamples the row by a step, we - // subtract off a step from the width to leave some room. 
- float insideDist = - min(max_uv.x, float((int(sampler->width) - swgl_StepSize) * - swgl_LinearQuantizeScale)) - - uv.x.x; - if (uv_step.x > 0.0f && insideDist >= uv_step.x) { - int inside = int(end - buf); - if (filter == LINEAR_FILTER_DOWNSCALE) { - inside = clamp(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) & - ~(swgl_StepSize - 1), - 0, inside); - blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv, max_uv, - color, buf); - } else if (filter == LINEAR_FILTER_UPSCALE) { - inside = clamp(int(insideDist / uv_step.x) * swgl_StepSize, 0, inside); - blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv, - max_uv, color, buf); - } else { - inside = clamp(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) & - ~(swgl_StepSize - 1), - 0, inside); - blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv, - color, buf); - } - buf += inside; - uv.x += (inside / swgl_StepSize) * uv_step.x; - } - } - // If the fallback filter was requested, or if there are any samples left that - // may be outside the row and require clamping, then handle that with here. - if (buf < end) { - buf = blendTextureLinearFallback<BLEND>( - sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf); - } - return buf; -} - -// Helper function to quantize UVs for linear filtering before dispatch -template <bool BLEND, typename S, typename C, typename P> -static inline int blendTextureLinear(S sampler, vec2 uv, int span, - const vec4_scalar& uv_rect, C color, - P* buf, LinearFilter filter) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); - blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv, max_uv, - color, buf, filter); - return span; -} - -// Samples an axis-aligned span of on a single row of a texture using 1:1 -// nearest filtering. Sampling is constrained to only fall within the given UV -// bounds. This requires a pointer to the destination buffer. An optional color -// modulus can be supplied. -template <bool BLEND, typename S, typename C, typename P> -static int blendTextureNearestFast(S sampler, vec2 uv, int span, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - - ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv))); - ivec2_scalar minUV = - make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y})); - ivec2_scalar maxUV = - make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w})); - - // Calculate the row pointer within the buffer, clamping to within valid row - // bounds. - P* row = - &((P*)sampler - ->buf)[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) * - sampler->stride]; - // Find clamped X bounds within the row. - int minX = clamp(minUV.x, 0, sampler->width - 1); - int maxX = clamp(maxUV.x, minX, sampler->width - 1); - int curX = i.x; - int endX = i.x + span; - // If we need to start sampling below the valid sample bounds, then we need to - // fill this section with a constant clamped sample. - if (curX < minX) { - int n = min(minX, endX) - curX; - auto src = - applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color); - commit_solid_span<BLEND>(buf, src, n); - buf += n; - curX += n; - } - // Here we only deal with valid samples within the sample bounds. No clamping - // should occur here within these inner loops. 
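Both the linear dispatcher and the nearest fast path in this hunk partition a span the same way: a clamped prefix handled by the slower/clamping code, an unclamped interior handled by the fast loop, and a clamped suffix. The real code works in chunks of swgl_StepSize pixels and routes the clamped portions to the fallback filter; the sketch below only shows the per-pixel bookkeeping, with hypothetical names:

#include <algorithm>
#include <cmath>

struct SpanSplit {
  int before;  // pixels that would sample before min_x and need clamping
  int inside;  // pixels that can run the unclamped fast path
  int after;   // trailing pixels past max_x that need clamping again
};

static SpanSplit splitSpan(float start_x, float step_x, float min_x,
                           float max_x, int span) {
  int before = 0;
  if (step_x > 0.0f && start_x < min_x) {
    before = std::min(span, int(std::ceil((min_x - start_x) / step_x)));
  }
  float inside_start = start_x + before * step_x;
  int inside = 0;
  if (step_x > 0.0f && inside_start <= max_x) {
    inside = std::min(span - before, int((max_x - inside_start) / step_x) + 1);
  }
  return SpanSplit{before, inside, span - before - inside};
}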
- int n = max(min(maxX + 1, endX) - curX, 0); - // Try to process as many chunks as possible with full loads and stores. - for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) { - auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color); - commit_blend_span<BLEND>(buf, src); - } - n &= 3; - // If we have any leftover samples after processing chunks, use partial loads - // and stores. - if (n > 0) { - auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color); - commit_blend_span<BLEND>(buf, src, n); - buf += n; - curX += n; - } - // If we still have samples left above the valid sample bounds, then we again - // need to fill this section with a constant clamped sample. - if (curX < endX) { - auto src = - applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color); - commit_solid_span<BLEND>(buf, src, endX - curX); - } - return span; -} - -// We need to verify that the pixel step reasonably approximates stepping by a -// single texel for every pixel we need to reproduce. Try to ensure that the -// margin of error is no more than approximately 2^-7. Also, we check here if -// the scaling can be quantized for acceleration. -template <typename T> -static ALWAYS_INLINE int spanNeedsScale(int span, T P) { - span &= ~(128 - 1); - span += 128; - int scaled = round((P.x.y - P.x.x) * span); - return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0; -} - -// Helper function to decide whether we can safely apply 1:1 nearest filtering -// without diverging too much from the linear filter. -template <typename S, typename T> -static inline LinearFilter needsTextureLinear(S sampler, T P, int span) { - // First verify if the row Y doesn't change across samples - if (P.y.x != P.y.y) { - return LINEAR_FILTER_FALLBACK; - } - P = samplerScale(sampler, P); - if (int scale = spanNeedsScale(span, P)) { - // If the source region is not flipped and smaller than the destination, - // then we can use the upscaling filter since row Y is constant. - return P.x.x < P.x.y && P.x.y - P.x.x <= 1 - ? LINEAR_FILTER_UPSCALE - : (scale == 2 ? LINEAR_FILTER_DOWNSCALE - : LINEAR_FILTER_FALLBACK); - } - // Also verify that we're reasonably close to the center of a texel - // so that it doesn't look that much different than if a linear filter - // was used. - if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 || - (int(P.y.x * 4.0f + 0.5f) & 3) != 2) { - // The source and destination regions are the same, but there is a - // significant subpixel offset. We can use a faster linear filter to deal - // with the offset in this case. - return LINEAR_FILTER_FAST; - } - // Otherwise, we have a constant 1:1 step and we're stepping reasonably close - // to the center of each pixel, so it's safe to disable the linear filter and - // use nearest. 
- return LINEAR_FILTER_NEAREST; -} - -// Commit an entire span with linear filtering -#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int len = (n); \ - int drawn = 0; \ - if (LinearFilter filter = needsTextureLinear(s, p, len)) { \ - if (blend_key) { \ - drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format, filter); \ - } else { \ - drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format, filter); \ - } \ - } else if (blend_key) { \ - drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \ - swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength) -#define swgl_commitTextureLinearR8(s, p, uv_rect) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength) - -// Commit a partial span with linear filtering, optionally inverting the color -#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \ - min(int(len), swgl_SpanLength)) -#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \ - min(int(len), swgl_SpanLength)) - -// Commit an entire span with linear filtering that is scaled by a color -#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \ - swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength) -#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength) - -// Helper function that samples from an R8 texture while expanding it to support -// a differing framebuffer format. 
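All of the swgl_commitTexture* macros above share one bookkeeping pattern: pick the blending or non-blending specialization once per span, then advance the output pointer and shrink the remaining span by however many pixels the helper actually committed (possibly zero, e.g. on a format mismatch, in which case the ordinary fragment shader handles the rest). A hedged restatement as a plain function with stand-in callables:

template <typename P, typename DrawBlend, typename DrawNoBlend>
static void commitSpan(bool blendEnabled, P*& out, int& spanLength,
                       DrawBlend drawBlend, DrawNoBlend drawNoBlend) {
  // Each callable returns the number of pixels it committed.
  int drawn = blendEnabled ? drawBlend(out, spanLength)
                           : drawNoBlend(out, spanLength);
  out += drawn;         // advance past the committed pixels
  spanLength -= drawn;  // leave any uncommitted remainder to the caller
}

Keeping BLEND as a compile-time template parameter lets the non-blended specialization drop the blending code entirely, while the runtime branch on blend_key happens once per span rather than once per pixel.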
-template <bool BLEND, typename S, typename C, typename P> -static inline int blendTextureLinearR8(S sampler, vec2 uv, int span, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!swgl_isTextureR8(sampler)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - commit_blend_span<BLEND>( - buf, applyColor(expand_mask(buf, textureLinearUnpackedR8( - sampler, - ivec2(clamp(uv, min_uv, max_uv)))), - color)); - } - return span; -} - -// Commit an entire span with linear filtering while expanding from R8 to RGBA8 -#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \ - do { \ - auto packed_color = packColor(swgl_OutRGBA8, color); \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_OutRGBA8); \ - } else { \ - drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_OutRGBA8); \ - } \ - swgl_OutRGBA8 += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \ - swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor()) - -// Compute repeating UVs, possibly constrained by tile repeat limits -static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) { - if (tile_repeat.x > 0.0f) { - // Clamp to a number slightly less than the tile repeat limit so that - // it results in a number close to but not equal to 1 after fract(). - // This avoids fract() yielding 0 if the limit was left as whole integer. - uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f); - } - return fract(uv); -} - -// Compute the number of non-repeating steps before we need to potentially -// repeat the UVs. -static inline int computeNoRepeatSteps(Float uv, float uv_step, - float tile_repeat, int steps) { - if (uv.w < uv.x) { - // Ensure the UV taps are ordered low to high. - uv = uv.wzyx; - } - // Check if the samples cross the boundary of the next whole integer or the - // tile repeat limit, whichever is lower. - float limit = floor(uv.x) + 1.0f; - if (tile_repeat > 0.0f) { - limit = min(limit, tile_repeat); - } - return uv.x >= 0.0f && uv.w < limit - ? (uv_step != 0.0f - ? int(min(float(steps), (limit - uv.x) / uv_step)) - : steps) - : 0; -} - -// Blends an entire span of texture with linear filtering and repeating UVs. -template <bool BLEND, typename S, typename C, typename P> -static int blendTextureLinearRepeat(S sampler, vec2 uv, int span, - const vec2_scalar& tile_repeat, - const vec4_scalar& uv_repeat, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y}; - vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y}; - // Choose a linear filter to use for no-repeat sub-spans - LinearFilter filter = - needsTextureLinear(sampler, uv * uv_scale + uv_offset, span); - // We need to step UVs unscaled and unquantized so that we can modulo them - // with fract. We use uv_scale and uv_offset to map them into the correct - // range. 
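As a worked example of the no-repeat step count above: with the lowest of the four UV taps at u = 0.3, a per-chunk step of 0.1, and no tile-repeat limit, the wrap boundary is floor(0.3) + 1 = 1.0, so at most int((1.0 - 0.3) / 0.1) = 7 chunks can be emitted with plain clamped sampling before fract() has to be reapplied to the UVs.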
- vec2_scalar uv_step = - float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; - uv_scale = swgl_linearQuantizeStep(sampler, uv_scale); - uv_offset = swgl_linearQuantize(sampler, uv_offset); - vec2_scalar min_uv = max( - swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); - vec2_scalar max_uv = max( - swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv); - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - int steps = int(end - buf) / swgl_StepSize; - // Find the sub-span before UVs repeat to avoid expensive repeat math - steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); - if (steps > 0) { - steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); - if (steps > 0) { - buf = blendTextureLinearDispatch<BLEND>( - sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize, - uv_step * uv_scale, min_uv, max_uv, color, buf, filter); - if (buf >= end) { - break; - } - uv += steps * uv_step; - } - } - // UVs might repeat within this step, so explicitly compute repeated UVs - vec2 repeated_uv = clamp( - tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv); - commit_blend_span<BLEND>( - buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)), - color)); - } - return span; -} - -// Commit an entire span with linear filtering and repeating UVs -#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \ - uv_rect, color) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength, \ - tile_repeat, uv_repeat, uv_rect, \ - packed_color, swgl_Out##format); \ - } else { \ - drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength, \ - tile_repeat, uv_repeat, uv_rect, \ - packed_color, swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ - uv_rect) \ - swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ - NoColor()) -#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \ - uv_rect, color) \ - swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ - color) - -template <typename S> -static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf, - S sampler, ivec2 i) { - return textureNearestPackedRGBA8(sampler, i); -} - -// Blends an entire span of texture with nearest filtering and either -// repeated or clamped UVs. -template <bool BLEND, bool REPEAT, typename S, typename C, typename P> -static int blendTextureNearestRepeat(S sampler, vec2 uv, int span, - const vec2_scalar& tile_repeat, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - if (!REPEAT) { - // If clamping, then we step pre-scaled to the sampler. For repeat modes, - // this will be accomplished via uv_scale instead. - uv = samplerScale(sampler, uv); - } - vec2_scalar uv_step = - float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; - vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}); - vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}); - vec2_scalar uv_scale = max_uv - min_uv; - // If the effective sampling area of this texture is only a single pixel, then - // treat it as a solid span. 
For repeat modes, the bounds are specified on - // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so - // the test varies depending on which. If the sample range on an axis is - // greater than one pixel, we can still check if we don't move far enough from - // the pixel center on that axis to hit the next pixel. - if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) || - (uv_step.x * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) && - (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) || - (uv_step.y * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) { - vec2 repeated_uv = REPEAT - ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv - : clamp(uv, min_uv, max_uv); - commit_solid_span<BLEND>(buf, - applyColor(unpack(textureNearestPacked( - buf, sampler, ivec2(repeated_uv))), - color), - span); - } else { - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - if (REPEAT) { - int steps = int(end - buf) / swgl_StepSize; - // Find the sub-span before UVs repeat to avoid expensive repeat math - steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); - if (steps > 0) { - steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); - if (steps > 0) { - vec2 inside_uv = fract(uv) * uv_scale + min_uv; - vec2 inside_step = uv_step * uv_scale; - for (P* outside = &buf[steps * swgl_StepSize]; buf < outside; - buf += swgl_StepSize, inside_uv += inside_step) { - commit_blend_span<BLEND>( - buf, applyColor( - textureNearestPacked(buf, sampler, ivec2(inside_uv)), - color)); - } - if (buf >= end) { - break; - } - uv += steps * uv_step; - } - } - } - - // UVs might repeat within this step, so explicitly compute repeated UVs - vec2 repeated_uv = REPEAT - ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv - : clamp(uv, min_uv, max_uv); - commit_blend_span<BLEND>( - buf, - applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)), - color)); - } - } - return span; -} - -// Determine if we can use the fast nearest filter for the given nearest mode. -// If the Y coordinate varies more than half a pixel over -// the span (which might cause the texel to alias to the next one), or the span -// needs X scaling, then we have to use the fallback. 
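That decision boils down to two checks: the X step must reproduce one texel per pixel to within roughly 2^-7 over the whole span, and Y must drift by less than half a texel. A hedged sketch of just that test (the real code additionally distinguishes the upscale, 2x downscale, and subpixel-offset fast paths):

#include <cmath>

static bool canUseNearest1to1(float x_step_per_pixel, float y_step_per_pixel,
                              int span) {
  // Pad the span to a multiple of 128 pixels so a per-pixel error on the
  // order of 2^-7 accumulates into a detectable difference after rounding.
  int padded = (span & ~127) + 128;
  bool xIsUnit = int(std::round(x_step_per_pixel * padded)) == padded;
  // If Y moves less than half a texel across the span, every pixel still
  // snaps to the same source row.
  bool yIsFlat = y_step_per_pixel * span < 0.5f;
  return xIsUnit && yIsFlat;
}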
-template <typename S, typename T> -static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) { - P = samplerScale(sampler, P); - return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P); -} - -// Commit an entire span with nearest filtering and either clamped or repeating -// UVs -#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int drawn = 0; \ - if (needsNearestFallback(s, p, swgl_SpanLength)) { \ - if (blend_key) { \ - drawn = blendTextureNearestRepeat<true, false>( \ - s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ - swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestRepeat<false, false>( \ - s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ - swgl_Out##format); \ - } \ - } else if (blend_key) { \ - drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \ - swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor()) -#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \ - swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color) - -#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \ - color) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendTextureNearestRepeat<true, true>( \ - s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ - swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestRepeat<false, true>( \ - s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ - swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ - uv_rect) \ - swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \ - NoColor()) -#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \ - uv_repeat, uv_rect, color) \ - swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color) - -// Commit an entire span of texture with filtering determined by sampler state. -#define swgl_commitTexture(format, s, ...) \ - do { \ - if (s->filter == TextureFilter::LINEAR) { \ - swgl_commitTextureLinear##format(s, __VA_ARGS__); \ - } else { \ - swgl_commitTextureNearest##format(s, __VA_ARGS__); \ - } \ - } while (0) -#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__) -#define swgl_commitTextureColorRGBA8(...) \ - swgl_commitTexture(ColorRGBA8, __VA_ARGS__) -#define swgl_commitTextureRepeatRGBA8(...) \ - swgl_commitTexture(RepeatRGBA8, __VA_ARGS__) -#define swgl_commitTextureRepeatColorRGBA8(...) \ - swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__) - -// Commit an entire span of a separable pass of a Gaussian blur that falls -// within the given radius scaled by supplied coefficients, clamped to uv_rect -// bounds. 
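The blur commit below runs one axis of a separable Gaussian: a 1-D kernel along X in one pass and along Y in another. The exact meaning of the two coefficients handed to the sampler helpers is an internal detail not spelled out in this file, so as a reference point only, a plain horizontal pass with explicit, pre-normalized weights looks like this:

#include <algorithm>
#include <vector>

static void blurRowHorizontal(const float* src, float* dst, int width,
                              const std::vector<float>& weights, int radius) {
  // weights has 2 * radius + 1 entries summing to 1.
  for (int x = 0; x < width; ++x) {
    float sum = 0.0f;
    for (int k = -radius; k <= radius; ++k) {
      int sx = std::clamp(x + k, 0, width - 1);  // clamp taps to the row
      sum += src[sx] * weights[k + radius];
    }
    dst[x] = sum;
  }
}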
-template <bool BLEND, typename S, typename P> -static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect, - P* buf, int span, bool hori, int radius, - vec2_scalar coeffs) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - vec2_scalar size = {float(sampler->width), float(sampler->height)}; - ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size); - ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); - int startX = curUV.x; - int endX = min(bounds.z, curUV.x + span); - if (hori) { - for (; curUV.x + swgl_StepSize <= endX; - buf += swgl_StepSize, curUV.x += swgl_StepSize) { - commit_blend_span<BLEND>( - buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z, - radius, coeffs.x, coeffs.y)); - } - } else { - for (; curUV.x + swgl_StepSize <= endX; - buf += swgl_StepSize, curUV.x += swgl_StepSize) { - commit_blend_span<BLEND>( - buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w, - radius, coeffs.x, coeffs.y)); - } - } - return curUV.x - startX; -} - -#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \ - do { \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format, \ - swgl_SpanLength, hori, radius, coeffs); \ - } else { \ - drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format, \ - swgl_SpanLength, hori, radius, coeffs); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \ - swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs) -#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \ - swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs) - -// Convert and pack planar YUV samples to RGB output using a color space -static ALWAYS_INLINE PackedRGBA8 convertYUV(int colorSpace, U16 y, U16 u, - U16 v) { - auto yy = V8<int16_t>(zip(y, y)); - auto uv = V8<int16_t>(zip(u, v)); - return yuvMatrix[colorSpace].convert(yy, uv); -} - -// Helper functions to sample from planar YUV textures before converting to RGB -template <typename S0> -static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, - int colorSpace, - UNUSED int rescaleFactor) { - switch (sampler0->format) { - case TextureFormat::RGBA8: { - auto planar = textureLinearPlanarRGBA8(sampler0, uv0); - return convertYUV(colorSpace, highHalf(planar.rg), lowHalf(planar.rg), - lowHalf(planar.ba)); - } - case TextureFormat::YUV422: { - auto planar = textureLinearPlanarYUV422(sampler0, uv0); - return convertYUV(colorSpace, planar.y, planar.u, planar.v); - } - default: - assert(false); - return PackedRGBA8(0); - } -} - -template <bool BLEND, typename S0, typename P, typename C = NoColor> -static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, int colorSpace, - int rescaleFactor, C color = C()) { - if (!swgl_isTextureLinear(sampler0)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - auto c = packColor(buf, color); - auto* end = buf + span; - for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) { - commit_blend_span<BLEND>( - buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), - colorSpace, rescaleFactor), - c)); - } - return span; -} - -template <typename S0, typename S1> -static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, - ivec2 uv1, int colorSpace, - UNUSED int rescaleFactor) { - switch (sampler1->format) { 
- case TextureFormat::RG8: { - assert(sampler0->format == TextureFormat::R8); - auto y = textureLinearUnpackedR8(sampler0, uv0); - auto planar = textureLinearPlanarRG8(sampler1, uv1); - return convertYUV(colorSpace, y, lowHalf(planar.rg), highHalf(planar.rg)); - } - case TextureFormat::RGBA8: { - assert(sampler0->format == TextureFormat::R8); - auto y = textureLinearUnpackedR8(sampler0, uv0); - auto planar = textureLinearPlanarRGBA8(sampler1, uv1); - return convertYUV(colorSpace, y, lowHalf(planar.ba), highHalf(planar.rg)); - } - default: - assert(false); - return PackedRGBA8(0); - } -} - -template <bool BLEND, typename S0, typename S1, typename P, - typename C = NoColor> -static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, - const vec4_scalar& uv_rect1, int colorSpace, - int rescaleFactor, C color = C()) { - if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); - auto c = packColor(buf, color); - auto* end = buf + span; - for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) { - commit_blend_span<BLEND>( - buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), - sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), - colorSpace, rescaleFactor), - c)); - } - return span; -} - -template <typename S0, typename S1, typename S2> -static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, - ivec2 uv1, S2 sampler2, ivec2 uv2, - int colorSpace, int rescaleFactor) { - assert(sampler0->format == sampler1->format && - sampler0->format == sampler2->format); - switch (sampler0->format) { - case TextureFormat::R8: { - auto y = textureLinearUnpackedR8(sampler0, uv0); - auto u = textureLinearUnpackedR8(sampler1, uv1); - auto v = textureLinearUnpackedR8(sampler2, uv2); - return convertYUV(colorSpace, y, u, v); - } - case TextureFormat::R16: { - // The rescaling factor represents how many bits to add to renormalize the - // texture to 16 bits, and so the color depth is actually 16 minus the - // rescaling factor. - // Need to right shift the sample by the amount of bits over 8 it - // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit - // of precision at the low end already, hence 1 is subtracted from the - // color depth. - int colorDepth = 16 - rescaleFactor; - int rescaleBits = (colorDepth - 1) - 8; - auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; - auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits; - auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits; - return convertYUV(colorSpace, U16(y), U16(u), U16(v)); - } - default: - assert(false); - return PackedRGBA8(0); - } -} - -// Fallback helper for when we can't specifically accelerate YUV with -// composition. 
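All of these plane layouts funnel into the same per-pixel conversion that the packed yuvMatrix performs in SIMD form. A scalar reference, shown with the standard BT.601 limited-range coefficients (the actual matrix is selected per color space by yuvMatrix[colorSpace]):

#include <algorithm>
#include <cstdint>

static inline uint8_t clamp255(float v) {
  return uint8_t(std::min(std::max(v, 0.0f), 255.0f));
}

static void yuvToRgb601(uint8_t y, uint8_t u, uint8_t v, uint8_t* rgb) {
  float yf = 1.164f * (y - 16);  // expand limited-range luma
  float uf = u - 128.0f;         // center the chroma samples
  float vf = v - 128.0f;
  rgb[0] = clamp255(yf + 1.596f * vf);                // R
  rgb[1] = clamp255(yf - 0.391f * uf - 0.813f * vf);  // G
  rgb[2] = clamp255(yf + 2.018f * uf);                // B
}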
-template <bool BLEND, typename S0, typename S1, typename S2, typename P, - typename C> -static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0, - vec2_scalar uv_step0, vec2_scalar min_uv0, - vec2_scalar max_uv0, S1 sampler1, vec2 uv1, - vec2_scalar uv_step1, vec2_scalar min_uv1, - vec2_scalar max_uv1, S2 sampler2, vec2 uv2, - vec2_scalar uv_step2, vec2_scalar min_uv2, - vec2_scalar max_uv2, int colorSpace, - int rescaleFactor, C color) { - for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0, - uv1 += uv_step1, uv2 += uv_step2) { - commit_blend_span<BLEND>( - buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), - sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), - sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)), - colorSpace, rescaleFactor), - color)); - } -} - -template <bool BLEND, typename S0, typename S1, typename S2, typename P, - typename C = NoColor> -static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, - const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2, - const vec4_scalar& uv_rect2, int colorSpace, - int rescaleFactor, C color = C()) { - if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || - !swgl_isTextureLinear(sampler2)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); - LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); - auto c = packColor(buf, color); - blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0, - sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, - uv2, uv_step2, min_uv2, max_uv2, colorSpace, - rescaleFactor, c); - return span; -} - -// A variant of the blendYUV that attempts to reuse the inner loops from the -// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on -// the source data, which in turn allows it to be much faster than blendYUV. -// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer -// and that no color scaling is applied, which we can accomplish via template -// specialization. We need to further validate inside that texture formats -// and dimensions are sane for video and that the video is axis-aligned before -// acceleration can proceed. -template <bool BLEND> -static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, sampler2DRect sampler1, - vec2 uv1, const vec4_scalar& uv_rect1, - sampler2DRect sampler2, vec2 uv2, - const vec4_scalar& uv_rect2, int colorSpace, - int rescaleFactor, NoColor noColor = NoColor()) { - if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || - !swgl_isTextureLinear(sampler2)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); - LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); - auto* end = buf + span; - // CompositeYUV imposes further restrictions on the source textures, such that - // the the Y/U/V samplers must all have a matching format, the U/V samplers - // must have matching sizes and sample coordinates, and there must be no - // change in row across the entire span. 
- if (sampler0->format == sampler1->format && - sampler1->format == sampler2->format && - sampler1->width == sampler2->width && - sampler1->height == sampler2->height && uv_step0.y == 0 && - uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 && - uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) { - // CompositeYUV does not support a clamp rect, so we must take care to - // advance till we're inside the bounds of the clamp rect. - int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x, - (min_uv1.x - uv1.x.x) / uv_step1.x))), - (end - buf) / swgl_StepSize); - if (outside > 0) { - blendYUVFallback<BLEND>( - buf, outside * swgl_StepSize, sampler0, uv0, uv_step0, min_uv0, - max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, uv2, - uv_step2, min_uv2, max_uv2, colorSpace, rescaleFactor, noColor); - buf += outside * swgl_StepSize; - uv0.x += outside * uv_step0.x; - uv1.x += outside * uv_step1.x; - uv2.x += outside * uv_step2.x; - } - // Find the amount of chunks inside the clamp rect before we hit the - // maximum. If there are any chunks inside, we can finally dispatch to - // CompositeYUV. - int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x, - (max_uv1.x - uv1.x.x) / uv_step1.x)), - (end - buf) / swgl_StepSize); - if (inside > 0) { - // We need the color depth, which is relative to the texture format and - // rescale factor. - int colorDepth = - (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor; - // Finally, call the inner loop of CompositeYUV. - linear_row_yuv<BLEND>( - buf, inside * swgl_StepSize, sampler0, force_scalar(uv0), - uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1), - uv_step1.x / swgl_StepSize, colorDepth, yuvMatrix[colorSpace]); - // Now that we're done, advance past the processed inside portion. - buf += inside * swgl_StepSize; - uv0.x += inside * uv_step0.x; - uv1.x += inside * uv_step1.x; - uv2.x += inside * uv_step2.x; - } - } - // We either got here because we have some samples outside the clamp rect, or - // because some of the preconditions were not satisfied. Process whatever is - // left of the span. - blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0, - max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, - sampler2, uv2, uv_step2, min_uv2, max_uv2, colorSpace, - rescaleFactor, noColor); - return span; -} - -// Commit a single chunk of a YUV surface represented by multiple planar -// textures. This requires a color space specifier selecting how to convert -// from YUV to RGB output. In the case of HDR formats, a rescaling factor -// selects how many bits of precision must be utilized on conversion. See the -// sampleYUV dispatcher functions for the various supported plane -// configurations this intrinsic accepts. -#define swgl_commitTextureLinearYUV(...) \ - do { \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ - } else { \ - drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ - } \ - swgl_OutRGBA8 += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) - -// Commit a single chunk of a YUV surface scaled by a color. -#define swgl_commitTextureLinearColorYUV(...) \ - swgl_commitTextureLinearYUV(__VA_ARGS__) - -// Each gradient stops entry is a pair of RGBA32F start color and end step. -struct GradientStops { - Float startColor; - union { - Float stepColor; - vec4_scalar stepData; - }; - - // Whether this gradient entry can be merged with an adjacent entry. 
The - // step will be equal with the adjacent step if and only if they can be - // merged, or rather, that the stops are actually part of a single larger - // gradient. - bool can_merge(const GradientStops& next) const { - return stepData == next.stepData; - } - - // Get the interpolated color within the entry based on the offset from its - // start. - Float interpolate(float offset) const { - return startColor + stepColor * offset; - } - - // Get the end color of the entry where interpolation stops. - Float end_color() const { return startColor + stepColor; } -}; - -// Checks if a gradient table of the specified size exists at the UV coords of -// the address within an RGBA32F texture. If so, a linear address within the -// texture is returned that may be used to sample the gradient table later. If -// the address doesn't describe a valid gradient, then a negative value is -// returned. -static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address, - int entries) { - return sampler->format == TextureFormat::RGBA32F && address.y >= 0 && - address.y < int(sampler->height) && address.x >= 0 && - address.x < int(sampler->width) && entries > 0 && - address.x + - int(sizeof(GradientStops) / sizeof(Float)) * entries <= - int(sampler->width) - ? address.y * sampler->stride + address.x * 4 - : -1; -} - -static inline WideRGBA8 sampleGradient(sampler2D sampler, int address, - Float entry) { - assert(sampler->format == TextureFormat::RGBA32F); - assert(address >= 0 && address < int(sampler->height * sampler->stride)); - // Get the integer portion of the entry index to find the entry colors. - I32 index = cast(entry); - // Use the fractional portion of the entry index to control blending between - // entry colors. - Float offset = entry - cast(index); - // Every entry is a pair of colors blended by the fractional offset. - assert(test_all(index >= 0 && - index * int(sizeof(GradientStops) / sizeof(Float)) < - int(sampler->width))); - GradientStops* stops = (GradientStops*)&sampler->buf[address]; - // Blend between the colors for each SIMD lane, then pack them to RGBA8 - // result. Since the layout of the RGBA8 framebuffer is actually BGRA while - // the gradient table has RGBA colors, swizzling is required. - return combine( - packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw), - round_pixel(stops[index.y].interpolate(offset.y).zyxw)), - packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw), - round_pixel(stops[index.w].interpolate(offset.w).zyxw))); -} - -// Samples a gradient entry from the gradient at the provided linearized -// address. The integer portion of the entry index is used to find the entry -// within the table whereas the fractional portion is used to blend between -// adjacent table entries. -#define swgl_commitGradientRGBA8(sampler, address, entry) \ - swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry)) - -// Variant that allows specifying a color multiplier of the gradient result. -#define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \ - swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \ - packColor(swgl_OutRGBA, color))) - -// Samples an entire span of a linear gradient by crawling the gradient table -// and looking for consecutive stops that can be merged into a single larger -// gradient, then interpolating between those larger gradients within the span. 
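The crawl described above relies on one observation: adjacent table entries belong to a single larger linear segment exactly when they carry the same per-entry step color, so the walk extends an index range while that holds and then interpolates across the merged range in one pass. An illustrative sketch that only loosely mirrors GradientStops:

struct StopEntry {
  float startColor[4];
  float stepColor[4];
};

static bool sameStep(const StopEntry& a, const StopEntry& b) {
  for (int i = 0; i < 4; ++i) {
    if (a.stepColor[i] != b.stepColor[i]) return false;
  }
  return true;
}

// Walk forward while the next entry continues the same linear ramp; returns
// the last entry of the merged segment starting at index.
static int extendMergedRange(const StopEntry* stops, int index, int maxEntry) {
  while (index + 1 <= maxEntry && sameStep(stops[index], stops[index + 1])) {
    ++index;
  }
  return index;
}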
-template <bool BLEND> -static bool commitLinearGradient(sampler2D sampler, int address, float size, - bool repeat, Float offset, uint32_t* buf, - int span) { - assert(sampler->format == TextureFormat::RGBA32F); - assert(address >= 0 && address < int(sampler->height * sampler->stride)); - GradientStops* stops = (GradientStops*)&sampler->buf[address]; - // Get the chunk delta from the difference in offset steps. This represents - // how far within the gradient table we advance for every step in output, - // normalized to gradient table size. - float delta = (offset.y - offset.x) * 4.0f; - if (!isfinite(delta)) { - return false; - } - for (; span > 0;) { - // If repeat is desired, we need to limit the offset to a fractional value. - if (repeat) { - offset = fract(offset); - } - // Try to process as many chunks as are within the span if possible. - float chunks = 0.25f * span; - // To properly handle both clamping and repeating of the table offset, we - // need to ensure we don't run past the 0 and 1 points. Here we compute the - // intercept points depending on whether advancing forwards or backwards in - // the gradient table to ensure the chunk count is limited by the amount - // before intersection. If there is no delta, then we compute no intercept. - float startEntry; - int minIndex, maxIndex; - if (offset.x < 0) { - // If we're below the gradient table, use the first color stop. We can - // only intercept the table if walking forward. - startEntry = 0; - minIndex = int(startEntry); - maxIndex = minIndex; - if (delta > 0) { - chunks = min(chunks, -offset.x / delta); - } - } else if (offset.x < 1) { - // Otherwise, we're inside the gradient table. Depending on the direction - // we're walking the the table, we may intersect either the 0 or 1 offset. - // Compute the start entry based on our initial offset, and compute the - // end entry based on the available chunks limited by intercepts. Clamp - // them into the valid range of the table. - startEntry = 1.0f + offset.x * size; - if (delta < 0) { - chunks = min(chunks, -offset.x / delta); - } else if (delta > 0) { - chunks = min(chunks, (1 - offset.x) / delta); - } - float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size, - 0.0f, 1.0f + size); - // Now that we know the range of entries we need to sample, we want to - // find the largest possible merged gradient within that range. Depending - // on which direction we are advancing in the table, we either walk up or - // down the table trying to merge the current entry with the adjacent - // entry. We finally limit the chunks to only sample from this merged - // gradient. - minIndex = int(startEntry); - maxIndex = minIndex; - if (delta > 0) { - while (maxIndex + 1 < endEntry && - stops[maxIndex].can_merge(stops[maxIndex + 1])) { - maxIndex++; - } - chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size)); - } else if (delta < 0) { - while (minIndex - 1 > endEntry && - stops[minIndex - 1].can_merge(stops[minIndex])) { - minIndex--; - } - chunks = min(chunks, (minIndex - startEntry) / (delta * size)); - } - } else { - // If we're above the gradient table, use the last color stop. We can - // only intercept the table if walking backward. 
- startEntry = 1.0f + size; - minIndex = int(startEntry); - maxIndex = minIndex; - if (delta < 0) { - chunks = min(chunks, (1 - offset.x) / delta); - } - } - // If there are any amount of whole chunks of a merged gradient found, - // then we want to process that as a single gradient span with the start - // and end colors from the min and max entries. - if (chunks >= 1.0f) { - int inside = int(chunks); - // Sample the start color from the min entry and the end color from the - // max entry of the merged gradient. These are scaled to a range of - // 0..0xFF00, as that is the largest shifted value that can fit in a U16. - // Since we are only doing addition with the step value, we can still - // represent negative step values without having to use an explicit sign - // bit, as the result will still come out the same, allowing us to gain an - // extra bit of precision. We will later shift these into 8 bit output - // range while committing the span, but stepping with higher precision to - // avoid banding. We convert from RGBA to BGRA here to avoid doing this in - // the inner loop. - auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00); - auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00); - // Get the color range of the merged gradient, normalized to its size. - auto colorRangeF = - (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex)); - // Compute the actual starting color of the current start offset within - // the merged gradient. The value 0.5 is added to the low bits (0x80) so - // that the color will effective round to the nearest increment below. - auto colorF = - minColorF + colorRangeF * (startEntry - minIndex) + float(0x80); - // Compute the portion of the color range that we advance on each chunk. - Float deltaColorF = colorRangeF * (delta * size); - // Quantize the color delta and current color. These have already been - // scaled to the 0..0xFF00 range, so we just need to round them to U16. - auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16)); - auto color = - combine(CONVERT(round_pixel(colorF, 1), U16), - CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16), - CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16), - CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16)); - // Finally, step the current color through the output chunks, shifting - // it into 8 bit range and outputting as we go. - for (auto* end = buf + inside * 4; buf < end; buf += 4) { - commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8)); - color += deltaColor; - } - // Deduct the number of chunks inside the gradient from the remaining - // overall span. If we exhausted the span, bail out. - span -= inside * 4; - if (span <= 0) { - break; - } - // Otherwise, assume we're in a transitional section of the gradient that - // will probably require per-sample table lookups, so fall through below. - offset += inside * delta; - if (repeat) { - offset = fract(offset); - } - } - // If we get here, there were no whole chunks of a merged gradient found - // that we could process, but we still have a non-zero amount of span left. - // That means we have segments of gradient that begin or end at the current - // entry we're on. For this case, we just fall back to sampleGradient which - // will calculate a table entry for each sample, assuming the samples may - // have different table entries. 
- Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); - commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry)); - span -= 4; - buf += 4; - offset += delta; - } - return true; -} - -// Commits an entire span of a linear gradient, given the address of a table -// previously resolved with swgl_validateGradient. The size of the inner portion -// of the table is given, assuming the table start and ends with a single entry -// each to deal with clamping. Repeating will be handled if necessary. The -// initial offset within the table is used to designate where to start the span -// and how to step through the gradient table. -#define swgl_commitLinearGradientRGBA8(sampler, address, size, repeat, offset) \ - do { \ - bool drawn = false; \ - if (blend_key) { \ - drawn = \ - commitLinearGradient<true>(sampler, address, size, repeat, offset, \ - swgl_OutRGBA8, swgl_SpanLength); \ - } else { \ - drawn = \ - commitLinearGradient<false>(sampler, address, size, repeat, offset, \ - swgl_OutRGBA8, swgl_SpanLength); \ - } \ - if (drawn) { \ - swgl_OutRGBA8 += swgl_SpanLength; \ - swgl_SpanLength = 0; \ - } \ - } while (0) - -template <bool CLAMP, typename V> -static ALWAYS_INLINE V fastSqrt(V v) { -#if USE_SSE2 || USE_NEON - // Clamp to avoid zero in inversesqrt. - return v * inversesqrt(CLAMP ? max(v, V(1.0e-10f)) : v); -#else - return sqrt(v); -#endif -} - -template <bool CLAMP, typename V> -static ALWAYS_INLINE auto fastLength(V v) { - return fastSqrt<CLAMP>(dot(v, v)); -} - -// Samples an entire span of a radial gradient by crawling the gradient table -// and looking for consecutive stops that can be merged into a single larger -// gradient, then interpolating between those larger gradients within the span -// based on the computed position relative to a radius. -template <bool BLEND> -static bool commitRadialGradient(sampler2D sampler, int address, float size, - bool repeat, vec2 pos, float radius, - uint32_t* buf, int span) { - assert(sampler->format == TextureFormat::RGBA32F); - assert(address >= 0 && address < int(sampler->height * sampler->stride)); - GradientStops* stops = (GradientStops*)&sampler->buf[address]; - // clang-format off - // Given position p, delta d, and radius r, we need to repeatedly solve the - // following quadratic for the pixel offset t: - // length(p + t*d) = r - // (px + t*dx)^2 + (py + t*dy)^2 = r^2 - // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is: - // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0 - // t^2*d.d + t*2*d.p + (p.p-r^2) = 0 - // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to: - // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d) - // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so - // we cache them below for faster computation. - // - // The quadratic has two solutions, representing the span intersecting the - // given radius of gradient, which can occur at two offsets. If there is only - // one solution (where b^2-4ac = 0), this represents the point at which the - // span runs tangent to the radius. This middle point is significant in that - // before it, we walk down the gradient ramp, and after it, we walk up the - // ramp. 
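Restating that quadratic directly as code, purely for illustration (the real loop below instead caches the dot products and accumulates them incrementally rather than solving per pixel):

#include <cmath>

struct Vec2f { float x, y; };
static inline float dot2(Vec2f a, Vec2f b) { return a.x * b.x + a.y * b.y; }

// Given span start position p, per-pixel delta d, and a radius r, writes the
// two pixel offsets (t0 <= t1) at which the span crosses that radius, or
// returns false if it never reaches it.
static bool radiusCrossings(Vec2f p, Vec2f d, float r, float* t0, float* t1) {
  float dd = dot2(d, d);
  if (dd <= 0.0f) return false;  // degenerate: position is invariant
  float mid = -dot2(d, p) / dd;  // offset of closest approach (the mid-point)
  float disc = mid * mid - (dot2(p, p) - r * r) / dd;
  if (disc < 0.0f) return false;  // span stays entirely inside or outside r
  float s = std::sqrt(disc);
  *t0 = mid - s;
  *t1 = mid + s;
  return true;
}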
- // clang-format on - vec2_scalar pos0 = {pos.x.x, pos.y.x}; - vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x}; - float deltaDelta = dot(delta, delta); - if (!isfinite(deltaDelta) || !isfinite(radius)) { - return false; - } - float invDelta, middleT, middleB; - if (deltaDelta > 0) { - invDelta = 1.0f / deltaDelta; - middleT = -dot(delta, pos0) * invDelta; - middleB = middleT * middleT - dot(pos0, pos0) * invDelta; - } else { - // If position is invariant, just set the coefficients so the quadratic - // always reduces to the end of the span. - invDelta = 0.0f; - middleT = float(span); - middleB = 0.0f; - } - // We only want search for merged gradients up to the minimum of either the - // mid-point or the span length. Cache those offsets here as they don't vary - // in the inner loop. - Float middleEndRadius = fastLength<true>( - pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f}); - float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x; - float endRadius = middleEndRadius.y; - // Convert delta to change in position per chunk. - delta *= 4; - deltaDelta *= 4 * 4; - // clang-format off - // Given current position p and delta d, we reduce: - // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p)) - // where dot(p+d,p+d) can be accumulated as: - // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2) - // = p.p + 2p.d + d.d - // Since p increases by d every loop iteration, p.d increases by d.d, and thus - // we can accumulate d.d to calculate 2p.d, then allowing us to get the next - // dot-product by adding it to dot-product p.p of the prior iteration. This - // saves us some multiplications and an expensive sqrt inside the inner loop. - // clang-format on - Float dotPos = dot(pos, pos); - Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta; - float deltaDelta2 = 2.0f * deltaDelta; - for (int t = 0; t < span;) { - // Compute the gradient table offset from the current position. - Float offset = fastSqrt<true>(dotPos) - radius; - float startRadius = radius; - // If repeat is desired, we need to limit the offset to a fractional value. - if (repeat) { - // The non-repeating radius at which the gradient table actually starts, - // radius + floor(offset) = radius + (offset - fract(offset)). - startRadius += offset.x; - offset = fract(offset); - startRadius -= offset.x; - } - // We need to find the min/max index in the table of the gradient we want to - // use as well as the intercept point where we leave this gradient. - float intercept = -1; - int minIndex = 0; - int maxIndex = int(1.0f + size); - if (offset.x < 0) { - // If inside the inner radius of the gradient table, then use the first - // stop. Set the intercept to advance forward to the start of the gradient - // table. - maxIndex = minIndex; - if (t >= middleT) { - intercept = radius; - } - } else if (offset.x < 1) { - // Otherwise, we're inside the valid part of the gradient table. - minIndex = int(1.0f + offset.x * size); - maxIndex = minIndex; - // Find the offset in the gradient that corresponds to the search limit. - // We only search up to the minimum of either the mid-point or the span - // length. Get the table index that corresponds to this offset, clamped so - // that we avoid hitting the beginning (0) or end (1 + size) of the table. - float searchOffset = - (t >= middleT ? endRadius : middleRadius) - startRadius; - int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size)); - // If we are past the mid-point, walk up the gradient table trying to - // merge stops. 
If we're below the mid-point, we need to walk down the - // table. We note the table index at which we need to look for an - // intercept to determine a valid span. - if (t >= middleT) { - while (maxIndex + 1 <= searchIndex && - stops[maxIndex].can_merge(stops[maxIndex + 1])) { - maxIndex++; - } - intercept = maxIndex + 1; - } else { - while (minIndex - 1 >= searchIndex && - stops[minIndex - 1].can_merge(stops[minIndex])) { - minIndex--; - } - intercept = minIndex; - } - // Convert from a table index into units of radius from the center of the - // gradient. - intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius; - } else { - // If outside the outer radius of the gradient table, then use the last - // stop. Set the intercept to advance toward the valid part of the - // gradient table if going in, or just run to the end of the span if going - // away from the gradient. - minIndex = maxIndex; - if (t < middleT) { - intercept = radius + 1; - } - } - // Solve the quadratic for t to find where the merged gradient ends. If no - // intercept is found, just go to the middle or end of the span. - float endT = t >= middleT ? span : min(span, int(middleT)); - if (intercept >= 0) { - float b = middleB + intercept * intercept * invDelta; - if (b > 0) { - b = fastSqrt<false>(b); - endT = min(endT, t >= middleT ? middleT + b : middleT - b); - } - } - // Figure out how many chunks are actually inside the merged gradient. - if (t + 4.0f <= endT) { - int inside = int(endT - t) & ~3; - // Convert start and end colors to BGRA and scale to 0..255 range later. - auto minColorF = stops[minIndex].startColor.zyxw * 255.0f; - auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f; - // Compute the change in color per change in gradient offset. - auto deltaColorF = - (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex)); - // Subtract off the color difference of the beginning of the current span - // from the beginning of the gradient. - Float colorF = - minColorF - deltaColorF * (startRadius + (minIndex - 1) / size); - // Finally, walk over the span accumulating the position dot product and - // getting its sqrt as an offset into the color ramp. Since we're already - // in BGRA format and scaled to 255, we just need to round to an integer - // and pack down to pixel format. - for (auto* end = buf + inside; buf < end; buf += 4) { - Float offsetG = fastSqrt<false>(dotPos); - commit_blend_span<BLEND>( - buf, - combine( - packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1), - round_pixel(colorF + deltaColorF * offsetG.y, 1)), - packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1), - round_pixel(colorF + deltaColorF * offsetG.w, 1)))); - dotPos += dotPosDelta; - dotPosDelta += deltaDelta2; - } - // Advance past the portion of gradient we just processed. - t += inside; - // If we hit the end of the span, exit out now. - if (t >= span) { - break; - } - // Otherwise, we are most likely in a transitional section of the gradient - // between stops that will likely require doing per-sample table lookups. - // Rather than having to redo all the searching above to figure that out, - // just assume that to be the case and fall through below to doing the - // table lookups to hopefully avoid an iteration. - offset = fastSqrt<true>(dotPos) - radius; - if (repeat) { - offset = fract(offset); - } - } - // If we got here, that means we still have span left to process but did not - // have any whole chunks that fell within a merged gradient. 
Just fall back - // to doing a table lookup for each sample. - Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); - commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry)); - buf += 4; - t += 4; - dotPos += dotPosDelta; - dotPosDelta += deltaDelta2; - } - return true; -} - -// Commits an entire span of a radial gradient similar to -// swglcommitLinearGradient, but given a varying 2D position scaled to -// gradient-space and a radius at which the distance from the origin maps to the -// start of the gradient table. -#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \ - radius) \ - do { \ - bool drawn = false; \ - if (blend_key) { \ - drawn = \ - commitRadialGradient<true>(sampler, address, size, repeat, pos, \ - radius, swgl_OutRGBA8, swgl_SpanLength); \ - } else { \ - drawn = \ - commitRadialGradient<false>(sampler, address, size, repeat, pos, \ - radius, swgl_OutRGBA8, swgl_SpanLength); \ - } \ - if (drawn) { \ - swgl_OutRGBA8 += swgl_SpanLength; \ - swgl_SpanLength = 0; \ - } \ - } while (0) - -// Extension to set a clip mask image to be sampled during blending. The offset -// specifies the positioning of the clip mask image relative to the viewport -// origin. The bounding box specifies the rectangle relative to the clip mask's -// origin that constrains sampling within the clip mask. Blending must be -// enabled for this to work. -static sampler2D swgl_ClipMask = nullptr; -static IntPoint swgl_ClipMaskOffset = {0, 0}; -static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0}; -#define swgl_clipMask(mask, offset, bb_origin, bb_size) \ - do { \ - if (bb_size != vec2_scalar(0.0f, 0.0f)) { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK; \ - swgl_ClipMask = mask; \ - swgl_ClipMaskOffset = make_ivec2(offset); \ - swgl_ClipMaskBounds = \ - IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \ - } \ - } while (0) - -// Extension to enable anti-aliasing for the given edges of a quad. -// Blending must be enable for this to work. -static int swgl_AAEdgeMask = 0; - -static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; } -static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; } -static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) { - return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) | - (mask.w ? 8 : 0); -} - -#define swgl_antiAlias(edges) \ - do { \ - swgl_AAEdgeMask = calcAAEdgeMask(edges); \ - if (swgl_AAEdgeMask) { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_AA; \ - } \ - } while (0) - -#define swgl_blendDropShadow(color) \ - do { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ - swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \ - swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \ - } while (0) - -#define swgl_blendSubpixelText(color) \ - do { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ - swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \ - swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \ - swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8); \ - } while (0) - -// Dispatch helper used by the GLSL translator to swgl_drawSpan functions. -// The number of pixels committed is tracked by checking for the difference in -// swgl_SpanLength. Any varying interpolants used will be advanced past the -// committed part of the span in case the fragment shader must be executed for -// any remaining pixels that were not committed by the span shader. 
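// [Editor's sketch, not SWGL source] Minimal model of the dispatch pattern the
// macro below expands to: the span function consumes some prefix of the span,
// the caller measures how much was committed by the drop in the remaining
// length, and advances the interpolated inputs by that amount so the regular
// per-fragment path can finish whatever is left.
#include <cstdio>

struct FakeShader {
  int swgl_SpanLength = 13;
  int interpStep = 0;
  void drawSpan() { swgl_SpanLength -= swgl_SpanLength & ~3; }  // whole chunks
  void step_interp_inputs(int n) { interpStep += n; }
};

static int dispatchDrawSpan(FakeShader* self) {
  int total = self->swgl_SpanLength;
  self->drawSpan();
  int drawn = total - self->swgl_SpanLength;
  if (drawn) self->step_interp_inputs(drawn);
  return drawn;
}

int main() {
  FakeShader shader;
  printf("drawn=%d remaining=%d\n", dispatchDrawSpan(&shader),
         shader.swgl_SpanLength);
  return 0;
}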
-#define DISPATCH_DRAW_SPAN(self, format) \ - do { \ - int total = self->swgl_SpanLength; \ - self->swgl_drawSpan##format(); \ - int drawn = total - self->swgl_SpanLength; \ - if (drawn) self->step_interp_inputs(drawn); \ - return drawn; \ - } while (0) diff --git a/third_party/webrender/swgl/src/swgl_fns.rs b/third_party/webrender/swgl/src/swgl_fns.rs index fdb55058afe..0cb60c6d4c8 100644 --- a/third_party/webrender/swgl/src/swgl_fns.rs +++ b/third_party/webrender/swgl/src/swgl_fns.rs @@ -14,12 +14,8 @@ macro_rules! debug { ($($x:tt)*) => {}; } -#[repr(C)] -struct LockedTexture { - _private: [u8; 0], -} +extern "C" {} -#[allow(dead_code)] extern "C" { fn ActiveTexture(texture: GLenum); fn BindTexture(target: GLenum, texture: GLuint); @@ -65,7 +61,19 @@ extern "C" { level: GLint, ); fn CheckFramebufferStatus(target: GLenum) -> GLenum; - fn InvalidateFramebuffer(target: GLenum, num_attachments: GLsizei, attachments: *const GLenum); + fn InvalidateFramebuffer( + target: GLenum, + num_attachments: GLsizei, + attachments: *const GLenum, + ); + fn TexStorage3D( + target: GLenum, + levels: GLint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + ); fn TexImage2D( target: GLenum, level: GLint, @@ -77,6 +85,18 @@ extern "C" { ty: GLenum, data: *const c_void, ); + fn TexImage3D( + target: GLenum, + level: GLint, + internal_format: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + border: GLint, + format: GLenum, + ty: GLenum, + data: *const c_void, + ); fn TexSubImage2D( target: GLenum, level: GLint, @@ -88,6 +108,19 @@ extern "C" { ty: GLenum, data: *const c_void, ); + fn TexSubImage3D( + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + format: GLenum, + ty: GLenum, + data: *const c_void, + ); fn GenerateMipmap(target: GLenum); fn GetUniformLocation(program: GLuint, name: *const GLchar) -> GLint; fn BindAttribLocation(program: GLuint, index: GLuint, name: *const GLchar); @@ -119,19 +152,26 @@ extern "C" { transpose: GLboolean, value: *const GLfloat, ); + fn DrawElementsInstanced( mode: GLenum, count: GLsizei, type_: GLenum, - indices: GLintptr, + indices: *const c_void, instancecount: GLsizei, ); fn EnableVertexAttribArray(index: GLuint); fn VertexAttribDivisor(index: GLuint, divisor: GLuint); fn LinkProgram(program: GLuint); - fn GetLinkStatus(program: GLuint) -> GLint; fn UseProgram(program: GLuint); fn SetViewport(x: GLint, y: GLint, width: GLsizei, height: GLsizei); + fn FramebufferTextureLayer( + target: GLenum, + attachment: GLenum, + texture: GLuint, + level: GLint, + layer: GLint, + ); fn FramebufferRenderbuffer( target: GLenum, attachment: GLenum, @@ -145,31 +185,6 @@ extern "C" { fn ClearColor(r: GLfloat, g: GLfloat, b: GLfloat, a: GLfloat); fn ClearDepth(depth: GLdouble); fn Clear(mask: GLbitfield); - fn ClearTexSubImage( - target: GLenum, - level: GLint, - xoffset: GLint, - yoffset: GLint, - zoffset: GLint, - width: GLsizei, - height: GLsizei, - depth: GLsizei, - format: GLenum, - ty: GLenum, - data: *const c_void, - ); - fn ClearTexImage(target: GLenum, level: GLint, format: GLenum, ty: GLenum, data: *const c_void); - fn ClearColorRect( - fbo: GLuint, - xoffset: GLint, - yoffset: GLint, - width: GLsizei, - height: GLsizei, - r: GLfloat, - g: GLfloat, - b: GLfloat, - a: GLfloat, - ); fn PixelStorei(name: GLenum, param: GLint); fn ReadPixels( x: GLint, @@ -210,6 +225,17 @@ extern "C" { width: GLsizei, height: GLsizei, ); + fn CopyTexSubImage3D( + 
target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + ); fn BlitFramebuffer( src_x0: GLint, src_y0: GLint, @@ -227,33 +253,22 @@ extern "C" { fn GetString(name: GLenum) -> *const c_char; fn GetStringi(name: GLenum, index: GLuint) -> *const c_char; fn GetError() -> GLenum; - fn InitDefaultFramebuffer( - x: i32, - y: i32, - width: i32, - height: i32, - stride: i32, - buf: *mut c_void, - ); + fn InitDefaultFramebuffer(width: i32, height: i32); fn GetColorBuffer( fbo: GLuint, flush: GLboolean, width: *mut i32, height: *mut i32, - stride: *mut i32, ) -> *mut c_void; - fn ResolveFramebuffer(fbo: GLuint); fn SetTextureBuffer( tex: GLuint, internal_format: GLenum, width: GLsizei, height: GLsizei, - stride: GLsizei, buf: *mut c_void, min_width: GLsizei, min_height: GLsizei, ); - fn SetTextureParameter(tex: GLuint, pname: GLenum, param: GLint); fn DeleteTexture(n: GLuint); fn DeleteRenderbuffer(n: GLuint); fn DeleteFramebuffer(n: GLuint); @@ -262,64 +277,23 @@ extern "C" { fn DeleteQuery(n: GLuint); fn DeleteShader(shader: GLuint); fn DeleteProgram(program: GLuint); - fn LockFramebuffer(fbo: GLuint) -> *mut LockedTexture; - fn LockTexture(tex: GLuint) -> *mut LockedTexture; - fn LockResource(resource: *mut LockedTexture); - fn UnlockResource(resource: *mut LockedTexture); - fn GetResourceBuffer( - resource: *mut LockedTexture, - width: *mut i32, - height: *mut i32, - stride: *mut i32, - ) -> *mut c_void; fn Composite( - locked_dst: *mut LockedTexture, - locked_src: *mut LockedTexture, + src_id: GLuint, src_x: GLint, src_y: GLint, src_width: GLsizei, src_height: GLsizei, dst_x: GLint, dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, opaque: GLboolean, flip: GLboolean, - filter: GLenum, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, - ); - fn CompositeYUV( - locked_dst: *mut LockedTexture, - locked_y: *mut LockedTexture, - locked_u: *mut LockedTexture, - locked_v: *mut LockedTexture, - color_space: YUVColorSpace, - color_depth: GLuint, - src_x: GLint, - src_y: GLint, - src_width: GLsizei, - src_height: GLsizei, - dst_x: GLint, - dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, - flip: GLboolean, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, ); fn CreateContext() -> *mut c_void; - fn ReferenceContext(ctx: *mut c_void); fn DestroyContext(ctx: *mut c_void); fn MakeCurrent(ctx: *mut c_void); - fn ReportMemory(size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize) -> usize; } -#[derive(Clone, Copy)] +#[derive(Clone)] pub struct Context(*mut c_void); impl Context { @@ -327,12 +301,6 @@ impl Context { Context(unsafe { CreateContext() }) } - pub fn reference(&self) { - unsafe { - ReferenceContext(self.0); - } - } - pub fn destroy(&self) { unsafe { DestroyContext(self.0); @@ -345,56 +313,18 @@ impl Context { } } - pub fn init_default_framebuffer( - &self, - x: i32, - y: i32, - width: i32, - height: i32, - stride: i32, - buf: *mut c_void, - ) { + pub fn init_default_framebuffer(&self, width: i32, height: i32) { unsafe { - InitDefaultFramebuffer(x, y, width, height, stride, buf); + InitDefaultFramebuffer(width, height); } } - pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32, i32) { + pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32) { unsafe { let mut width: i32 = 0; let mut height: i32 = 0; - let mut stride: i32 = 0; - let data_ptr = GetColorBuffer( - fbo, 
- flush as GLboolean, - &mut width, - &mut height, - &mut stride, - ); - (data_ptr, width, height, stride) - } - } - - pub fn resolve_framebuffer(&self, fbo: GLuint) { - unsafe { - ResolveFramebuffer(fbo); - } - } - - pub fn clear_color_rect( - &self, - fbo: GLuint, - xoffset: GLint, - yoffset: GLint, - width: GLsizei, - height: GLsizei, - r: f32, - g: f32, - b: f32, - a: f32, - ) { - unsafe { - ClearColorRect(fbo, xoffset, yoffset, width, height, r, g, b, a); + let data_ptr = GetColorBuffer(fbo, flush as GLboolean, &mut width, &mut height); + (data_ptr, width, height) } } @@ -404,7 +334,6 @@ impl Context { internal_format: GLenum, width: GLsizei, height: GLsizei, - stride: GLsizei, buf: *mut c_void, min_width: GLsizei, min_height: GLsizei, @@ -415,7 +344,6 @@ impl Context { internal_format, width, height, - stride, buf, min_width, min_height, @@ -423,37 +351,32 @@ impl Context { } } - pub fn set_texture_parameter(&self, tex: GLuint, pname: GLenum, param: GLint) { - unsafe { - SetTextureParameter(tex, pname, param); - } - } - - pub fn lock_framebuffer(&self, fbo: GLuint) -> Option<LockedResource> { - unsafe { - let resource = LockFramebuffer(fbo); - if resource != ptr::null_mut() { - Some(LockedResource(resource)) - } else { - None - } - } - } - - pub fn lock_texture(&self, tex: GLuint) -> Option<LockedResource> { + pub fn composite( + &self, + src_id: GLuint, + src_x: GLint, + src_y: GLint, + src_width: GLsizei, + src_height: GLint, + dst_x: GLint, + dst_y: GLint, + opaque: bool, + flip: bool, + ) { unsafe { - let resource = LockTexture(tex); - if resource != ptr::null_mut() { - Some(LockedResource(resource)) - } else { - None - } + Composite( + src_id, + src_x, + src_y, + src_width, + src_height, + dst_x, + dst_y, + opaque as GLboolean, + flip as GLboolean, + ); } } - - pub fn report_memory(size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize) -> usize { - unsafe { ReportMemory(size_of_op) } - } } impl From<*mut c_void> for Context { @@ -488,7 +411,6 @@ fn calculate_length(width: GLsizei, height: GLsizei, format: GLenum, pixel_type: UNSIGNED_SHORT => 2, SHORT => 2, FLOAT => 4, - UNSIGNED_INT_8_8_8_8_REV => 1, _ => panic!("unsupported pixel_type for read_pixels: {:?}", pixel_type), }; @@ -563,8 +485,8 @@ impl Gl for Context { let u = str::from_utf8(s).unwrap(); const PREFIX: &'static str = "// shader: "; if let Some(start) = u.find(PREFIX) { - if let Some(end) = u[start..].find('\n') { - let name = u[start + PREFIX.len()..start + end].trim(); + if let Some(end) = u[start ..].find('\n') { + let name = u[start + PREFIX.len() .. start + end].trim(); debug!("shader name: {}", name); unsafe { let c_string = CString::new(name).unwrap(); @@ -1033,6 +955,7 @@ impl Gl for Context { panic!(); } + // FIXME: Does not verify buffer size -- unsafe! 
fn tex_image_3d( &self, target: GLenum, @@ -1046,7 +969,24 @@ impl Gl for Context { ty: GLenum, opt_data: Option<&[u8]>, ) { - panic!(); + unsafe { + let pdata = match opt_data { + Some(data) => data.as_ptr() as *const GLvoid, + None => ptr::null(), + }; + TexImage3D( + target, + level, + internal_format, + width, + height, + depth, + border, + format, + ty, + pdata, + ); + } } fn copy_tex_image_2d( @@ -1091,7 +1031,11 @@ impl Gl for Context { width: GLsizei, height: GLsizei, ) { - panic!(); + unsafe { + CopyTexSubImage3D( + target, level, xoffset, yoffset, zoffset, x, y, width, height, + ); + } } fn tex_sub_image_2d( @@ -1173,7 +1117,22 @@ impl Gl for Context { data: &[u8], ) { debug!("tex_sub_image_3d"); - panic!(); + //panic!(); + unsafe { + TexSubImage3D( + target, + level, + xoffset, + yoffset, + zoffset, + width, + height, + depth, + format, + ty, + data.as_ptr() as *const c_void, + ); + } } fn tex_sub_image_3d_pbo( @@ -1190,7 +1149,21 @@ impl Gl for Context { ty: GLenum, offset: usize, ) { - panic!(); + unsafe { + TexSubImage3D( + target, + level, + xoffset, + yoffset, + zoffset, + width, + height, + depth, + format, + ty, + offset as *const c_void, + ); + } } fn tex_storage_2d( @@ -1216,7 +1189,10 @@ impl Gl for Context { height: GLsizei, depth: GLsizei, ) { - panic!(); + //panic!(); + unsafe { + TexStorage3D(target, levels, internal_format, width, height, depth); + } } fn get_tex_image_into_buffer( @@ -1376,7 +1352,10 @@ impl Gl for Context { "framebuffer_texture_layer {} {} {} {} {}", target, attachment, texture, level, layer ); - panic!(); + //panic!(); + unsafe { + FramebufferTextureLayer(target, attachment, texture, level, layer); + } } fn blit_framebuffer( @@ -1498,9 +1477,7 @@ impl Gl for Context { } fn draw_arrays(&self, mode: GLenum, first: GLint, count: GLsizei) { - unsafe { - DrawElementsInstanced(mode, count, NONE, first as GLintptr, 1); - } + panic!(); } fn draw_arrays_instanced( @@ -1510,9 +1487,7 @@ impl Gl for Context { count: GLsizei, primcount: GLsizei, ) { - unsafe { - DrawElementsInstanced(mode, count, NONE, first as GLintptr, primcount); - } + panic!(); } fn draw_elements( @@ -1528,7 +1503,13 @@ impl Gl for Context { ); //panic!(); unsafe { - DrawElementsInstanced(mode, count, element_type, indices_offset as GLintptr, 1); + DrawElementsInstanced( + mode, + count, + element_type, + indices_offset as *const c_void, + 1, + ); } } @@ -1550,7 +1531,7 @@ impl Gl for Context { mode, count, element_type, - indices_offset as GLintptr, + indices_offset as *const c_void, primcount, ); } @@ -1843,8 +1824,8 @@ impl Gl for Context { } fn get_program_info_log(&self, program: GLuint) -> String { - debug!("get_program_info_log {}", program); - String::new() + panic!(); + //String::new() } #[inline] @@ -1854,7 +1835,7 @@ impl Gl for Context { assert!(!result.is_empty()); //#define GL_LINK_STATUS 0x8B82 if pname == 0x8b82 { - result[0] = GetLinkStatus(program); + result[0] = 1; } } @@ -2118,7 +2099,7 @@ impl Gl for Context { //ptr::null() } - fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) -> GLenum { + fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) { panic!(); } @@ -2191,7 +2172,7 @@ impl Gl for Context { // GL_KHR_blend_equation_advanced fn blend_barrier_khr(&self) { - // No barrier required, so nothing to do + panic!(); } // GL_CHROMIUM_copy_texture @@ -2269,158 +2250,4 @@ impl Gl for Context { ) { unimplemented!("Not supported by SWGL"); } - - fn buffer_storage( - &self, - target: GLenum, - size: GLsizeiptr, - 
data: *const GLvoid, - flags: GLbitfield, - ) { - unimplemented!("Not supported by SWGL"); - } - - fn flush_mapped_buffer_range(&self, target: GLenum, offset: GLintptr, length: GLsizeiptr) { - unimplemented!("Not supported by SWGL"); - } -} - -/// A resource that is intended for sharing between threads. -/// Locked resources such as textures or framebuffers will -/// not allow any further modifications while it remains -/// locked. The resource will be unlocked when LockedResource -/// is dropped. -pub struct LockedResource(*mut LockedTexture); - -unsafe impl Send for LockedResource {} -unsafe impl Sync for LockedResource {} - -#[repr(C)] -pub enum YUVColorSpace { - Rec601 = 0, - Rec709, - Rec2020, - Identity, -} - -impl LockedResource { - /// Composites from a locked resource to another locked resource. The band - /// offset and height are relative to the destination rectangle and specify - /// how to clip the composition into appropriate range for this band. - pub fn composite( - &self, - locked_src: &LockedResource, - src_x: GLint, - src_y: GLint, - src_width: GLsizei, - src_height: GLsizei, - dst_x: GLint, - dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, - opaque: bool, - flip: bool, - filter: GLenum, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, - ) { - unsafe { - Composite( - self.0, - locked_src.0, - src_x, - src_y, - src_width, - src_height, - dst_x, - dst_y, - dst_width, - dst_height, - opaque as GLboolean, - flip as GLboolean, - filter, - clip_x, - clip_y, - clip_width, - clip_height, - ); - } - } - - /// Composites from locked resources representing YUV planes - pub fn composite_yuv( - &self, - locked_y: &LockedResource, - locked_u: &LockedResource, - locked_v: &LockedResource, - color_space: YUVColorSpace, - color_depth: GLuint, - src_x: GLint, - src_y: GLint, - src_width: GLsizei, - src_height: GLsizei, - dst_x: GLint, - dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, - flip: bool, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, - ) { - unsafe { - CompositeYUV( - self.0, - locked_y.0, - locked_u.0, - locked_v.0, - color_space, - color_depth, - src_x, - src_y, - src_width, - src_height, - dst_x, - dst_y, - dst_width, - dst_height, - flip as GLboolean, - clip_x, - clip_y, - clip_width, - clip_height, - ); - } - } - - /// Get the underlying buffer for a locked resource - pub fn get_buffer(&self) -> (*mut c_void, i32, i32, i32) { - unsafe { - let mut width: i32 = 0; - let mut height: i32 = 0; - let mut stride: i32 = 0; - let data_ptr = GetResourceBuffer(self.0, &mut width, &mut height, &mut stride); - (data_ptr, width, height, stride) - } - } -} - -impl Clone for LockedResource { - fn clone(&self) -> Self { - unsafe { - LockResource(self.0); - } - LockedResource(self.0) - } -} - -impl Drop for LockedResource { - fn drop(&mut self) { - unsafe { - UnlockResource(self.0); - } - } } diff --git a/third_party/webrender/swgl/src/texture.h b/third_party/webrender/swgl/src/texture.h index fdace241eb5..0219d078bcf 100644 --- a/third_party/webrender/swgl/src/texture.h +++ b/third_party/webrender/swgl/src/texture.h @@ -2,884 +2,19 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ -namespace glsl { - -using PackedRGBA8 = V16<uint8_t>; -using WideRGBA8 = V16<uint16_t>; -using HalfRGBA8 = V8<uint16_t>; - -SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } - -template <int N> -UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) { - typedef VectorType<uint8_t, N> packed_type; - // Generic conversions only mask off the low byte without actually clamping - // like a real pack. First force the word to all 1s if it overflows, and then - // add on the sign bit to cause it to roll over to 0 if it was negative. - p = (p | (p > 255)) + (p >> 15); - return CONVERT(p, packed_type); -} - -SI PackedRGBA8 pack(WideRGBA8 p) { -#if USE_SSE2 - return _mm_packus_epi16(lowHalf(p), highHalf(p)); -#elif USE_NEON - return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p))); -#else - return genericPackWide(p); -#endif -} - -using PackedR8 = V4<uint8_t>; -using WideR8 = V4<uint16_t>; - -SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } - -SI PackedR8 pack(WideR8 p) { -#if USE_SSE2 - auto m = expand(p); - auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m)); - return SHUFFLE(r, r, 0, 1, 2, 3); -#elif USE_NEON - return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p)))); -#else - return genericPackWide(p); -#endif -} - -using PackedRG8 = V8<uint8_t>; -using WideRG8 = V8<uint16_t>; - -SI PackedRG8 pack(WideRG8 p) { -#if USE_SSE2 - return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p))); -#elif USE_NEON - return bit_cast<V8<uint8_t>>(vqmovn_u16(p)); -#else - return genericPackWide(p); -#endif -} - -SI I32 clampCoord(I32 coord, int limit, int base = 0) { -#if USE_SSE2 - return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)), - _mm_set1_epi32(limit - 1)); -#else - return clamp(coord, base, limit - 1); -#endif -} - -SI int clampCoord(int coord, int limit, int base = 0) { - return min(max(coord, base), limit - 1); -} - -template <typename T, typename S> -SI T clamp2D(T P, S sampler) { - return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)}; -} - -SI float to_float(uint32_t x) { return x * (1.f / 255.f); } - -SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - U32 pixels = {a, b, c, d}; - return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF), - cast(pixels & 0xFF), cast(pixels >> 24)) * - (1.0f / 255.0f); -} - -SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) { - return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y}, - Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w}); -} - -SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) { - return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y}, - I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w}); -} - -SI vec4_scalar pixel_to_vec4(uint32_t p) { - U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24}; - Float f = cast(i) * (1.0f / 255.0f); - return vec4_scalar(f.x, f.y, f.z, f.w); -} - -template <typename S> -SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) { - return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y], - sampler->buf[offset.z], sampler->buf[offset.w]); -} - -template <typename S> -vec4 texelFetchRGBA8(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return fetchOffsetsRGBA8(sampler, offset); -} - -template <typename S> -SI Float fetchOffsetsR8(S sampler, I32 offset) { - U32 i = { - ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y], - ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]}; - 
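// [Editor's sketch, not SWGL source] Scalar illustration of the branchless
// clamp used by genericPackWide above: lanes that overflow 255 are forced to
// all ones, and lanes whose sign bit is set (negative intermediates) roll over
// to zero, so keeping only the low byte behaves like a saturating pack.
#include <cassert>
#include <cstdint>

static uint8_t packSat(uint16_t p) {
  uint16_t mask = (p > 255) ? 0xFFFF : 0;  // vector compare yields all ones
  p = uint16_t((p | mask) + (p >> 15));    // overflow -> 0xFFFF, negative -> 0
  return uint8_t(p);                       // keep the low byte
}

int main() {
  assert(packSat(0) == 0);
  assert(packSat(200) == 200);
  assert(packSat(300) == 255);                  // clamped high
  assert(packSat(uint16_t(int16_t(-5))) == 0);  // clamped low
  return 0;
}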
return cast(i) * (1.0f / 255.0f); -} - -template <typename S> -vec4 texelFetchR8(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); -} - -template <typename S> -SI vec4 fetchOffsetsRG8(S sampler, I32 offset) { - uint16_t* buf = (uint16_t*)sampler->buf; - U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]}; - Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f); - Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f); - return vec4(r, g, 0.0f, 1.0f); -} - -template <typename S> -vec4 texelFetchRG8(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return fetchOffsetsRG8(sampler, offset); -} - template <typename S> -SI Float fetchOffsetsR16(S sampler, I32 offset) { - U32 i = { - ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y], - ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]}; - return cast(i) * (1.0f / 65535.0f); -} - -template <typename S> -vec4 texelFetchR16(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f); -} - -template <typename S> -SI vec4 fetchOffsetsFloat(S sampler, I32 offset) { - return pixel_float_to_vec4( - *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y], - *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]); -} - -vec4 texelFetchFloat(sampler2D sampler, ivec2 P) { - I32 offset = P.x * 4 + P.y * sampler->stride; - return fetchOffsetsFloat(sampler, offset); -} - -template <typename S> -SI vec4 fetchOffsetsYUV422(S sampler, I32 offset) { - // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. - // Offset is aligned to a chunk rather than a pixel, and selector specifies - // pixel within the chunk. 
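// [Editor's sketch, not SWGL source] Scalar illustration of the YUV422 chunk
// decode used by the fetches below: each 4-byte chunk holds two pixels packed
// as G0, B, G1, R, so a pixel index splits into a chunk offset plus a selector
// choosing which green sample to read; B and R are shared by both pixels.
#include <cstdint>
#include <cstdio>

int main() {
  // Bytes in memory are G0, B, G1, R; loaded little-endian that is 0xRRG1BBG0.
  uint32_t chunk = 0xD0C0B0A0u;  // G0=0xA0, B=0xB0, G1=0xC0, R=0xD0
  for (int pixel = 0; pixel < 2; ++pixel) {
    int selector = pixel & 1;                               // pixel in chunk
    uint8_t b = (chunk >> 8) & 0xFF;                        // shared B
    uint8_t r = (chunk >> 24) & 0xFF;                       // shared R
    uint8_t g = (selector ? (chunk >> 16) : chunk) & 0xFF;  // G1 or G0
    printf("pixel %d: g=%02X b=%02X r=%02X\n", pixel, unsigned(g), unsigned(b),
           unsigned(r));
  }
  return 0;
}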
- I32 selector = offset & 1; - offset &= ~1; - uint16_t* buf = (uint16_t*)sampler->buf; - U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y], - *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]}; - Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f); - Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f); - Float g = - CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) * - (1.0f / 255.0f); - return vec4(r, g, b, 1.0f); -} - -template <typename S> -vec4 texelFetchYUV422(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return fetchOffsetsYUV422(sampler, offset); -} - -vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - switch (sampler->format) { - case TextureFormat::RGBA32F: - return texelFetchFloat(sampler, P); - case TextureFormat::RGBA8: - return texelFetchRGBA8(sampler, P); - case TextureFormat::R8: - return texelFetchR8(sampler, P); - case TextureFormat::RG8: - return texelFetchRG8(sampler, P); - case TextureFormat::R16: - return texelFetchR16(sampler, P); - case TextureFormat::YUV422: - return texelFetchYUV422(sampler, P); - default: - assert(false); - return vec4(); - } -} - -vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32F); - return texelFetchFloat(sampler, P); -} - -vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); +static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i, int zoffset) { assert(sampler->format == TextureFormat::RGBA8); - return texelFetchRGBA8(sampler, P); -} - -vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::R8); - return texelFetchR8(sampler, P); -} - -vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RG8); - return texelFetchRG8(sampler, P); -} - -vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - if (sampler->format == TextureFormat::RGBA32F) { - return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; - } else { - assert(sampler->format == TextureFormat::RGBA8); - return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); - } -} - -vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32F); - return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA8); - return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); -} - -vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::R8); - return vec4_scalar{ - to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f, - 0.0f, 1.0f}; -} - -vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RG8); - uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride]; - return vec4_scalar{to_float(pixel & 0xFF), 
to_float(pixel >> 8), 0.0f, 1.0f}; -} - -vec4 texelFetch(sampler2DRect sampler, ivec2 P) { - P = clamp2D(P, sampler); - switch (sampler->format) { - case TextureFormat::RGBA8: - return texelFetchRGBA8(sampler, P); - case TextureFormat::R8: - return texelFetchR8(sampler, P); - case TextureFormat::RG8: - return texelFetchRG8(sampler, P); - case TextureFormat::R16: - return texelFetchR16(sampler, P); - case TextureFormat::YUV422: - return texelFetchYUV422(sampler, P); - default: - assert(false); - return vec4(); - } -} - -template <typename S> -SI ivec4 fetchOffsetsInt(S sampler, I32 offset) { - return pixel_int_to_ivec4( - *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y], - *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]); -} - -ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32I); - I32 offset = P.x * 4 + P.y * sampler->stride; - return fetchOffsetsInt(sampler, offset); -} - -ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32I); - return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x, - int max_x, int min_y, int max_y) { - P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); - P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); - assert(sampler->format == TextureFormat::RGBA32F); - return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x, - int max_x, int min_y, int max_y) { - P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); - P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); - assert(sampler->format == TextureFormat::RGBA32I); - return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -template <typename S> -SI I32 texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x, int min_y, - int max_y) { - P.x = clampCoord(P.x, int(sampler->width) - max_x, -min_x); - P.y = clampCoord(P.y, int(sampler->height) - max_y, -min_y); - return P.x * 4 + P.y * sampler->stride; -} - -template <typename S, typename P> -SI P texelFetchUnchecked(S sampler, P* ptr, int x, int y = 0) { - return ptr[x + y * (sampler->stride >> 2)]; -} - -SI vec4 texelFetchUnchecked(sampler2D sampler, I32 offset, int x, int y = 0) { - assert(sampler->format == TextureFormat::RGBA32F); - return fetchOffsetsFloat(sampler, offset + (x * 4 + y * sampler->stride)); -} - -SI ivec4 texelFetchUnchecked(isampler2D sampler, I32 offset, int x, int y = 0) { - assert(sampler->format == TextureFormat::RGBA32I); - return fetchOffsetsInt(sampler, offset + (x * 4 + y * sampler->stride)); -} - -#define texelFetchOffset(sampler, P, lod, offset) \ - texelFetch(sampler, (P) + (offset), lod) - -// Scale texture coords for quantization, subtract offset for filtering -// (assuming coords already offset to texel centers), and round to nearest -// 1/scale increment -template <typename T> -SI T linearQuantize(T P, float scale) { - return P * scale + (0.5f - 0.5f * scale); -} - -// Helper version that also scales normalized texture coords for sampler -template <typename T, typename S> -SI T samplerScale(S sampler, T P) { - P.x *= sampler->width; - P.y *= sampler->height; - return P; -} - -template <typename T> -SI T samplerScale(UNUSED sampler2DRect sampler, T P) 
{ - return P; -} - -template <typename T, typename S> -SI T linearQuantize(T P, float scale, S sampler) { - return linearQuantize(samplerScale(sampler, P), scale); -} - -// Compute clamped offset of first row for linear interpolation -template <typename S, typename I> -SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) { - return clampCoord(i.x, sampler->width - margin) + - clampCoord(i.y, sampler->height) * sampler->stride; -} - -// Compute clamped offset of second row for linear interpolation from first row -template <typename S, typename I> -SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) { - return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1, - sampler->stride, 0); -} - -// Convert X coordinate to a 2^7 scale fraction for interpolation -template <typename S> -SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) { - auto overread = i.x > int32_t(sampler->width) - 2; - return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16); -} - -// Convert Y coordinate to a 2^7 scale fraction for interpolation -SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); } -SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); } - -struct WidePlanarRGBA8 { - V8<uint16_t> rg; - V8<uint16_t> ba; -}; - -template <typename S> -SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RGBA8); - - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); - - auto a0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>); - auto a1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>); - a0 += ((a1 - a0) * fracy.x) >> 7; - - auto b0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>); - auto b1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>); - b0 += ((b1 - b0) * fracy.y) >> 7; - - auto abl = zipLow(a0, b0); - auto abh = zipHigh(a0, b0); - abl += ((abh - abl) * fracx.xyxyxyxy) >> 7; - - auto c0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>); - auto c1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>); - c0 += ((c1 - c0) * fracy.z) >> 7; - - auto d0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>); - auto d1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>); - d0 += ((d1 - d0) * fracy.w) >> 7; - - auto cdl = zipLow(c0, d0); - auto cdh = zipHigh(c0, d0); - cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7; - - auto rg = V8<uint16_t>(zip2Low(abl, cdl)); - auto ba = V8<uint16_t>(zip2High(abl, cdl)); - return WidePlanarRGBA8{rg, ba}; -} - -template <typename S> -vec4 textureLinearRGBA8(S sampler, vec2 P) { - ivec2 i(linearQuantize(P, 128, sampler)); - auto planar = textureLinearPlanarRGBA8(sampler, i); - auto rg = CONVERT(planar.rg, V8<float>); - auto ba = CONVERT(planar.ba, V8<float>); - auto r = lowHalf(rg); - auto g = highHalf(rg); - auto b = lowHalf(ba); - auto a = highHalf(ba); - return vec4(b, g, r, a) * (1.0f / 255.0f); -} - -template <typename S> -static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::R8); - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - 
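// [Editor's sketch, not SWGL source] Scalar illustration of the 7-bit
// fixed-point bilinear blend these samplers build on: coordinates are
// quantized to 1/128ths of a texel, the low 7 bits become the interpolation
// fraction, and each lerp is a + (((b - a) * frac) >> 7).
#include <cstdio>

static int lerp7(int a, int b, int frac) {  // frac in [0, 127]
  return a + (((b - a) * frac) >> 7);
}

int main() {
  int coord = (5 << 7) + 96;  // texel 5 plus 96/128 (0.75) of a texel
  int i = coord >> 7;         // integer texel index
  int frac = coord & 0x7F;    // 7-bit fraction

  // Blend neighbouring 8-bit samples horizontally on two rows, then blend
  // the rows vertically with a fraction of 64 (0.5).
  int row0 = lerp7(/*texel i*/ 10, /*texel i+1*/ 200, frac);
  int row1 = lerp7(100, 220, frac);
  int result = lerp7(row0, row1, 64);
  printf("texel=%d frac=%d bilinear=%d\n", i, frac, result);
  return 0;
}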
I16 fracy = computeFracY(frac); - - uint8_t* buf = (uint8_t*)sampler->buf; - auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]); - auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]); - auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>); - - auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]); - auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]); - auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>); - - abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7; - - abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); - auto abcdl = lowHalf(abcd0); - auto abcdh = highHalf(abcd0); - abcdl += ((abcdh - abcdl) * fracx) >> 7; - - return U16(abcdl); -} - -template <typename S> -vec4 textureLinearR8(S sampler, vec2 P) { - assert(sampler->format == TextureFormat::R8); - - ivec2 i(linearQuantize(P, 128, sampler)); - Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float); - return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f); -} - -struct WidePlanarRG8 { - V8<uint16_t> rg; -}; - -template <typename S> -SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RG8); - - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); - - uint16_t* buf = (uint16_t*)sampler->buf; - - // Load RG bytes for two adjacent pixels - rgRG - auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]); - auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>); - // Load two pixels for next row - auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]); - auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>); - // Blend rows - ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7; - - auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]); - auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>); - auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]); - auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>); - // Blend rows - cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7; - - // ab = a.rgRG,b.rgRG - // cd = c.rgRG,d.rgRG - // ... ac = ar,cr,ag,cg,aR,cR,aG,cG - // ... bd = br,dr,bg,dg,bR,dR,bG,dG - auto ac = zipLow(ab0, cd0); - auto bd = zipHigh(ab0, cd0); - // ar,br,cr,dr,ag,bg,cg,dg - // aR,bR,cR,dR,aG,bG,cG,dG - auto abcdl = zipLow(ac, bd); - auto abcdh = zipHigh(ac, bd); - // Blend columns - abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7; - - auto rg = V8<uint16_t>(abcdl); - return WidePlanarRG8{rg}; -} - -template <typename S> -vec4 textureLinearRG8(S sampler, vec2 P) { - ivec2 i(linearQuantize(P, 128, sampler)); - auto planar = textureLinearPlanarRG8(sampler, i); - auto rg = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f); - auto r = lowHalf(rg); - auto g = highHalf(rg); - return vec4(r, g, 0.0f, 1.0f); -} - -// Samples R16 texture with linear filtering and returns results packed as -// signed I16. One bit of precision is shifted away from the bottom end to -// accommodate the sign bit, so only 15 bits of precision is left. 
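// [Editor's sketch, not SWGL source] Scalar illustration of the signed 15-bit
// blend used by the R16 path below: samples and fractions are kept as Q15
// values, the high half of the 32-bit product (>> 16) gives a Q14 result, and
// one doubling shift restores Q15. This mirrors the SSE2 path
// (_mm_mulhi_epi16 plus a left shift); the NEON vqrdmulhq_s16 instruction
// folds the doubling into the multiply.
#include <cstdint>
#include <cstdio>

static int16_t lerpQ15(int16_t a, int16_t b, int16_t fracQ15) {
  int32_t diff = int32_t(b) - int32_t(a);
  int32_t hi = (diff * fracQ15) >> 16;  // high half of the product, Q14
  return int16_t(a + (hi << 1));        // doubled back to Q15
}

int main() {
  // 0x4000 is 0.5 in Q15, so blending 0 toward 0x7FFE lands at 0x3FFE.
  printf("0x%04X\n", unsigned(uint16_t(lerpQ15(0, 0x7FFE, 0x4000))));
  return 0;
}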
-template <typename S> -static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::R16); - - ivec2 frac = i; + ivec2 frac = i & 0x7F; i >>= 7; - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - + I32 row0 = clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset; + I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); I16 fracx = - CONVERT( - ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F, - I16) - << 8; - I16 fracy = computeFracY(frac) << 8; - - // Sample the 16 bit data for both rows - uint16_t* buf = (uint16_t*)sampler->buf; - auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]); - auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]); - auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>); - - auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]); - auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]); - auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>); - - // The samples occupy 15 bits and the fraction occupies 15 bits, so that when - // they are multiplied together, the new scaled sample will fit in the high - // 14 bits of the result. It is left shifted once to make it 15 bits again - // for the final multiply. -#if USE_SSE2 - abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww)) - << 1; -#elif USE_NEON - // NEON has a convenient instruction that does both the multiply and the - // doubling, so doesn't need an extra shift. 
- abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww)); -#else - abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) * - CONVERT(fracy.xxyyzzww, V8<int32_t>)) >> - 16, - V8<int16_t>) - << 1; -#endif - - abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); - auto abcdl = lowHalf(abcd0); - auto abcdh = highHalf(abcd0); -#if USE_SSE2 - abcdl += lowHalf(bit_cast<V8<int16_t>>( - _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx)))) - << 1; -#elif USE_NEON - abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx)); -#else - abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) * - CONVERT(fracx, V4<int32_t>)) >> - 16, - V4<int16_t>) - << 1; -#endif - - return abcdl; -} - -template <typename S> -vec4 textureLinearR16(S sampler, vec2 P) { - assert(sampler->format == TextureFormat::R16); - - ivec2 i(linearQuantize(P, 128, sampler)); - Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float); - return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f); -} - -using PackedRGBA32F = V16<float>; -using WideRGBA32F = V16<float>; - -template <typename S> -vec4 textureLinearRGBA32F(S sampler, vec2 P) { - assert(sampler->format == TextureFormat::RGBA32F); - P = samplerScale(sampler, P); - P -= 0.5f; - vec2 f = floor(P); - vec2 r = P - f; - ivec2 i(f); - ivec2 c(clampCoord(i.x, sampler->width - 1), - clampCoord(i.y, sampler->height)); - r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0), - 0.0f); - I32 offset0 = c.x * 4 + c.y * sampler->stride; - I32 offset1 = offset0 + computeNextRowOffset(sampler, i); - - Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x], - *(Float*)&sampler->buf[offset0.x + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.x], - *(Float*)&sampler->buf[offset1.x + 4], r.x), - r.y); - Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y], - *(Float*)&sampler->buf[offset0.y + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.y], - *(Float*)&sampler->buf[offset1.y + 4], r.x), - r.y); - Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z], - *(Float*)&sampler->buf[offset0.z + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.z], - *(Float*)&sampler->buf[offset1.z + 4], r.x), - r.y); - Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w], - *(Float*)&sampler->buf[offset0.w + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.w], - *(Float*)&sampler->buf[offset1.w + 4], r.x), - r.y); - return pixel_float_to_vec4(c0, c1, c2, c3); -} - -struct WidePlanarYUV8 { - U16 y; - U16 u; - U16 v; -}; - -template <typename S> -SI WidePlanarYUV8 textureLinearPlanarYUV422(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::YUV422); - - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i, 2); - // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. - // Get the selector for the pixel within the chunk. - I32 selector = row0 & 1; - // Align the row index to the chunk. - row0 &= ~1; - I32 row1 = row0 + computeNextRowOffset(sampler, i); - // G only needs to be clamped to a pixel boundary for safe interpolation, - // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk - // boundary. - frac.x &= (i.x >= 0); - auto fracx = - CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3), - (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) & - 0x7F, - V8<int16_t>); - I16 fracy = computeFracY(frac); - - uint16_t* buf = (uint16_t*)sampler->buf; - - // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R - // We always need to interpolate between (b,r) and (B,R). 
- // Depending on selector we need to either interpolate between g0 and g1 - // or between g1 and G0. So for now we just interpolate both cases for g - // and will select the appropriate one on output. - auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>); - auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>); - // Combine with next row. - a0 += ((a1 - a0) * fracy.x) >> 7; - - auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>); - auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>); - b0 += ((b1 - b0) * fracy.y) >> 7; - - auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>); - auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>); - c0 += ((c1 - c0) * fracy.z) >> 7; - - auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>); - auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>); - d0 += ((d1 - d0) * fracy.w) >> 7; - - // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and - // g1,g1,g1,g1,r,r,r,r. - auto abl = zipLow(a0, b0); - auto cdl = zipLow(c0, d0); - auto g0b = zip2Low(abl, cdl); - auto g1r = zip2High(abl, cdl); - - // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and - // and shifts, just shuffle here instead... We finally end up with - // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R. - auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15); - auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15); - auto g1B = zip2Low(abh, cdh); - auto G0R = zip2High(abh, cdh); - - // Finally interpolate between adjacent columns. - g0b += ((g1B - g0b) * fracx) >> 7; - g1r += ((G0R - g1r) * fracx) >> 7; - - // Choose either g0 or g1 based on selector. - return WidePlanarYUV8{ - U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))), - U16(highHalf(g0b)), U16(highHalf(g1r))}; -} - -template <typename S> -vec4 textureLinearYUV422(S sampler, vec2 P) { - ivec2 i(linearQuantize(P, 128, sampler)); - auto planar = textureLinearPlanarYUV422(sampler, i); - auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f); - auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f); - auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f); - return vec4(v, y, u, 1.0f); -} - -SI vec4 texture(sampler2D sampler, vec2 P) { - if (sampler->filter == TextureFilter::LINEAR) { - switch (sampler->format) { - case TextureFormat::RGBA32F: - return textureLinearRGBA32F(sampler, P); - case TextureFormat::RGBA8: - return textureLinearRGBA8(sampler, P); - case TextureFormat::R8: - return textureLinearR8(sampler, P); - case TextureFormat::RG8: - return textureLinearRG8(sampler, P); - case TextureFormat::R16: - return textureLinearR16(sampler, P); - case TextureFormat::YUV422: - return textureLinearYUV422(sampler, P); - default: - assert(false); - return vec4(); - } - } else { - ivec2 coord(roundzero(P.x, sampler->width), - roundzero(P.y, sampler->height)); - return texelFetch(sampler, coord, 0); - } -} - -vec4 texture(sampler2DRect sampler, vec2 P) { - if (sampler->filter == TextureFilter::LINEAR) { - switch (sampler->format) { - case TextureFormat::RGBA8: - return textureLinearRGBA8(sampler, P); - case TextureFormat::R8: - return textureLinearR8(sampler, P); - case TextureFormat::RG8: - return textureLinearRG8(sampler, P); - case TextureFormat::R16: - return textureLinearR16(sampler, P); - case TextureFormat::YUV422: - return textureLinearYUV422(sampler, P); - default: - assert(false); - return vec4(); - } - } else { - ivec2 
coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f)); - return texelFetch(sampler, coord); - } -} - -template <typename S> -vec4_scalar texture(S sampler, vec2_scalar P) { - return force_scalar(texture(sampler, vec2(P))); -} - -ivec2_scalar textureSize(sampler2D sampler, int) { - return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; -} - -ivec2_scalar textureSize(sampler2DRect sampler) { - return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; -} - -template <typename S> -static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RGBA8); - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); + CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16); + I16 fracy = CONVERT(frac.y, I16); auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>); @@ -913,233 +48,80 @@ static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) { auto cdh = combine(highHalf(c0), highHalf(d0)); cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7; - return combine(HalfRGBA8(abl), HalfRGBA8(cdl)); + return pack(combine(HalfRGBA8(abl), HalfRGBA8(cdl))); } template <typename S> -static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) { - return pack(textureLinearUnpackedRGBA8(sampler, i)); +static inline void textureLinearCommit4(S sampler, ivec2 i, int zoffset, + uint32_t* buf) { + commit_span(buf, textureLinearPackedRGBA8(sampler, i, zoffset)); } template <typename S> -static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) { +static void textureLinearCommit8(S sampler, ivec2_scalar i, int zoffset, + uint32_t* buf) { assert(sampler->format == TextureFormat::RGBA8); - I32 row = computeRow(sampler, i, 0); - return combine(unaligned_load<V4<uint8_t>>(&sampler->buf[row.x]), - unaligned_load<V4<uint8_t>>(&sampler->buf[row.y]), - unaligned_load<V4<uint8_t>>(&sampler->buf[row.z]), - unaligned_load<V4<uint8_t>>(&sampler->buf[row.w])); -} - -template <typename S> -static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) { - return pack(textureLinearUnpackedR8(sampler, i)); -} - -template <typename S> -static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RG8); - ivec2 frac = i & 0x7F; + ivec2_scalar frac = i & 0x7F; i >>= 7; - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); - - uint16_t* buf = (uint16_t*)sampler->buf; - - // Load RG bytes for two adjacent pixels - rgRG - auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]); - auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>); - // Load two pixels for next row - auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]); - auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>); - // Blend rows - ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7; - - auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]); - auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>); - auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]); - auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>); - // Blend rows - cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7; - - // ab = 
a.rgRG,b.rgRG - // cd = c.rgRG,d.rgRG - // ... ac = a.rg,c.rg,a.RG,c.RG - // ... bd = b.rg,d.rg,b.RG,d.RG - auto ac = zip2Low(ab0, cd0); - auto bd = zip2High(ab0, cd0); - // a.rg,b.rg,c.rg,d.rg - // a.RG,b.RG,c.RG,d.RG - auto abcdl = zip2Low(ac, bd); - auto abcdh = zip2High(ac, bd); - // Blend columns - abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7; - - return WideRG8(abcdl); -} - -template <typename S> -static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) { - return pack(textureLinearUnpackedRG8(sampler, i)); -} - -template <int N> -static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x, - VectorType<uint16_t, N> y) { - auto r = x + y; - return r | (r < x); -} - -template <typename P, typename S> -static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal( - S sampler, const ivec2_scalar& i, int minX, int maxX, int radius, - float coeff, float coeffStep) { - // Packed and unpacked vectors for a chunk of the given pixel type. - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - - // Pre-scale the coefficient by 8 bits of fractional precision, so that when - // the sample is multiplied by it, it will yield a 16 bit unsigned integer - // that will use all 16 bits of precision to accumulate the sum. - coeff *= 1 << 8; - float coeffStep2 = coeffStep * coeffStep; - - int row = computeRow(sampler, i); - P* buf = (P*)sampler->buf; - auto pixelsRight = unaligned_load<V4<P>>(&buf[row]); - auto pixelsLeft = pixelsRight; - auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) * - uint16_t(coeff + 0.5f); - - // Here we use some trickery to reuse the pixels within a chunk, shifted over - // by one pixel, to get the next sample for the entire chunk. This allows us - // to sample only one pixel for each offset across the entire chunk in both - // the left and right directions. To avoid clamping within the loop to the - // texture bounds, we compute the valid radius that doesn't require clamping - // and fall back to a slower clamping loop outside of that valid radius. - int offset = 1; - // The left bound is how much we can offset the sample before the start of - // the row bounds. - int leftBound = i.x - max(minX, 0); - // The right bound is how much we can offset the sample before the end of the - // row bounds. - int rightBound = min(maxX, sampler->width - 1) - i.x; - int validRadius = min(radius, min(leftBound, rightBound - (4 - 1))); - for (; offset <= validRadius; offset++) { - // Overwrite the pixel that needs to be shifted out with the new pixel, and - // shift it into the correct location. - pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]); - pixelsRight = pixelsRight.yzwx; - pixelsLeft = pixelsLeft.wxyz; - pixelsLeft.x = unaligned_load<P>(&buf[row - offset]); - - // Accumulate the Gaussian coefficients step-wise. - coeff *= coeffStep; - coeffStep *= coeffStep2; - - // Both left and right samples at this offset use the same coefficient. 
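// [Editor's sketch, not SWGL source] Standalone illustration of the step-wise
// Gaussian coefficient accumulation used in these blur loops: the weights
// exp(-x^2 / (2*sigma^2)) at successive integer offsets can be generated with
// two running multipliers, because the ratio between adjacent weights itself
// decays by the constant factor exp(-1 / sigma^2).
#include <cassert>
#include <cmath>

int main() {
  const float sigma = 3.0f;
  float coeff = 1.0f;                                   // weight at offset 0
  float coeffStep = std::exp(-0.5f / (sigma * sigma));  // ratio g(1)/g(0)
  const float coeffStep2 = coeffStep * coeffStep;       // constant ratio decay

  for (int offset = 1; offset <= 10; ++offset) {
    coeff *= coeffStep;       // now equals exp(-offset^2 / (2*sigma^2))
    coeffStep *= coeffStep2;  // ratio toward the next offset
    float direct = std::exp(-(offset * offset) / (2.0f * sigma * sigma));
    assert(std::fabs(coeff - direct) < 1e-4f);
  }
  return 0;
}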
- sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) * - uint16_t(coeff + 0.5f)); - } - - for (; offset <= radius; offset++) { - pixelsRight.x = - unaligned_load<P>(&buf[row + min(offset + 4 - 1, rightBound)]); - pixelsRight = pixelsRight.yzwx; - pixelsLeft = pixelsLeft.wxyz; - pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]); - - coeff *= coeffStep; - coeffStep *= coeffStep2; - - sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) * - uint16_t(coeff + 0.5f)); - } - - // Shift away the intermediate precision. - return sum >> 8; -} - -template <typename P, typename S> -static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical( - S sampler, const ivec2_scalar& i, int minY, int maxY, int radius, - float coeff, float coeffStep) { - // Packed and unpacked vectors for a chunk of the given pixel type. - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - - // Pre-scale the coefficient by 8 bits of fractional precision, so that when - // the sample is multiplied by it, it will yield a 16 bit unsigned integer - // that will use all 16 bits of precision to accumulate the sum. - coeff *= 1 << 8; - float coeffStep2 = coeffStep * coeffStep; - - int rowAbove = computeRow(sampler, i); - int rowBelow = rowAbove; - P* buf = (P*)sampler->buf; - auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]); - auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) * - uint16_t(coeff + 0.5f); - - // For the vertical loop we can't be quite as creative with reusing old values - // as we were in the horizontal loop. We just do the obvious implementation of - // loading a chunk from each row in turn and accumulating it into the sum. We - // compute a valid radius within which we don't need to clamp the sampled row - // and use that to avoid any clamping in the main inner loop. We fall back to - // a slower clamping loop outside of that valid radius. - int offset = 1; - int belowBound = i.y - max(minY, 0); - int aboveBound = min(maxY, sampler->height - 1) - i.y; - int validRadius = min(radius, min(belowBound, aboveBound)); - for (; offset <= validRadius; offset++) { - rowAbove += sampler->stride; - rowBelow -= sampler->stride; - auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]); - auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]); - - // Accumulate the Gaussian coefficients step-wise. - coeff *= coeffStep; - coeffStep *= coeffStep2; - - // Both above and below samples at this offset use the same coefficient. - sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) * - uint16_t(coeff + 0.5f)); + uint32_t* row0 = + &sampler + ->buf[clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset]; + uint32_t* row1 = + row0 + + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) ? sampler->stride : 0); + int16_t fracx = i.x >= 0 && i.x < int32_t(sampler->width) - 1 ? 
frac.x : 0; + int16_t fracy = frac.y; + + U32 pix0 = unaligned_load<U32>(row0); + U32 pix0n = unaligned_load<U32>(row0 + 4); + uint32_t pix0x = row0[8]; + U32 pix1 = unaligned_load<U32>(row1); + U32 pix1n = unaligned_load<U32>(row1 + 4); + uint32_t pix1x = row1[8]; + + { + auto ab0 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0, pix0, 0, 1, 1, 2)), + V16<int16_t>); + auto ab1 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1, pix1, 0, 1, 1, 2)), + V16<int16_t>); + ab0 += ((ab1 - ab0) * fracy) >> 7; + + auto cd0 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0, pix0n, 2, 3, 3, 4)), + V16<int16_t>); + auto cd1 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1, pix1n, 2, 3, 3, 4)), + V16<int16_t>); + cd0 += ((cd1 - cd0) * fracy) >> 7; + + auto abcdl = combine(lowHalf(ab0), lowHalf(cd0)); + auto abcdh = combine(highHalf(ab0), highHalf(cd0)); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + commit_span(buf, pack(WideRGBA8(abcdl))); } - for (; offset <= radius; offset++) { - if (offset <= aboveBound) { - rowAbove += sampler->stride; - } - if (offset <= belowBound) { - rowBelow -= sampler->stride; - } - auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]); - auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]); - - coeff *= coeffStep; - coeffStep *= coeffStep2; - - sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) * - uint16_t(coeff + 0.5f)); + { + auto ab0 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0n, pix0n, 0, 1, 1, 2)), + V16<int16_t>); + auto ab1 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1n, pix1n, 0, 1, 1, 2)), + V16<int16_t>); + ab0 += ((ab1 - ab0) * fracy) >> 7; + + auto cd0 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0n, U32(pix0x), 2, 3, 3, 4)), + V16<int16_t>); + auto cd1 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1n, U32(pix1x), 2, 3, 3, 4)), + V16<int16_t>); + cd0 += ((cd1 - cd0) * fracy) >> 7; + + auto abcdl = combine(lowHalf(ab0), lowHalf(cd0)); + auto abcdh = combine(highHalf(ab0), highHalf(cd0)); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + commit_span(buf + 4, pack(WideRGBA8(abcdl))); } - - // Shift away the intermediate precision. 
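The added span code in this hunk filters in 16-bit lanes with what appears to be a 7-bit fixed-point fraction: a + (((b - a) * frac) >> 7) interpolates between a and b, applied first vertically with fracy across the two rows and then horizontally with fracx across the columns. Below is a scalar sketch of that lerp on one channel of a 2x2 neighborhood; the helper name and sample values are illustrative, not taken from the diff.

// Illustrative sketch only; not part of the SWGL sources.
#include <cstdint>
#include <cstdio>

// Fixed-point lerp in the form used above: frac is treated as having 7
// fractional bits, so frac == 0 yields a and frac == 128 would yield b.
static int16_t lerp7(int16_t a, int16_t b, int16_t frac) {
  return int16_t(a + (((b - a) * frac) >> 7));
}

int main() {
  // One channel of a 2x2 pixel neighborhood.
  int16_t p00 = 10, p10 = 90;      // top-left, top-right
  int16_t p01 = 40, p11 = 200;     // bottom-left, bottom-right
  int16_t fracx = 32, fracy = 96;  // 32/128 = 0.25 and 96/128 = 0.75 (assumed)

  int16_t left = lerp7(p00, p01, fracy);       // blend rows, left column
  int16_t right = lerp7(p10, p11, fracy);      // blend rows, right column
  int16_t result = lerp7(left, right, fracx);  // then blend the columns
  std::printf("left=%d right=%d result=%d\n", left, right, result);
  return 0;
}

The vectorized version above does the same per lane across a whole chunk of pixels, and it forces the edge fraction to zero and reuses row0 for row1 at the texture border so the last column and row never read out of bounds.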
- return sum >> 8; } - -} // namespace glsl diff --git a/third_party/webrender/swgl/src/vector_type.h b/third_party/webrender/swgl/src/vector_type.h index 43364ffcce2..8ec5876c340 100644 --- a/third_party/webrender/swgl/src/vector_type.h +++ b/third_party/webrender/swgl/src/vector_type.h @@ -39,16 +39,6 @@ SI VectorType<T, 16> combine(VectorType<T, 8> a, VectorType<T, 8> b) { } template <typename T> -SI VectorType<T, 2> lowHalf(VectorType<T, 4> a) { - return __builtin_shufflevector(a, a, 0, 1); -} - -template <typename T> -SI VectorType<T, 2> highHalf(VectorType<T, 4> a) { - return __builtin_shufflevector(a, a, 2, 3); -} - -template <typename T> SI VectorType<T, 4> lowHalf(VectorType<T, 8> a) { return __builtin_shufflevector(a, a, 0, 1, 2, 3); } @@ -114,7 +104,7 @@ struct VectorType { }; }; - VectorType() : data{0} {} + VectorType() : data{0} { } constexpr VectorType(const VectorType& rhs) : data(rhs.data) {} // GCC vector extensions only support broadcasting scalars on arithmetic ops, @@ -315,27 +305,10 @@ struct VectorType { return VectorType<T, N * 2>::wrap(data, high.data); } -# define xxxx swizzle(0, 0, 0, 0) -# define yyyy swizzle(1, 1, 1, 1) -# define zzzz swizzle(2, 2, 2, 2) -# define wwww swizzle(3, 3, 3, 3) -# define xxyy swizzle(0, 0, 1, 1) -# define xxzz swizzle(0, 0, 2, 2) -# define yyww swizzle(1, 1, 3, 3) -# define zzww swizzle(2, 2, 3, 3) # define xyxy swizzle(0, 1, 0, 1) -# define xzxz swizzle(0, 2, 0, 2) -# define ywyw swizzle(1, 3, 1, 3) # define zwzw swizzle(2, 3, 2, 3) -# define zwxy swizzle(2, 3, 0, 1) # define zyxw swizzle(2, 1, 0, 3) -# define xxyz swizzle(0, 0, 1, 2) -# define xyyz swizzle(0, 1, 1, 2) # define xyzz swizzle(0, 1, 2, 2) -# define xzyw swizzle(0, 2, 1, 3) -# define yzwx swizzle(1, 2, 3, 0) -# define wxyz swizzle(3, 0, 1, 2) -# define wzyx swizzle(3, 2, 1, 0) # define xxxxyyyy XXXXYYYY() VectorType<T, 8> XXXXYYYY() const { return swizzle(0, 0, 0, 0).combine(swizzle(1, 1, 1, 1)); @@ -358,10 +331,6 @@ struct VectorType { VectorType<T, 8> XXYYZZWW() const { return swizzle(0, 0, 1, 1).combine(swizzle(2, 2, 3, 3)); } -# define xxxxyyyyzzzzwwww XXXXYYYYZZZZWWWW() - VectorType<T, 16> XXXXYYYYZZZZWWWW() { - return XXXXYYYY().combine(ZZZZWWWW()); - } }; template <typename T> @@ -374,17 +343,6 @@ struct VectorType<T, 2> { }; T elements[2]; }; - - SI VectorType wrap(const data_type& data) { - VectorType v; - v.data = data; - return v; - } - - VectorType operator&(VectorType x) const { return wrap(data & x.data); } - VectorType operator&(T x) const { return wrap(data & x); } - VectorType operator|(VectorType x) const { return wrap(data | x.data); } - VectorType operator|(T x) const { return wrap(data | x); } }; # define CONVERT(vector, type) ((type)(vector)) @@ -411,32 +369,6 @@ SI VectorType<T, N * 2> expand(VectorType<T, N> a) { } #endif -template <typename T, int N> -SI VectorType<T, N * 4> combine(VectorType<T, N> a, VectorType<T, N> b, - VectorType<T, N> c, VectorType<T, N> d) { - return combine(combine(a, b), combine(c, d)); -} - -template <typename T, int N> -SI VectorType<T, N> combineLow(VectorType<T, N> a, VectorType<T, N> b) { - return combine(lowHalf(a), lowHalf(b)); -} - -template <typename T, int N> -SI VectorType<T, N> combineHigh(VectorType<T, N> a, VectorType<T, N> b) { - return combine(highHalf(a), highHalf(b)); -} - -template <typename T, int N> -SI VectorType<T, N * 2> repeat2(VectorType<T, N> a) { - return combine(a, a); -} - -template <typename T, int N> -SI VectorType<T, N * 4> repeat4(VectorType<T, N> a) { - return combine(a, a, a, a); -} - 
template <typename T>
 SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
   return SHUFFLE(a, b, 0, 4, 1, 5);
 @@ -478,23 +410,6 @@ SI VectorType<T, 8> zip2High(VectorType<T, 8> a, VectorType<T, 8> b) {
   return SHUFFLE(a, b, 4, 5, 12, 13, 6, 7, 14, 15);
 }
 
-#ifdef __clang__
-template <typename T>
-SI VectorType<T, 8> zip(VectorType<T, 4> a, VectorType<T, 4> b) {
-  return SHUFFLE(a, b, 0, 4, 1, 5, 2, 6, 3, 7);
-}
-
-template <typename T>
-SI VectorType<T, 16> zip(VectorType<T, 8> a, VectorType<T, 8> b) {
-  return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
-}
-#else
-template <typename T, int N>
-SI VectorType<T, N * 2> zip(VectorType<T, N> a, VectorType<T, N> b) {
-  return combine(zipLow(a, b), zipHigh(a, b));
-}
-#endif
-
 template <typename T>
 struct Unaligned {
   template <typename P>
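The zip helpers kept above interleave lanes from two vectors via SHUFFLE (which, judging by the surrounding code, wraps __builtin_shufflevector), while the removed zip() overloads produced the full interleave in a single shuffle on clang, with a zipLow/zipHigh composition as the fallback. A small sketch of those interleave patterns; the type names and lane width are illustrative, and zipHigh's index set is the natural counterpart since its body is elided in the hunk above.

// Illustrative sketch only; not part of the SWGL sources.
#include <cstdint>
#include <cstdio>

typedef int16_t i16x4 __attribute__((vector_size(8)));
typedef int16_t i16x8 __attribute__((vector_size(16)));

// zipLow/zipHigh interleave single lanes: {a0,b0,a1,b1} and {a2,b2,a3,b3}.
static inline i16x4 zipLow(i16x4 a, i16x4 b) {
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}
static inline i16x4 zipHigh(i16x4 a, i16x4 b) {
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}
// A full 8-lane interleave, as the removed zip() overloads produced.
static inline i16x8 zip(i16x4 a, i16x4 b) {
  return __builtin_shufflevector(a, b, 0, 4, 1, 5, 2, 6, 3, 7);
}

int main() {
  i16x4 a = {1, 2, 3, 4}, b = {5, 6, 7, 8};
  i16x4 lo = zipLow(a, b);   // {1, 5, 2, 6}
  i16x4 hi = zipHigh(a, b);  // {3, 7, 4, 8}
  i16x8 z = zip(a, b);       // {1, 5, 2, 6, 3, 7, 4, 8}
  std::printf("lo={%d,%d,%d,%d} hi={%d,%d,%d,%d} z[4]=%d\n",
              lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3], z[4]);
  return 0;
}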