Diffstat (limited to 'third_party/webrender/swgl/src')
-rw-r--r-- | third_party/webrender/swgl/src/blend.h | 864
-rw-r--r-- | third_party/webrender/swgl/src/composite.h | 1069
-rw-r--r-- | third_party/webrender/swgl/src/gl.cc | 3164
-rw-r--r-- | third_party/webrender/swgl/src/gl_defs.h | 42
-rw-r--r-- | third_party/webrender/swgl/src/glsl.h | 1308
-rw-r--r-- | third_party/webrender/swgl/src/lib.rs | 2
-rw-r--r-- | third_party/webrender/swgl/src/program.h | 82
-rw-r--r-- | third_party/webrender/swgl/src/rasterize.h | 1670
-rw-r--r-- | third_party/webrender/swgl/src/swgl_ext.h | 1826
-rw-r--r-- | third_party/webrender/swgl/src/swgl_fns.rs | 513
-rw-r--r-- | third_party/webrender/swgl/src/texture.h | 1162
-rw-r--r-- | third_party/webrender/swgl/src/vector_type.h | 87
12 files changed, 3223 insertions, 8566 deletions
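
Nearly every blend equation in the deleted blend.h below is built on muldiv255, documented in its comments as "(x*y + x) >> 8, cheap approximation of (x*y) / 255". A minimal standalone check of how tight that approximation is -- not part of the patch, just an illustrative sketch that reuses the same formula:

#include <algorithm>
#include <cstdio>
#include <cstdlib>

// Same formula as the deleted helper: (x*y + x) >> 8, i.e. x*(y+1)/256.
static int muldiv255_approx(int x, int y) { return (x * y + x) >> 8; }

int main() {
  int max_err = 0;
  for (int x = 0; x <= 255; ++x) {
    for (int y = 0; y <= 255; ++y) {
      int exact = (x * y) / 255;  // truncating reference
      max_err = std::max(max_err, std::abs(muldiv255_approx(x, y) - exact));
    }
  }
  // The approximation is exact at y == 255 and never overshoots the
  // truncating reference by more than 1, which is why the blend code can
  // apply it per channel without visible error.
  printf("max error: %d\n", max_err);
  return 0;
}
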
diff --git a/third_party/webrender/swgl/src/blend.h b/third_party/webrender/swgl/src/blend.h deleted file mode 100644 index 8bc1c93994e..00000000000 --- a/third_party/webrender/swgl/src/blend.h +++ /dev/null @@ -1,864 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -static ALWAYS_INLINE HalfRGBA8 packRGBA8(I32 a, I32 b) { -#if USE_SSE2 - return _mm_packs_epi32(a, b); -#elif USE_NEON - return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); -#else - return CONVERT(combine(a, b), HalfRGBA8); -#endif -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4& v, - float scale = 255.0f) { - ivec4 i = round_pixel(v, scale); - HalfRGBA8 xz = packRGBA8(i.z, i.x); - HalfRGBA8 yw = packRGBA8(i.y, i.w); - HalfRGBA8 xyzwl = zipLow(xz, yw); - HalfRGBA8 xyzwh = zipHigh(xz, yw); - HalfRGBA8 lo = zip2Low(xyzwl, xyzwh); - HalfRGBA8 hi = zip2High(xyzwl, xyzwh); - return combine(lo, hi); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(Float alpha, - float scale = 255.0f) { - I32 i = round_pixel(alpha, scale); - HalfRGBA8 c = packRGBA8(i, i); - c = zipLow(c, c); - return zip(c, c); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(float alpha, - float scale = 255.0f) { - I32 i = round_pixel(alpha, scale); - return repeat2(packRGBA8(i, i)); -} - -UNUSED static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v, - float scale = 255.0f) { - I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}, scale); - return repeat2(packRGBA8(i, i)); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8() { - return pack_pixels_RGBA8(fragment_shader->gl_FragColor); -} - -static ALWAYS_INLINE WideRGBA8 pack_pixels_RGBA8(WideRGBA32F v, - float scale = 255.0f) { - ivec4 i = round_pixel(bit_cast<vec4>(v), scale); - return combine(packRGBA8(i.x, i.y), packRGBA8(i.z, i.w)); -} - -static ALWAYS_INLINE WideR8 packR8(I32 a) { -#if USE_SSE2 - return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a))); -#elif USE_NEON - return vqmovun_s32(a); -#else - return CONVERT(a, WideR8); -#endif -} - -static ALWAYS_INLINE WideR8 pack_pixels_R8(Float c, float scale = 255.0f) { - return packR8(round_pixel(c, scale)); -} - -static ALWAYS_INLINE WideR8 pack_pixels_R8() { - return pack_pixels_R8(fragment_shader->gl_FragColor.x); -} - -// Load a partial span > 0 and < 4 pixels. -template <typename V, typename P> -static ALWAYS_INLINE V partial_load_span(const P* src, int span) { - return bit_cast<V>( - (span >= 2 - ? combine(unaligned_load<V2<P>>(src), - V2<P>{span > 2 ? unaligned_load<P>(src + 2) : P(0), 0}) - : V4<P>{unaligned_load<P>(src), 0, 0, 0})); -} - -// Store a partial span > 0 and < 4 pixels. 
-template <typename V, typename P> -static ALWAYS_INLINE void partial_store_span(P* dst, V src, int span) { - auto pixels = bit_cast<V4<P>>(src); - if (span >= 2) { - unaligned_store(dst, lowHalf(pixels)); - if (span > 2) { - unaligned_store(dst + 2, pixels.z); - } - } else { - unaligned_store(dst, pixels.x); - } -} - -// Dispatcher that chooses when to load a full or partial span -template <typename V, typename P> -static ALWAYS_INLINE V load_span(const P* src, int span) { - if (span >= 4) { - return unaligned_load<V, P>(src); - } else { - return partial_load_span<V, P>(src, span); - } -} - -// Dispatcher that chooses when to store a full or partial span -template <typename V, typename P> -static ALWAYS_INLINE void store_span(P* dst, V src, int span) { - if (span >= 4) { - unaligned_store<V, P>(dst, src); - } else { - partial_store_span<V, P>(dst, src, span); - } -} - -template <typename T> -static ALWAYS_INLINE T muldiv256(T x, T y) { - return (x * y) >> 8; -} - -// (x*y + x) >> 8, cheap approximation of (x*y) / 255 -template <typename T> -static ALWAYS_INLINE T muldiv255(T x, T y) { - return (x * y + x) >> 8; -} - -template <typename V> -static ALWAYS_INLINE WideRGBA8 pack_span(uint32_t*, const V& v, - float scale = 255.0f) { - return pack_pixels_RGBA8(v, scale); -} - -template <typename C> -static ALWAYS_INLINE WideR8 pack_span(uint8_t*, C c, float scale = 255.0f) { - return pack_pixels_R8(c, scale); -} - -// Helper functions to apply a color modulus when available. -struct NoColor {}; - -template <typename P> -static ALWAYS_INLINE P applyColor(P src, NoColor) { - return src; -} - -struct InvertColor {}; - -template <typename P> -static ALWAYS_INLINE P applyColor(P src, InvertColor) { - return 255 - src; -} - -template <typename P> -static ALWAYS_INLINE P applyColor(P src, P color) { - return muldiv255(color, src); -} - -static ALWAYS_INLINE WideRGBA8 applyColor(PackedRGBA8 src, WideRGBA8 color) { - return applyColor(unpack(src), color); -} - -template <typename P, typename C> -static ALWAYS_INLINE auto packColor(P* buf, C color) { - return pack_span(buf, color, 255.0f); -} - -template <typename P> -static ALWAYS_INLINE NoColor packColor(UNUSED P* buf, NoColor noColor) { - return noColor; -} - -template <typename P> -static ALWAYS_INLINE InvertColor packColor(UNUSED P* buf, - InvertColor invertColor) { - return invertColor; -} - -// Single argument variation that takes an explicit destination buffer type. -template <typename P, typename C> -static ALWAYS_INLINE auto packColor(C color) { - // Just pass in a typed null pointer, as the pack routines never use the - // pointer's value, just its type. - return packColor((P*)0, color); -} - -// Byte-wise addition for when x or y is a signed 8-bit value stored in the -// low byte of a larger type T only with zeroed-out high bits, where T is -// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used -// upon signed operands, using up all the precision in a 16 bit integer, and -// potentially losing the sign bit in the last >> 8 shift. Due to the -// properties of two's complement arithmetic, even though we've discarded the -// sign bit, we can still represent a negative number under addition (without -// requiring any extra sign bits), just that any negative number will behave -// like a large unsigned number under addition, generating a single carry bit -// on overflow that we need to discard. 
Thus, just doing a byte-wise add will -// overflow without the troublesome carry, giving us only the remaining 8 low -// bits we actually need while keeping the high bits at zero. -template <typename T> -static ALWAYS_INLINE T addlow(T x, T y) { - typedef VectorType<uint8_t, sizeof(T)> bytes; - return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y)); -} - -// Replace color components of each pixel with the pixel's alpha values. -template <typename T> -static ALWAYS_INLINE T alphas(T c) { - return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); -} - -// Replace the alpha values of the first vector with alpha values from the -// second, while leaving the color components unmodified. -template <typename T> -static ALWAYS_INLINE T set_alphas(T c, T a) { - return SHUFFLE(c, a, 0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31); -} - -// Miscellaneous helper functions for working with packed RGBA8 data. -static ALWAYS_INLINE HalfRGBA8 if_then_else(V8<int16_t> c, HalfRGBA8 t, - HalfRGBA8 e) { - return bit_cast<HalfRGBA8>((c & t) | (~c & e)); -} - -template <typename T, typename C, int N> -static ALWAYS_INLINE VectorType<T, N> if_then_else(VectorType<C, N> c, - VectorType<T, N> t, - VectorType<T, N> e) { - return combine(if_then_else(lowHalf(c), lowHalf(t), lowHalf(e)), - if_then_else(highHalf(c), highHalf(t), highHalf(e))); -} - -static ALWAYS_INLINE HalfRGBA8 min(HalfRGBA8 x, HalfRGBA8 y) { -#if USE_SSE2 - return bit_cast<HalfRGBA8>( - _mm_min_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y))); -#elif USE_NEON - return vminq_u16(x, y); -#else - return if_then_else(x < y, x, y); -#endif -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> min(VectorType<T, N> x, - VectorType<T, N> y) { - return combine(min(lowHalf(x), lowHalf(y)), min(highHalf(x), highHalf(y))); -} - -static ALWAYS_INLINE HalfRGBA8 max(HalfRGBA8 x, HalfRGBA8 y) { -#if USE_SSE2 - return bit_cast<HalfRGBA8>( - _mm_max_epi16(bit_cast<V8<int16_t>>(x), bit_cast<V8<int16_t>>(y))); -#elif USE_NEON - return vmaxq_u16(x, y); -#else - return if_then_else(x > y, x, y); -#endif -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> max(VectorType<T, N> x, - VectorType<T, N> y) { - return combine(max(lowHalf(x), lowHalf(y)), max(highHalf(x), highHalf(y))); -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> recip(VectorType<T, N> v) { - return combine(recip(lowHalf(v)), recip(highHalf(v))); -} - -// Helper to get the reciprocal if the value is non-zero, or otherwise default -// to the supplied fallback value. -template <typename V> -static ALWAYS_INLINE V recip_or(V v, float f) { - return if_then_else(v != V(0.0f), recip(v), V(f)); -} - -template <typename T, int N> -static ALWAYS_INLINE VectorType<T, N> inversesqrt(VectorType<T, N> v) { - return combine(inversesqrt(lowHalf(v)), inversesqrt(highHalf(v))); -} - -// Extract the alpha components so that we can cheaply calculate the reciprocal -// on a single SIMD register. Then multiply the duplicated alpha reciprocal with -// the pixel data. 0 alpha is treated as transparent black. -static ALWAYS_INLINE WideRGBA32F unpremultiply(WideRGBA32F v) { - Float a = recip_or((Float){v[3], v[7], v[11], v[15]}, 0.0f); - return v * a.xxxxyyyyzzzzwwww; -} - -// Packed RGBA32F data is AoS in BGRA order. Transpose it to SoA and swizzle to -// RGBA to unpack. 
-static ALWAYS_INLINE vec4 unpack(PackedRGBA32F c) { - return bit_cast<vec4>( - SHUFFLE(c, c, 2, 6, 10, 14, 1, 5, 9, 13, 0, 4, 8, 12, 3, 7, 11, 15)); -} - -// The following lum/sat functions mostly follow the KHR_blend_equation_advanced -// specification but are rearranged to work on premultiplied data. -static ALWAYS_INLINE Float lumv3(vec3 v) { - return v.x * 0.30f + v.y * 0.59f + v.z * 0.11f; -} - -static ALWAYS_INLINE Float minv3(vec3 v) { return min(min(v.x, v.y), v.z); } - -static ALWAYS_INLINE Float maxv3(vec3 v) { return max(max(v.x, v.y), v.z); } - -static inline vec3 clip_color(vec3 v, Float lum, Float alpha) { - Float mincol = max(-minv3(v), lum); - Float maxcol = max(maxv3(v), alpha - lum); - return lum + v * (lum * (alpha - lum) * recip_or(mincol * maxcol, 0.0f)); -} - -static inline vec3 set_lum(vec3 base, vec3 ref, Float alpha) { - return clip_color(base - lumv3(base), lumv3(ref), alpha); -} - -static inline vec3 set_lum_sat(vec3 base, vec3 sref, vec3 lref, Float alpha) { - vec3 diff = base - minv3(base); - Float sbase = maxv3(diff); - Float ssat = maxv3(sref) - minv3(sref); - // The sbase range is rescaled to ssat. If sbase has 0 extent, then rescale - // to black, as per specification. - return set_lum(diff * ssat * recip_or(sbase, 0.0f), lref, alpha); -} - -// Flags the reflect the current blend-stage clipping to be applied. -enum SWGLClipFlag { - SWGL_CLIP_FLAG_MASK = 1 << 0, - SWGL_CLIP_FLAG_AA = 1 << 1, - SWGL_CLIP_FLAG_BLEND_OVERRIDE = 1 << 2, -}; -static int swgl_ClipFlags = 0; -static BlendKey swgl_BlendOverride = BLEND_KEY_NONE; -static WideRGBA8 swgl_BlendColorRGBA8 = {0}; -static WideRGBA8 swgl_BlendAlphaRGBA8 = {0}; - -// A pointer into the color buffer for the start of the span. -static void* swgl_SpanBuf = nullptr; -// A pointer into the clip mask for the start of the span. -static uint8_t* swgl_ClipMaskBuf = nullptr; - -static ALWAYS_INLINE WideR8 expand_mask(UNUSED uint8_t* buf, WideR8 mask) { - return mask; -} -static ALWAYS_INLINE WideRGBA8 expand_mask(UNUSED uint32_t* buf, WideR8 mask) { - WideRG8 maskRG = zip(mask, mask); - return zip(maskRG, maskRG); -} - -// Loads a chunk of clip masks. The current pointer into the color buffer is -// used to reconstruct the relative position within the span. From there, the -// pointer into the clip mask can be generated from the start of the clip mask -// span. -template <typename P> -static ALWAYS_INLINE uint8_t* get_clip_mask(P* buf) { - return &swgl_ClipMaskBuf[buf - (P*)swgl_SpanBuf]; -} - -template <typename P> -static ALWAYS_INLINE auto load_clip_mask(P* buf, int span) - -> decltype(expand_mask(buf, 0)) { - return expand_mask(buf, - unpack(load_span<PackedR8>(get_clip_mask(buf), span))); -} - -// Temporarily removes masking from the blend stage, assuming the caller will -// handle it. -static ALWAYS_INLINE void override_clip_mask() { - blend_key = BlendKey(blend_key - MASK_BLEND_KEY_NONE); -} - -// Restores masking to the blend stage, assuming it was previously overridden. -static ALWAYS_INLINE void restore_clip_mask() { - blend_key = BlendKey(MASK_BLEND_KEY_NONE + blend_key); -} - -// A pointer to the start of the opaque destination region of the span for AA. -static const uint8_t* swgl_OpaqueStart = nullptr; -// The size, in bytes, of the opaque region. -static uint32_t swgl_OpaqueSize = 0; -// AA coverage distance offsets for the left and right edges. -static Float swgl_LeftAADist = 0.0f; -static Float swgl_RightAADist = 0.0f; -// AA coverage slope values used for accumulating coverage for each step. 
-static Float swgl_AASlope = 0.0f; - -// Get the amount of pixels we need to process before the start of the opaque -// region. -template <typename P> -static ALWAYS_INLINE int get_aa_opaque_start(P* buf) { - return max(int((P*)swgl_OpaqueStart - buf), 0); -} - -// Assuming we are already in the opaque part of the span, return the remaining -// size of the opaque part. -template <typename P> -static ALWAYS_INLINE int get_aa_opaque_size(P* buf) { - return max(int((P*)&swgl_OpaqueStart[swgl_OpaqueSize] - buf), 0); -} - -// Temporarily removes anti-aliasing from the blend stage, assuming the caller -// will handle it. -static ALWAYS_INLINE void override_aa() { - blend_key = BlendKey(blend_key - AA_BLEND_KEY_NONE); -} - -// Restores anti-aliasing to the blend stage, assuming it was previously -// overridden. -static ALWAYS_INLINE void restore_aa() { - blend_key = BlendKey(AA_BLEND_KEY_NONE + blend_key); -} - -static PREFER_INLINE WideRGBA8 blend_pixels(uint32_t* buf, PackedRGBA8 pdst, - WideRGBA8 src, int span = 4) { - WideRGBA8 dst = unpack(pdst); - const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, - 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, - 0xFFFF, 0xFFFF, 0xFFFF, 0}; - const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, - 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; - const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, - 0, 0, 0, 255, 0, 0, 0, 255}; - -// clang-format off - // Computes AA for the given pixel based on the offset of the pixel within - // destination row. Given the initial coverage offsets for the left and right - // edges, the offset is scaled by the slope and accumulated to find the - // minimum coverage value for the pixel. A final weight is generated that - // can be used to scale the source pixel. -#define DO_AA(format, body) \ - do { \ - int offset = int((const uint8_t*)buf - swgl_OpaqueStart); \ - if (uint32_t(offset) >= swgl_OpaqueSize) { \ - Float delta = swgl_AASlope * float(offset); \ - Float dist = clamp(min(swgl_LeftAADist + delta.x, \ - swgl_RightAADist + delta.y), \ - 0.0f, 256.0f); \ - auto aa = pack_pixels_##format(dist, 1.0f); \ - body; \ - } \ - } while (0) - - // Each blend case is preceded by the MASK_ variant. The MASK_ case first - // loads the mask values and multiplies the source value by them. After, it - // falls through to the normal blending case using the masked source. The - // AA_ variations may further precede the blend cases, in which case the - // source value is further modified before use. -#define BLEND_CASE_KEY(key) \ - case AA_##key: \ - DO_AA(RGBA8, src = muldiv256(src, aa)); \ - goto key; \ - case AA_MASK_##key: \ - DO_AA(RGBA8, src = muldiv256(src, aa)); \ - FALLTHROUGH; \ - case MASK_##key: \ - src = muldiv255(src, load_clip_mask(buf, span)); \ - FALLTHROUGH; \ - case key: key - -#define BLEND_CASE(...) 
BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__)) - - switch (blend_key) { - BLEND_CASE(GL_ONE, GL_ZERO): - return src; - BLEND_CASE(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, - GL_ONE_MINUS_SRC_ALPHA): - // dst + src.a*(src.rgb1 - dst) - // use addlow for signed overflow - return addlow(dst, muldiv255(alphas(src), (src | ALPHA_OPAQUE) - dst)); - BLEND_CASE(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): - return src + dst - muldiv255(dst, alphas(src)); - BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): - return dst - muldiv255(dst, src); - BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): - return dst - (muldiv255(dst, src) & RGB_MASK); - BLEND_CASE(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): - return dst - muldiv255(dst, alphas(src)); - BLEND_CASE(GL_ZERO, GL_SRC_COLOR): - return muldiv255(src, dst); - BLEND_CASE(GL_ONE, GL_ONE): - return src + dst; - BLEND_CASE(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): - return src + dst - (muldiv255(dst, src) & ALPHA_MASK); - BLEND_CASE(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): - // src*(1-dst.a) + dst*1 = src - src*dst.a + dst - return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); - BLEND_CASE(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): - // src*k + (1-src)*dst = src*k + dst - - // src*dst = dst + src*(k - dst) use addlow - // for signed overflow - return addlow( - dst, muldiv255(src, repeat2(ctx->blendcolor) - dst)); - - // We must explicitly handle the masked/anti-aliased secondary blend case. - // The secondary color as well as the source must be multiplied by the - // weights. - case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - return src + dst - secondary; - } - case MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - WideRGBA8 mask = load_clip_mask(buf, span); - return muldiv255(src, mask) + dst - muldiv255(secondary, mask); - } - case AA_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - DO_AA(RGBA8, { - src = muldiv256(src, aa); - secondary = muldiv256(secondary, aa); - }); - return src + dst - secondary; - } - case AA_MASK_BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { - WideRGBA8 secondary = - applyColor(dst, - packColor<uint32_t>(fragment_shader->gl_SecondaryFragColor)); - WideRGBA8 mask = load_clip_mask(buf, span); - DO_AA(RGBA8, mask = muldiv256(mask, aa)); - return muldiv255(src, mask) + dst - muldiv255(secondary, mask); - } - - BLEND_CASE(GL_MIN): - return min(src, dst); - BLEND_CASE(GL_MAX): - return max(src, dst); - - // The KHR_blend_equation_advanced spec describes the blend equations such - // that the unpremultiplied values Cs, Cd, As, Ad and function f combine to - // the result: - // Cr = f(Cs,Cd)*As*Ad + Cs*As*(1-Ad) + Cd*AD*(1-As) - // Ar = As*Ad + As*(1-Ad) + Ad*(1-As) - // However, working with unpremultiplied values requires expensive math to - // unpremultiply and premultiply again during blending. We can use the fact - // that premultiplied value P = C*A and simplify the equations such that no - // unpremultiplied colors are necessary, allowing us to stay with integer - // math that avoids floating-point conversions in the common case. Some of - // the blend modes require division or sqrt, in which case we do convert - // to (possibly transposed/unpacked) floating-point to implement the mode. 
- // However, most common modes can still use cheaper premultiplied integer - // math. As an example, the multiply mode f(Cs,Cd) = Cs*Cd is simplified - // to: - // Cr = Cs*Cd*As*Ad + Cs*As*(1-Ad) + Cd*Ad*(1-As) - // .. Pr = Ps*Pd + Ps - Ps*Ad + Pd - Pd*As - // Ar = As*Ad + As - As*Ad + Ad - Ad*As - // .. Ar = As + Ad - As*Ad - // Note that the alpha equation is the same for all blend equations, such - // that so long as the implementation results in As + Ad - As*Ad, we can - // avoid using separate instructions to compute the alpha result, which is - // dependent on the math used to implement each blend mode. The exact - // reductions used to get the final math for every blend mode are too - // involved to show here in comments, but mostly follows from replacing - // Cs*As and Cd*Ad with Ps and Ps while factoring out as many common terms - // as possible. - - BLEND_CASE(GL_MULTIPLY_KHR): { - WideRGBA8 diff = muldiv255(alphas(src) - (src & RGB_MASK), - alphas(dst) - (dst & RGB_MASK)); - return src + dst + (diff & RGB_MASK) - alphas(diff); - } - BLEND_CASE(GL_SCREEN_KHR): - return src + dst - muldiv255(src, dst); - BLEND_CASE(GL_OVERLAY_KHR): { - WideRGBA8 srcA = alphas(src); - WideRGBA8 dstA = alphas(dst); - WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); - return src + dst + - if_then_else(dst * 2 <= dstA, (diff & RGB_MASK) - alphas(diff), - -diff); - } - BLEND_CASE(GL_DARKEN_KHR): - return src + dst - - max(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); - BLEND_CASE(GL_LIGHTEN_KHR): - return src + dst - - min(muldiv255(src, alphas(dst)), muldiv255(dst, alphas(src))); - - BLEND_CASE(GL_COLORDODGE_KHR): { - // Color-dodge and color-burn require division, so we convert to FP math - // here, but avoid transposing to a vec4. - WideRGBA32F srcF = CONVERT(src, WideRGBA32F); - WideRGBA32F srcA = alphas(srcF); - WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); - WideRGBA32F dstA = alphas(dstF); - return pack_pixels_RGBA8( - srcA * set_alphas( - min(dstA, dstF * srcA * recip_or(srcA - srcF, 255.0f)), - dstF) + - srcF * (255.0f - dstA) + dstF * (255.0f - srcA), - 1.0f / 255.0f); - } - BLEND_CASE(GL_COLORBURN_KHR): { - WideRGBA32F srcF = CONVERT(src, WideRGBA32F); - WideRGBA32F srcA = alphas(srcF); - WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); - WideRGBA32F dstA = alphas(dstF); - return pack_pixels_RGBA8( - srcA * set_alphas((dstA - min(dstA, (dstA - dstF) * srcA * - recip_or(srcF, 255.0f))), - dstF) + - srcF * (255.0f - dstA) + dstF * (255.0f - srcA), - 1.0f / 255.0f); - } - BLEND_CASE(GL_HARDLIGHT_KHR): { - WideRGBA8 srcA = alphas(src); - WideRGBA8 dstA = alphas(dst); - WideRGBA8 diff = muldiv255(src, dst) + muldiv255(srcA - src, dstA - dst); - return src + dst + - if_then_else(src * 2 <= srcA, (diff & RGB_MASK) - alphas(diff), - -diff); - } - - BLEND_CASE(GL_SOFTLIGHT_KHR): { - // Soft-light requires an unpremultiply that can't be factored out as - // well as a sqrt, so we convert to FP math here, but avoid transposing - // to a vec4. 
- WideRGBA32F srcF = CONVERT(src, WideRGBA32F); - WideRGBA32F srcA = alphas(srcF); - WideRGBA32F dstF = CONVERT(dst, WideRGBA32F); - WideRGBA32F dstA = alphas(dstF); - WideRGBA32F dstU = unpremultiply(dstF); - WideRGBA32F scale = srcF + srcF - srcA; - return pack_pixels_RGBA8( - dstF * (255.0f + - set_alphas( - scale * - if_then_else(scale < 0.0f, 1.0f - dstU, - min((16.0f * dstU - 12.0f) * dstU + 3.0f, - inversesqrt(dstU) - 1.0f)), - WideRGBA32F(0.0f))) + - srcF * (255.0f - dstA), - 1.0f / 255.0f); - } - BLEND_CASE(GL_DIFFERENCE_KHR): { - WideRGBA8 diff = - min(muldiv255(dst, alphas(src)), muldiv255(src, alphas(dst))); - return src + dst - diff - (diff & RGB_MASK); - } - BLEND_CASE(GL_EXCLUSION_KHR): { - WideRGBA8 diff = muldiv255(src, dst); - return src + dst - diff - (diff & RGB_MASK); - } - - // The HSL blend modes are non-separable and require complicated use of - // division. It is advantageous to convert to FP and transpose to vec4 - // math to more easily manipulate the individual color components. -#define DO_HSL(rgb) \ - do { \ - vec4 srcV = unpack(CONVERT(src, PackedRGBA32F)); \ - vec4 dstV = unpack(CONVERT(dst, PackedRGBA32F)); \ - Float srcA = srcV.w * (1.0f / 255.0f); \ - Float dstA = dstV.w * (1.0f / 255.0f); \ - Float srcDstA = srcV.w * dstA; \ - vec3 srcC = vec3(srcV) * dstA; \ - vec3 dstC = vec3(dstV) * srcA; \ - return pack_pixels_RGBA8(vec4(rgb + vec3(srcV) - srcC + vec3(dstV) - dstC, \ - srcV.w + dstV.w - srcDstA), \ - 1.0f); \ - } while (0) - - BLEND_CASE(GL_HSL_HUE_KHR): - DO_HSL(set_lum_sat(srcC, dstC, dstC, srcDstA)); - BLEND_CASE(GL_HSL_SATURATION_KHR): - DO_HSL(set_lum_sat(dstC, srcC, dstC, srcDstA)); - BLEND_CASE(GL_HSL_COLOR_KHR): - DO_HSL(set_lum(srcC, dstC, srcDstA)); - BLEND_CASE(GL_HSL_LUMINOSITY_KHR): - DO_HSL(set_lum(dstC, srcC, srcDstA)); - - // SWGL-specific extended blend modes. - BLEND_CASE(SWGL_BLEND_DROP_SHADOW): { - // Premultiplied alpha over blend, but with source color set to source alpha - // modulated with a constant color. - WideRGBA8 color = applyColor(alphas(src), swgl_BlendColorRGBA8); - return color + dst - muldiv255(dst, alphas(color)); - } - - BLEND_CASE(SWGL_BLEND_SUBPIXEL_TEXT): - // Premultiplied alpha over blend, but treats the source as a subpixel mask - // modulated with a constant color. - return applyColor(src, swgl_BlendColorRGBA8) + dst - - muldiv255(dst, applyColor(src, swgl_BlendAlphaRGBA8)); - - default: - UNREACHABLE; - // return src; - } - -#undef BLEND_CASE -#undef BLEND_CASE_KEY - // clang-format on -} - -static PREFER_INLINE WideR8 blend_pixels(uint8_t* buf, WideR8 dst, WideR8 src, - int span = 4) { -// clang-format off -#define BLEND_CASE_KEY(key) \ - case AA_##key: \ - DO_AA(R8, src = muldiv256(src, aa)); \ - goto key; \ - case AA_MASK_##key: \ - DO_AA(R8, src = muldiv256(src, aa)); \ - FALLTHROUGH; \ - case MASK_##key: \ - src = muldiv255(src, load_clip_mask(buf, span)); \ - FALLTHROUGH; \ - case key: key - -#define BLEND_CASE(...) 
BLEND_CASE_KEY(BLEND_KEY(__VA_ARGS__)) - - switch (blend_key) { - BLEND_CASE(GL_ONE, GL_ZERO): - return src; - BLEND_CASE(GL_ZERO, GL_SRC_COLOR): - return muldiv255(src, dst); - BLEND_CASE(GL_ONE, GL_ONE): - return src + dst; - default: - UNREACHABLE; - // return src; - } - -#undef BLEND_CASE -#undef BLEND_CASE_KEY - // clang-format on -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r) { - unaligned_store(buf, pack(r)); -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, WideRGBA8 r, int len) { - partial_store_span(buf, pack(r), len); -} - -static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r) { - return blend_pixels(buf, unaligned_load<PackedRGBA8>(buf), r); -} - -static ALWAYS_INLINE WideRGBA8 blend_span(uint32_t* buf, WideRGBA8 r, int len) { - return blend_pixels(buf, partial_load_span<PackedRGBA8>(buf, len), r, len); -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r) { - unaligned_store(buf, r); -} - -static ALWAYS_INLINE void commit_span(uint32_t* buf, PackedRGBA8 r, int len) { - partial_store_span(buf, r, len); -} - -static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r) { - return pack(blend_span(buf, unpack(r))); -} - -static ALWAYS_INLINE PackedRGBA8 blend_span(uint32_t* buf, PackedRGBA8 r, - int len) { - return pack(blend_span(buf, unpack(r), len)); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r) { - unaligned_store(buf, pack(r)); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, WideR8 r, int len) { - partial_store_span(buf, pack(r), len); -} - -static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r) { - return blend_pixels(buf, unpack(unaligned_load<PackedR8>(buf)), r); -} - -static ALWAYS_INLINE WideR8 blend_span(uint8_t* buf, WideR8 r, int len) { - return blend_pixels(buf, unpack(partial_load_span<PackedR8>(buf, len)), r, - len); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r) { - unaligned_store(buf, r); -} - -static ALWAYS_INLINE void commit_span(uint8_t* buf, PackedR8 r, int len) { - partial_store_span(buf, r, len); -} - -static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r) { - return pack(blend_span(buf, unpack(r))); -} - -static ALWAYS_INLINE PackedR8 blend_span(uint8_t* buf, PackedR8 r, int len) { - return pack(blend_span(buf, unpack(r), len)); -} - -template <bool BLEND, typename P, typename R> -static ALWAYS_INLINE void commit_blend_span(P* buf, R r) { - if (BLEND) { - commit_span(buf, blend_span(buf, r)); - } else { - commit_span(buf, r); - } -} - -template <bool BLEND, typename P, typename R> -static ALWAYS_INLINE void commit_blend_span(P* buf, R r, int len) { - if (BLEND) { - commit_span(buf, blend_span(buf, r, len), len); - } else { - commit_span(buf, r, len); - } -} - -template <typename P, typename R> -static ALWAYS_INLINE void commit_blend_solid_span(P* buf, R r, int len) { - for (P* end = &buf[len & ~3]; buf < end; buf += 4) { - commit_span(buf, blend_span(buf, r)); - } - len &= 3; - if (len > 0) { - partial_store_span(buf, pack(blend_span(buf, r, len)), len); - } -} - -template <bool BLEND> -static void commit_solid_span(uint32_t* buf, WideRGBA8 r, int len) { - commit_blend_solid_span(buf, r, len); -} - -template <> -ALWAYS_INLINE void commit_solid_span<false>(uint32_t* buf, WideRGBA8 r, - int len) { - fill_n(buf, len, bit_cast<U32>(pack(r)).x); -} - -template <bool BLEND> -static void commit_solid_span(uint8_t* buf, WideR8 r, int len) { - commit_blend_solid_span(buf, r, len); -} - -template <> 
-ALWAYS_INLINE void commit_solid_span<false>(uint8_t* buf, WideR8 r, int len) { - PackedR8 p = pack(r); - if (uintptr_t(buf) & 3) { - int align = 4 - (uintptr_t(buf) & 3); - align = min(align, len); - partial_store_span(buf, p, align); - buf += align; - len -= align; - } - fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(p)); - buf += len & ~3; - len &= 3; - if (len > 0) { - partial_store_span(buf, p, len); - } -} diff --git a/third_party/webrender/swgl/src/composite.h b/third_party/webrender/swgl/src/composite.h deleted file mode 100644 index f88de485fdd..00000000000 --- a/third_party/webrender/swgl/src/composite.h +++ /dev/null @@ -1,1069 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -template <bool COMPOSITE, typename P> -static inline void copy_row(P* dst, const P* src, int span) { - // No scaling, so just do a fast copy. - memcpy(dst, src, span * sizeof(P)); -} - -template <> -void copy_row<true, uint32_t>(uint32_t* dst, const uint32_t* src, int span) { - // No scaling, so just do a fast composite. - auto* end = dst + span; - while (dst + 4 <= end) { - WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); - WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - unaligned_store(dst, r); - src += 4; - dst += 4; - } - if (dst < end) { - WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - dst)); - WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); - auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - partial_store_span(dst, r, end - dst); - } -} - -template <bool COMPOSITE, typename P> -static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth, - int span, int frac) { - // Do scaling with different source and dest widths. - for (P* end = dst + span; dst < end; dst++) { - *dst = *src; - // Step source according to width ratio. - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - } -} - -template <> -void scale_row<true, uint32_t>(uint32_t* dst, int dstWidth, const uint32_t* src, - int srcWidth, int span, int frac) { - // Do scaling with different source and dest widths. - // Gather source pixels four at a time for better packing. - auto* end = dst + span; - for (; dst + 4 <= end; dst += 4) { - U32 srcn; - srcn.x = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.y = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.z = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.w = *src; - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn)); - WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - unaligned_store(dst, r); - } - if (dst < end) { - // Process any remaining pixels. Try to gather as many pixels as possible - // into a single source chunk for compositing. 
- U32 srcn = {*src, 0, 0, 0}; - if (end - dst > 1) { - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.y = *src; - if (end - dst > 2) { - for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { - src++; - } - srcn.z = *src; - } - } - WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn)); - WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst)); - auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - partial_store_span(dst, r, end - dst); - } -} - -template <bool COMPOSITE = false> -static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq, - Texture& dsttex, const IntRect& dstReq, - bool invertY, const IntRect& clipRect) { - assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && - dsttex.internal_format == GL_RGBA8)); - // Cache scaling ratios - int srcWidth = srcReq.width(); - int srcHeight = srcReq.height(); - int dstWidth = dstReq.width(); - int dstHeight = dstReq.height(); - // Compute valid dest bounds - IntRect dstBounds = dsttex.sample_bounds(dstReq).intersect(clipRect); - // Compute valid source bounds - IntRect srcBounds = srctex.sample_bounds(srcReq, invertY); - // If srcReq is outside the source texture, we need to clip the sampling - // bounds so that we never sample outside valid source bounds. Get texture - // bounds relative to srcReq and scale to dest-space rounding inward, using - // this rect to limit the dest bounds further. - IntRect srcClip = srctex.bounds() - srcReq.origin(); - if (invertY) { - srcClip.invert_y(srcReq.height()); - } - srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true); - dstBounds.intersect(srcClip); - // Check if clipped sampling bounds are empty - if (dstBounds.is_empty()) { - return; - } - - // Calculate source and dest pointers from clamped offsets - int bpp = srctex.bpp(); - int srcStride = srctex.stride(); - int destStride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstReq, dstBounds); - // Clip the source bounds by the destination offset. - int fracX = srcWidth * dstBounds.x0; - int fracY = srcHeight * dstBounds.y0; - srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0); - srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0); - fracX %= dstWidth; - fracY %= dstHeight; - char* src = srctex.sample_ptr(srcReq, srcBounds, invertY); - // Inverted Y must step downward along source rows - if (invertY) { - srcStride = -srcStride; - } - int span = dstBounds.width(); - for (int rows = dstBounds.height(); rows > 0; rows--) { - switch (bpp) { - case 1: - if (srcWidth == dstWidth) - copy_row<COMPOSITE>((uint8_t*)dest, (uint8_t*)src, span); - else - scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint8_t*)src, - srcWidth, span, fracX); - break; - case 2: - if (srcWidth == dstWidth) - copy_row<COMPOSITE>((uint16_t*)dest, (uint16_t*)src, span); - else - scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint16_t*)src, - srcWidth, span, fracX); - break; - case 4: - if (srcWidth == dstWidth) - copy_row<COMPOSITE>((uint32_t*)dest, (uint32_t*)src, span); - else - scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint32_t*)src, - srcWidth, span, fracX); - break; - default: - assert(false); - break; - } - dest += destStride; - // Step source according to height ratio. 
- for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) { - src += srcStride; - } - } -} - -template <bool COMPOSITE> -static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); - unaligned_store(dest, srcpx); - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv)); - partial_store_span(dest, srcpx, span); - } -} - -template <> -void linear_row_blit<true>(uint32_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); - WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - unaligned_store(dest, r); - - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv)); - WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span)); - PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); - partial_store_span(dest, r, span); - } -} - -template <bool COMPOSITE> -static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); - unaligned_store(dest, srcpx); - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - auto srcpx = textureLinearPackedR8(sampler, ivec2(uv)); - partial_store_span(dest, srcpx, span); - } -} - -template <bool COMPOSITE> -static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV, - float srcDU, sampler2D sampler) { - vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); - for (; span >= 4; span -= 4) { - auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); - unaligned_store(dest, srcpx); - dest += 4; - uv.x += 4 * srcDU; - } - if (span > 0) { - auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv)); - partial_store_span(dest, srcpx, span); - } -} - -template <bool COMPOSITE = false> -static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq, - Texture& dsttex, const IntRect& dstReq, - bool invertY, const IntRect& clipRect) { - assert(srctex.internal_format == GL_RGBA8 || - srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8); - assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 && - dsttex.internal_format == GL_RGBA8)); - // Compute valid dest bounds - IntRect dstBounds = dsttex.sample_bounds(dstReq); - dstBounds.intersect(clipRect); - // Check if sampling bounds are empty - if (dstBounds.is_empty()) { - return; - } - // Initialize sampler for source texture - sampler2D_impl sampler; - init_sampler(&sampler, srctex); - sampler.filter = TextureFilter::LINEAR; - // Compute source UVs - vec2_scalar srcUV(srcReq.x0, srcReq.y0); - vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), - float(srcReq.height()) / dstReq.height()); - // Inverted Y must step downward along source rows - if (invertY) { - srcUV.y += srcReq.height(); - srcDUV.y = -srcDUV.y; - } - // Skip to clamped source start - srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); - // Scale UVs by lerp precision - srcUV = 
linearQuantize(srcUV, 128); - srcDUV *= 128.0f; - // Calculate dest pointer from clamped offsets - int bpp = dsttex.bpp(); - int destStride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstReq, dstBounds); - int span = dstBounds.width(); - for (int rows = dstBounds.height(); rows > 0; rows--) { - switch (bpp) { - case 1: - linear_row_blit<COMPOSITE>((uint8_t*)dest, span, srcUV, srcDUV.x, - &sampler); - break; - case 2: - linear_row_blit<COMPOSITE>((uint16_t*)dest, span, srcUV, srcDUV.x, - &sampler); - break; - case 4: - linear_row_blit<COMPOSITE>((uint32_t*)dest, span, srcUV, srcDUV.x, - &sampler); - break; - default: - assert(false); - break; - } - dest += destStride; - srcUV.y += srcDUV.y; - } -} - -extern "C" { - -void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, - GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, - GLbitfield mask, GLenum filter) { - assert(mask == GL_COLOR_BUFFER_BIT); - Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER); - if (!srcfb) return; - Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER); - if (!dstfb) return; - Texture& srctex = ctx->textures[srcfb->color_attachment]; - if (!srctex.buf) return; - Texture& dsttex = ctx->textures[dstfb->color_attachment]; - if (!dsttex.buf) return; - assert(!dsttex.locked); - if (srctex.internal_format != dsttex.internal_format) { - assert(false); - return; - } - // Force flipped Y onto dest coordinates - if (srcY1 < srcY0) { - swap(srcY0, srcY1); - swap(dstY0, dstY1); - } - bool invertY = dstY1 < dstY0; - if (invertY) { - swap(dstY0, dstY1); - } - IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset; - IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset; - if (srcReq.is_empty() || dstReq.is_empty()) { - return; - } - IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()}; - prepare_texture(srctex); - prepare_texture(dsttex, &dstReq); - if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR && - (srctex.internal_format == GL_RGBA8 || srctex.internal_format == GL_R8 || - srctex.internal_format == GL_RG8)) { - linear_blit(srctex, srcReq, dsttex, dstReq, invertY, dstReq); - } else { - scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect); - } -} - -typedef Texture LockedTexture; - -// Lock the given texture to prevent modification. -LockedTexture* LockTexture(GLuint texId) { - Texture& tex = ctx->textures[texId]; - if (!tex.buf) { - assert(tex.buf != nullptr); - return nullptr; - } - if (__sync_fetch_and_add(&tex.locked, 1) == 0) { - // If this is the first time locking the texture, flush any delayed clears. - prepare_texture(tex); - } - return (LockedTexture*)&tex; -} - -// Lock the given framebuffer's color attachment to prevent modification. -LockedTexture* LockFramebuffer(GLuint fboId) { - Framebuffer& fb = ctx->framebuffers[fboId]; - // Only allow locking a framebuffer if it has a valid color attachment. - if (!fb.color_attachment) { - assert(fb.color_attachment != 0); - return nullptr; - } - return LockTexture(fb.color_attachment); -} - -// Reference an already locked resource -void LockResource(LockedTexture* resource) { - if (!resource) { - return; - } - __sync_fetch_and_add(&resource->locked, 1); -} - -// Remove a lock on a texture that has been previously locked -void UnlockResource(LockedTexture* resource) { - if (!resource) { - return; - } - if (__sync_fetch_and_add(&resource->locked, -1) <= 0) { - // The lock should always be non-zero before unlocking. 
- assert(0); - } -} - -// Get the underlying buffer for a locked resource -void* GetResourceBuffer(LockedTexture* resource, int32_t* width, - int32_t* height, int32_t* stride) { - *width = resource->width; - *height = resource->height; - *stride = resource->stride(); - return resource->buf; -} - -// Extension for optimized compositing of textures or framebuffers that may be -// safely used across threads. The source and destination must be locked to -// ensure that they can be safely accessed while the SWGL context might be used -// by another thread. Band extents along the Y axis may be used to clip the -// destination rectangle without effecting the integer scaling ratios. -void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX, - GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, - GLint dstY, GLsizei dstWidth, GLsizei dstHeight, - GLboolean opaque, GLboolean flip, GLenum filter, GLint clipX, - GLint clipY, GLsizei clipWidth, GLsizei clipHeight) { - if (!lockedDst || !lockedSrc) { - return; - } - Texture& srctex = *lockedSrc; - Texture& dsttex = *lockedDst; - assert(srctex.bpp() == 4); - assert(dsttex.bpp() == 4); - - IntRect srcReq = - IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset; - IntRect dstReq = - IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; - // Compute clip rect as relative to the dstReq, as that's the same coords - // as used for the sampling bounds. - IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, - clipY - dstY + clipHeight}; - - if (opaque) { - // Ensure we have rows of at least 2 pixels when using the linear filter - // to avoid overreading the row. - if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) { - linear_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } else { - scale_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } - } else { - if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) { - linear_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } else { - scale_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect); - } - } -} - -} // extern "C" - -// Saturated add helper for YUV conversion. Supported platforms have intrinsics -// to do this natively, but support a slower generic fallback just in case. -static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) { -#if USE_SSE2 - return _mm_adds_epi16(x, y); -#elif USE_NEON - return vqaddq_s16(x, y); -#else - auto r = x + y; - // An overflow occurred if the signs of both inputs x and y did not differ - // but yet the sign of the result did differ. - auto overflow = (~(x ^ y) & (r ^ x)) >> 15; - // If there was an overflow, we need to choose the appropriate limit to clamp - // to depending on whether or not the inputs are negative. - auto limit = (x >> 15) ^ 0x7FFF; - // If we didn't overflow, just use the result, and otherwise, use the limit. - return (~overflow & r) | (overflow & limit); -#endif -} - -// Interleave and packing helper for YUV conversion. During transform by the -// color matrix, the color components are de-interleaved as this format is -// usually what comes out of the planar YUV textures. The components thus need -// to be interleaved before finally getting packed to BGRA format. Alpha is -// forced to be opaque. 
-static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) { - return pack(bit_cast<WideRGBA8>(zip(br, gg))) | - PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; -} - -// clang-format off -// Supports YUV color matrixes of the form: -// [R] [1.1643835616438356, 0.0, rv ] [Y - 16] -// [G] = [1.1643835616438358, -gu, -gv ] x [U - 128] -// [B] [1.1643835616438356, bu, 0.0 ] [V - 128] -// We must be able to multiply a YUV input by a matrix coefficient ranging as -// high as ~2.2 in the U/V cases, where U/V can be signed values between -128 -// and 127. The largest fixed-point representation we can thus support without -// overflowing 16 bit integers leaves us 6 bits of fractional precision while -// also supporting a sign bit. The closest representation of the Y coefficient -// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces -// we support. Conversions can still sometimes overflow the precision and -// require clamping back into range, so we use saturated additions to do this -// efficiently at no extra cost. -// clang-format on -struct YUVMatrix { - // These constants are loaded off the "this" pointer via relative addressing - // modes and should be about as quick to load as directly addressed SIMD - // constant memory. - V8<int16_t> rbCoeffs; - V8<int16_t> gCoeffs; - V8<uint16_t> yScale; - V8<int16_t> yBias; - V8<int16_t> uvBias; - V8<int16_t> brMask; - - // Set the coefficients to cancel out and pass through YUV as GBR. All biases - // are set to zero and the BR-mask is set to remove the contribution of Y to - // the BR channels. Scales are set such that the shift by 6 in convert is - // balanced. - YUVMatrix() - : rbCoeffs(1 << 6), - gCoeffs(0), - yScale(1 << (6 + 1)), - yBias(0), - uvBias(0), - brMask(0) {} - - // Convert matrix coefficients to fixed-point representation. - YUVMatrix(double rv, double gu, double gv, double bu) - : rbCoeffs( - zip(I16(int16_t(bu * 64.0 + 0.5)), I16(int16_t(rv * 64.0 + 0.5)))), - gCoeffs(zip(I16(-int16_t(gu * -64.0 + 0.5)), - I16(-int16_t(gv * -64.0 + 0.5)))), - yScale(2 * 74 + 1), - yBias(int16_t(-16 * 74.5) + (1 << 5)), - uvBias(-128), - brMask(-1) {} - - ALWAYS_INLINE PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) const { - // Bias Y values by -16 and multiply by 74.5. Add 2^5 offset to round to - // nearest 2^6. Note that we have to use an unsigned multiply with a 2x - // scale to represent a fractional scale and to avoid shifting with the sign - // bit. - yy = bit_cast<V8<int16_t>>((bit_cast<V8<uint16_t>>(yy) * yScale) >> 1) + - yBias; - - // Bias U/V values by -128. - uv += uvBias; - - // Compute (R, B) = (74.5*Y + rv*V, 74.5*Y + bu*U) - auto br = rbCoeffs * uv; - br = addsat(yy & brMask, br); - br >>= 6; - - // Compute G = 74.5*Y + -gu*U + -gv*V - auto gg = gCoeffs * uv; - gg = addsat( - yy, - addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16))); - gg >>= 6; - - // Interleave B/R and G values. Force alpha to opaque. 
- return packYUV(gg, br); - } -}; - -enum YUVColorSpace { REC_601 = 0, REC_709, REC_2020, IDENTITY }; - -static const YUVMatrix yuvMatrix[IDENTITY + 1] = { - // clang-format off -// From Rec601: -// [R] [1.1643835616438356, 0.0, 1.5960267857142858 ] [Y - 16] -// [G] = [1.1643835616438358, -0.3917622900949137, -0.8129676472377708 ] x [U - 128] -// [B] [1.1643835616438356, 2.017232142857143, 8.862867620416422e-17] [V - 128] - {1.5960267857142858, -0.3917622900949137, -0.8129676472377708, 2.017232142857143}, - -// From Rec709: -// [R] [1.1643835616438356, 0.0, 1.7927410714285714] [Y - 16] -// [G] = [1.1643835616438358, -0.21324861427372963, -0.532909328559444 ] x [U - 128] -// [B] [1.1643835616438356, 2.1124017857142854, 0.0 ] [V - 128] - {1.7927410714285714, -0.21324861427372963, -0.532909328559444, 2.1124017857142854}, - -// From Re2020: -// [R] [1.16438356164384, 0.0, 1.678674107142860 ] [Y - 16] -// [G] = [1.16438356164384, -0.187326104219343, -0.650424318505057 ] x [U - 128] -// [B] [1.16438356164384, 2.14177232142857, 0.0 ] [V - 128] - {1.678674107142860, -0.187326104219343, -0.650424318505057, 2.14177232142857}, - -// Identity -// [R] [V] -// [G] = [Y] -// [B] [U] - {}, - // clang-format on -}; - -// Helper function for textureLinearRowR8 that samples horizontal taps and -// combines them based on Y fraction with next row. -template <typename S> -static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix, - int32_t offsety, - int32_t stridey, - int16_t fracy) { - uint8_t* buf = (uint8_t*)sampler->buf + offsety; - auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]); - auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]); - auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]); - auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]); - auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>); - buf += stridey; - auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]); - auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]); - auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]); - auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]); - auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>); - abcd0 += ((abcd1 - abcd0) * fracy) >> 7; - return abcd0; -} - -// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes -// constant Y and returns a duplicate of the result interleaved with itself -// to aid in later YUV transformation. -template <typename S> -static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix, int32_t offsety, - int32_t stridey, int16_t fracy) { - assert(sampler->format == TextureFormat::R8); - - // Calculate X fraction and clamp X offset into range. - I32 fracx = ix; - ix >>= 7; - fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F; - ix = clampCoord(ix, sampler->width - 1); - - // Load the sample taps and combine rows. - auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy); - - // Unzip the result and do final horizontal multiply-add base on X fraction. - auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6); - auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7); - abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7; - - // The final result is the packed values interleaved with a duplicate of - // themselves. - return abcdl; -} - -// Optimized version of textureLinearPackedR8 for paired U/V R8 textures. -// Since the two textures have the same dimensions and stride, the addressing -// math can be shared between both samplers. 
This also allows a coalesced -// multiply in the final stage by packing both U/V results into a single -// operation. -template <typename S> -static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2, - I32 ix, int32_t offsety, - int32_t stridey, - int16_t fracy) { - assert(sampler->format == TextureFormat::R8 && - sampler2->format == TextureFormat::R8); - assert(sampler->width == sampler2->width && - sampler->height == sampler2->height); - assert(sampler->stride == sampler2->stride); - - // Calculate X fraction and clamp X offset into range. - I32 fracx = ix; - ix >>= 7; - fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F; - ix = clampCoord(ix, sampler->width - 1); - - // Load the sample taps for the first sampler and combine rows. - auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy); - - // Load the sample taps for the second sampler and combine rows. - auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy); - - // We are left with a result vector for each sampler with values for adjacent - // pixels interleaved together in each. We need to unzip these values so that - // we can do the final horizontal multiply-add based on the X fraction. - auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14); - auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15); - abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7; - - // The final result is the packed values for the first sampler interleaved - // with the packed values for the second sampler. - return abcdxyzwl; -} - -// Casting to int loses some precision while stepping that can offset the -// image, so shift the values by some extra bits of precision to minimize -// this. We support up to 16 bits of image size, 7 bits of quantization, -// and 1 bit for sign, which leaves 8 bits left for extra precision. -const int STEP_BITS = 8; - -// Optimized version of textureLinearPackedR8 for Y R8 texture with -// half-resolution paired U/V R8 textures. This allows us to more efficiently -// pack YUV samples into vectors to substantially reduce math operations even -// further. -template <bool BLEND> -static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow, - I32 yU, int32_t yDU, int32_t yStrideV, - int16_t yFracV, uint8_t* cRow1, - uint8_t* cRow2, I32 cU, int32_t cDU, - int32_t cStrideV, int16_t cFracV, - const YUVMatrix& colorSpace) { - // As much as possible try to utilize the fact that we're only using half - // the UV samples to combine Y and UV samples into single vectors. Here we - // need to initialize several useful vector quantities for stepping fractional - // offsets. For the UV samples, we take the average of the first+second and - // third+fourth samples in a chunk which conceptually correspond to offsets - // 0.5 and 1.5 (in 0..2 range). This allows us to reconstruct intermediate - // samples 0.25, 0.75, 1.25, and 1.75 later. X fraction is shifted over into - // the top 7 bits of an unsigned short so that we can mask off the exact - // fractional bits we need to blend merely by right shifting them into - // position. - cU = (cU.xzxz + cU.ywyw) >> 1; - auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>) - << (16 - (STEP_BITS + 7)); - auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7)); - auto ycFracV = combine(I16(yFracV), I16(cFracV)); - I32 yI = yU >> (STEP_BITS + 7); - I32 cI = cU >> (STEP_BITS + 7); - // Load initial combined YUV samples for each row and blend them. 
- auto ycSrc0 = - CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]), - unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))), - V8<int16_t>); - auto ycSrc1 = CONVERT( - combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]), - unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))), - V8<int16_t>); - auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7); - - // Here we shift in results from the next sample while caching results from - // the previous sample. This allows us to reduce the multiplications in the - // inner loop down to only two since we just need to blend the new samples - // horizontally and then vertically once each. - for (uint32_t* end = dest + span; dest < end; dest += 4) { - yU += yDU; - I32 yIn = yU >> (STEP_BITS + 7); - cU += cDU; - I32 cIn = cU >> (STEP_BITS + 7); - // Load combined YUV samples for the next chunk on each row and blend them. - auto ycSrc0n = - CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]), - unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))), - V8<int16_t>); - auto ycSrc1n = CONVERT( - combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]), - combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]), - unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))), - V8<int16_t>); - auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7); - - // The source samples for the chunk may not match the actual tap offsets. - // Since we're upscaling, we know the tap offsets fall within all the - // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar, - // instead we do laborious shuffling here for the Y samples and then the UV - // samples. - auto yshuf = lowHalf(ycSrc); - auto yshufn = - SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn), - 1, 2, 3, 4); - if (yI.y == yI.x) { - yshuf = yshuf.xxyz; - yshufn = yshufn.xxyz; - } - if (yI.z == yI.y) { - yshuf = yshuf.xyyz; - yshufn = yshufn.xyyz; - } - if (yI.w == yI.z) { - yshuf = yshuf.xyzz; - yshufn = yshufn.xyzz; - } - - auto cshuf = highHalf(ycSrc); - auto cshufn = - SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn), - 1, 4, 3, 6); - if (cI.y == cI.x) { - cshuf = cshuf.xxzz; - cshufn = cshufn.xxzz; - } - - // After shuffling, combine the Y and UV samples back into a single vector - // for blending. Shift X fraction into position as unsigned to mask off top - // bits and get rid of low bits to avoid multiplication overflow. - auto yuvPx = combine(yshuf, cshuf); - yuvPx += ((combine(yshufn, cshufn) - yuvPx) * - bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >> - 7; - - // Cache the new samples as the current samples on the next iteration. - ycSrc = ycSrcn; - ycFracX += ycFracDX; - yI = yIn; - cI = cIn; - - // De-interleave the Y and UV results. We need to average the UV results - // to produce values for intermediate samples. Taps for UV were collected at - // offsets 0.5 and 1.5, such that if we take a quarter of the difference - // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples, - // we can estimate samples 0.25, 0.75, 1.25, and 1.75. 
- auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3); - auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) + - ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) - - SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >> - 2); - - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx)); - } -} - -// This is the inner loop driver of CompositeYUV that processes an axis-aligned -// YUV span, dispatching based on appropriate format and scaling. This is also -// reused by blendYUV to accelerate some cases of texture sampling in the -// shader. -template <bool BLEND = false> -static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY, - const vec2_scalar& srcUV, float srcDU, - sampler2DRect samplerU, sampler2DRect samplerV, - const vec2_scalar& chromaUV, float chromaDU, - int colorDepth, const YUVMatrix& colorSpace) { - // Calculate varying and constant interp data for Y plane. - I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS)); - int32_t yV = int32_t(srcUV.y); - - // Calculate varying and constant interp data for chroma planes. - I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS)); - int32_t cV = int32_t(chromaUV.y); - - // We need to skip 4 pixels per chunk. - int32_t yDU = int32_t((4 << STEP_BITS) * srcDU); - int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU); - - if (samplerY->width < 2 || samplerU->width < 2) { - // If the source row has less than 2 pixels, it's not safe to use a linear - // filter because it may overread the row. Just convert the single pixel - // with nearest filtering and fill the row with it. - I16 yuv = CONVERT( - round_pixel((Float){texelFetch(samplerY, ivec2(srcUV)).x.x, - texelFetch(samplerU, ivec2(chromaUV)).x.x, - texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f}), - I16); - commit_solid_span<BLEND>( - dest, - unpack(colorSpace.convert(V8<int16_t>(yuv.x), - zip(I16(yuv.y), I16(yuv.z)))), - span); - } else if (samplerY->format == TextureFormat::R16) { - // Sample each YUV plane, rescale it to fit in low 8 bits of word, and - // then transform them by the appropriate color space. - assert(colorDepth > 8); - // Need to right shift the sample by the amount of bits over 8 it - // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit - // of precision at the low end already, hence 1 is subtracted from the - // color depth. - int rescaleBits = (colorDepth - 1) - 8; - for (; span >= 4; span -= 4) { - auto yPx = - textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >> - rescaleBits; - auto uPx = - textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - auto vPx = - textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - commit_blend_span<BLEND>( - dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx))); - dest += 4; - yU += yDU; - cU += cDU; - } - if (span > 0) { - // Handle any remaining pixels... - auto yPx = - textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >> - rescaleBits; - auto uPx = - textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - auto vPx = - textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >> - rescaleBits; - commit_blend_span<BLEND>( - dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span); - } - } else { - assert(samplerY->format == TextureFormat::R8); - assert(colorDepth == 8); - - // Calculate varying and constant interp data for Y plane. 
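// Worked example for the R16 path above, assuming 10-bit video
// (colorDepth == 10); the helper is illustrative. textureLinearUnpackedR16
// has already given up one bit of precision while filtering, so a filtered
// sample occupies colorDepth - 1 significant bits and needs
// (colorDepth - 1) - 8 further right shifts to fit the low 8 bits that the
// YUV color matrix expects.
static inline int rescale_filtered_r16(int filtered, int colorDepth) {
  int rescaleBits = (colorDepth - 1) - 8;  // 1 for 10-bit, 3 for 12-bit input
  return filtered >> rescaleBits;
}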
- int16_t yFracV = yV & 0x7F; - yV >>= 7; - int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride; - int32_t yStrideV = - yV >= 0 && yV < int32_t(samplerY->height) - 1 ? samplerY->stride : 0; - - // Calculate varying and constant interp data for chroma planes. - int16_t cFracV = cV & 0x7F; - cV >>= 7; - int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride; - int32_t cStrideV = - cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0; - - // If we're sampling the UV planes at half the resolution of the Y plane, - // then try to use half resolution fast-path. - if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) && - cDU <= (2 << (STEP_BITS + 7))) { - // Ensure that samples don't fall outside of the valid bounds of each - // planar texture. Step until the initial X coordinates are positive. - for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) { - auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, - yStrideV, yFracV); - auto uvPx = textureLinearRowPairedR8( - samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV); - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx)); - dest += 4; - yU += yDU; - cU += cDU; - } - // Calculate the number of aligned chunks that we can step inside the - // bounds of each planar texture without overreading. - int inside = min( - min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU, - (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) * - 4, - span & ~3); - if (inside > 0) { - uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV; - uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV; - uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV; - upscaleYUV42R8<BLEND>(dest, inside, yRow, yU, yDU, yStrideV, yFracV, - cRow1, cRow2, cU, cDU, cStrideV, cFracV, - colorSpace); - span -= inside; - dest += inside; - yU += (inside / 4) * yDU; - cU += (inside / 4) * cDU; - } - // If there are any remaining chunks that weren't inside, handle them - // below. - } - for (; span >= 4; span -= 4) { - // Sample each YUV plane and then transform them by the appropriate - // color space. - auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, - yStrideV, yFracV); - auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS, - cOffsetV, cStrideV, cFracV); - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx)); - dest += 4; - yU += yDU; - cU += cDU; - } - if (span > 0) { - // Handle any remaining pixels... 
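// Scalar sketch of the "inside" bound computed above for one plane; names and
// parameters are illustrative. It answers: starting from the fixed-point
// coordinate u (STEP_BITS + 7 fractional bits) and advancing by du per
// 4-pixel chunk, how many pixels can a 4-wide load cover before its window
// would start past column width - 4? The result may be zero or negative when
// u is already beyond the safe region, in which case the fast path is skipped.
static inline int pixels_inside(int u, int du, int width, int span) {
  const int STEP_BITS = 8;
  int last_safe = (width - 4) << (STEP_BITS + 7);  // last in-bounds chunk start
  int chunks = (last_safe - u) / du;
  int pixels = chunks * 4;
  int aligned = span & ~3;  // only whole chunks of the remaining span
  return pixels < aligned ? pixels : aligned;
}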
- auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV, - yStrideV, yFracV); - auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS, - cOffsetV, cStrideV, cFracV); - commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx), span); - } - } -} - -static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex, - YUVColorSpace colorSpace, int colorDepth, - const IntRect& srcReq, Texture& dsttex, - const IntRect& dstReq, bool invertY, - const IntRect& clipRect) { - // Compute valid dest bounds - IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY); - dstBounds.intersect(clipRect); - // Check if sampling bounds are empty - if (dstBounds.is_empty()) { - return; - } - // Initialize samplers for source textures - sampler2DRect_impl sampler[3]; - init_sampler(&sampler[0], ytex); - init_sampler(&sampler[1], utex); - init_sampler(&sampler[2], vtex); - - // Compute source UVs - vec2_scalar srcUV(srcReq.x0, srcReq.y0); - vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), - float(srcReq.height()) / dstReq.height()); - // Inverted Y must step downward along source rows - if (invertY) { - srcUV.y += srcReq.height(); - srcDUV.y = -srcDUV.y; - } - // Skip to clamped source start - srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f); - // Calculate separate chroma UVs for chroma planes with different scale - vec2_scalar chromaScale(float(utex.width) / ytex.width, - float(utex.height) / ytex.height); - vec2_scalar chromaUV = srcUV * chromaScale; - vec2_scalar chromaDUV = srcDUV * chromaScale; - // Scale UVs by lerp precision. If the row has only 1 pixel, then don't - // quantize so that we can use nearest filtering instead to avoid overreads. - if (ytex.width >= 2 && utex.width >= 2) { - srcUV = linearQuantize(srcUV, 128); - srcDUV *= 128.0f; - chromaUV = linearQuantize(chromaUV, 128); - chromaDUV *= 128.0f; - } - // Calculate dest pointer from clamped offsets - int destStride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstReq, dstBounds); - int span = dstBounds.width(); - for (int rows = dstBounds.height(); rows > 0; rows--) { - linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x, - &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth, - yuvMatrix[colorSpace]); - dest += destStride; - srcUV.y += srcDUV.y; - chromaUV.y += chromaDUV.y; - } -} - -extern "C" { - -// Extension for compositing a YUV surface represented by separate YUV planes -// to a BGRA destination. The supplied color space is used to determine the -// transform from YUV to BGRA after sampling. -void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY, - LockedTexture* lockedU, LockedTexture* lockedV, - YUVColorSpace colorSpace, GLuint colorDepth, GLint srcX, - GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX, - GLint dstY, GLsizei dstWidth, GLsizei dstHeight, - GLboolean flip, GLint clipX, GLint clipY, GLsizei clipWidth, - GLsizei clipHeight) { - if (!lockedDst || !lockedY || !lockedU || !lockedV) { - return; - } - if (colorSpace > IDENTITY) { - assert(false); - return; - } - Texture& ytex = *lockedY; - Texture& utex = *lockedU; - Texture& vtex = *lockedV; - Texture& dsttex = *lockedDst; - // All YUV planes must currently be represented by R8 or R16 textures. - // The chroma (U/V) planes must have matching dimensions. 
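// Minimal sketch of the per-axis interpolation setup in linear_convert_yuv
// above, shown for a single axis; the struct and function are illustrative.
// The step is the ratio of source to destination extent, an inverted blit
// starts at the source bottom and steps with a negated increment, and the
// start is then advanced to the centre of the first unclipped destination
// pixel.
struct AxisInterp {
  float start;
  float step;
};
static inline AxisInterp setup_axis(float srcOrigin, float srcSize,
                                    float dstSize, float dstClipStart,
                                    bool invert) {
  AxisInterp a;
  a.step = srcSize / dstSize;
  a.start = srcOrigin;
  if (invert) {
    a.start += srcSize;  // begin at the far edge of the source
    a.step = -a.step;    // and walk back toward the origin
  }
  a.start += a.step * (dstClipStart + 0.5f);  // centre of first output pixel
  return a;
}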
- assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp()); - assert((ytex.bpp() == 1 && colorDepth == 8) || - (ytex.bpp() == 2 && colorDepth > 8)); - // assert(ytex.width == utex.width && ytex.height == utex.height); - assert(utex.width == vtex.width && utex.height == vtex.height); - assert(ytex.offset == utex.offset && ytex.offset == vtex.offset); - assert(dsttex.bpp() == 4); - - IntRect srcReq = - IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset; - IntRect dstReq = - IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset; - // Compute clip rect as relative to the dstReq, as that's the same coords - // as used for the sampling bounds. - IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth, - clipY - dstY + clipHeight}; - // For now, always use a linear filter path that would be required for - // scaling. Further fast-paths for non-scaled video might be desirable in the - // future. - linear_convert_yuv(ytex, utex, vtex, colorSpace, colorDepth, srcReq, dsttex, - dstReq, flip, clipRect); -} - -} // extern "C" diff --git a/third_party/webrender/swgl/src/gl.cc b/third_party/webrender/swgl/src/gl.cc index 6e214547421..f4a69752dde 100644 --- a/third_party/webrender/swgl/src/gl.cc +++ b/third_party/webrender/swgl/src/gl.cc @@ -22,65 +22,15 @@ # define debugf(...) printf(__VA_ARGS__) #endif -// #define PRINT_TIMINGS - #ifdef _WIN32 # define ALWAYS_INLINE __forceinline -# define NO_INLINE __declspec(noinline) - -// Including Windows.h brings a huge amount of namespace polution so just -// define a couple of things manually -typedef int BOOL; -# define WINAPI __stdcall -# define DECLSPEC_IMPORT __declspec(dllimport) -# define WINBASEAPI DECLSPEC_IMPORT -typedef unsigned long DWORD; -typedef long LONG; -typedef __int64 LONGLONG; -# define DUMMYSTRUCTNAME - -typedef union _LARGE_INTEGER { - struct { - DWORD LowPart; - LONG HighPart; - } DUMMYSTRUCTNAME; - struct { - DWORD LowPart; - LONG HighPart; - } u; - LONGLONG QuadPart; -} LARGE_INTEGER; -extern "C" { -WINBASEAPI BOOL WINAPI -QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount); - -WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); -} - #else -// GCC is slower when dealing with always_inline, especially in debug builds. -// When using Clang, use always_inline more aggressively. -# if defined(__clang__) || defined(NDEBUG) -# define ALWAYS_INLINE __attribute__((always_inline)) inline -# else -# define ALWAYS_INLINE inline -# endif -# define NO_INLINE __attribute__((noinline)) -#endif - -// Some functions may cause excessive binary bloat if inlined in debug or with -// GCC builds, so use PREFER_INLINE on these instead of ALWAYS_INLINE. 
-#if defined(__clang__) && defined(NDEBUG) -# define PREFER_INLINE ALWAYS_INLINE -#else -# define PREFER_INLINE inline +# define ALWAYS_INLINE __attribute__((always_inline)) inline #endif #define UNREACHABLE __builtin_unreachable() -#define UNUSED [[maybe_unused]] - -#define FALLTHROUGH [[fallthrough]] +#define UNUSED __attribute__((unused)) #ifdef MOZILLA_CLIENT # define IMPLICIT __attribute__((annotate("moz_implicit"))) @@ -91,32 +41,19 @@ WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); #include "gl_defs.h" #include "glsl.h" #include "program.h" -#include "texture.h" using namespace glsl; -typedef ivec2_scalar IntPoint; - struct IntRect { int x0; int y0; int x1; int y1; - IntRect() : x0(0), y0(0), x1(0), y1(0) {} - IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {} - IntRect(IntPoint origin, IntPoint size) - : x0(origin.x), - y0(origin.y), - x1(origin.x + size.x), - y1(origin.y + size.y) {} - int width() const { return x1 - x0; } int height() const { return y1 - y0; } bool is_empty() const { return width() <= 0 || height() <= 0; } - IntPoint origin() const { return IntPoint(x0, y0); } - bool same_size(const IntRect& o) const { return width() == o.width() && height() == o.height(); } @@ -133,12 +70,6 @@ struct IntRect { return *this; } - IntRect intersection(const IntRect& o) { - IntRect result = *this; - result.intersect(o); - return result; - } - // Scale from source-space to dest-space, optionally rounding inward IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight, bool roundIn = false) { @@ -156,60 +87,15 @@ struct IntRect { swap(y0, y1); } - IntRect& offset(const IntPoint& o) { - x0 += o.x; - y0 += o.y; - x1 += o.x; - y1 += o.y; + IntRect& offset(int dx, int dy) { + x0 += dx; + y0 += dy; + x1 += dx; + y1 += dy; return *this; } - - IntRect operator+(const IntPoint& o) const { - return IntRect(*this).offset(o); - } - IntRect operator-(const IntPoint& o) const { - return IntRect(*this).offset(-o); - } }; -typedef vec2_scalar Point2D; -typedef vec4_scalar Point3D; - -struct IntRange { - int start; - int end; - - int len() const { return end - start; } - - IntRange intersect(IntRange r) const { - return {max(start, r.start), min(end, r.end)}; - } -}; - -struct FloatRange { - float start; - float end; - - float clip(float x) const { return clamp(x, start, end); } - - FloatRange clip(FloatRange r) const { return {clip(r.start), clip(r.end)}; } - - FloatRange merge(FloatRange r) const { - return {min(start, r.start), max(end, r.end)}; - } - - IntRange round() const { - return {int(floor(start + 0.5f)), int(floor(end + 0.5f))}; - } - - IntRange round_out() const { return {int(floor(start)), int(ceil(end))}; } -}; - -template <typename P> -static inline FloatRange x_range(P p0, P p1) { - return {min(p0.x, p1.x), max(p0.x, p1.x)}; -} - struct VertexAttrib { size_t size = 0; // in bytes GLenum type = 0; @@ -237,18 +123,12 @@ static int bytes_for_internal_format(GLenum internal_format) { case GL_R8: case GL_RED: return 1; - case GL_RG8: - case GL_RG: - return 2; case GL_DEPTH_COMPONENT: case GL_DEPTH_COMPONENT16: + return 2; case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: return 4; - case GL_RGB_RAW_422_APPLE: - return 2; - case GL_R16: - return 2; default: debugf("internal format: %x\n", internal_format); assert(0); @@ -268,12 +148,6 @@ static TextureFormat gl_format_to_texture_format(int type) { return TextureFormat::RGBA8; case GL_R8: return TextureFormat::R8; - case GL_RG8: - return TextureFormat::RG8; - case 
GL_R16: - return TextureFormat::R16; - case GL_RGB_RAW_422_APPLE: - return TextureFormat::YUV422; default: assert(0); return TextureFormat::RGBA8; @@ -287,34 +161,19 @@ struct Query { struct Buffer { char* buf = nullptr; size_t size = 0; - size_t capacity = 0; bool allocate(size_t new_size) { - // If the size remains unchanged, don't allocate anything. - if (new_size == size) { - return false; - } - // If the new size is within the existing capacity of the buffer, just - // reuse the existing buffer. - if (new_size <= capacity) { - size = new_size; - return true; - } - // Otherwise we need to reallocate the buffer to hold up to the requested - // larger size. - char* new_buf = (char*)realloc(buf, new_size); - assert(new_buf); - if (!new_buf) { - // If we fail, null out the buffer rather than leave around the old - // allocation state. + if (new_size != size) { + char* new_buf = (char*)realloc(buf, new_size); + assert(new_buf); + if (new_buf) { + buf = new_buf; + size = new_size; + return true; + } cleanup(); - return false; } - // The reallocation succeeded, so install the buffer. - buf = new_buf; - size = new_size; - capacity = new_size; - return true; + return false; } void cleanup() { @@ -322,7 +181,6 @@ struct Buffer { free(buf); buf = nullptr; size = 0; - capacity = 0; } } @@ -331,6 +189,7 @@ struct Buffer { struct Framebuffer { GLuint color_attachment = 0; + GLint layer = 0; GLuint depth_attachment = 0; }; @@ -364,32 +223,17 @@ struct Texture { GLenum internal_format = 0; int width = 0; int height = 0; + int depth = 0; char* buf = nullptr; size_t buf_size = 0; - uint32_t buf_stride = 0; - uint8_t buf_bpp = 0; GLenum min_filter = GL_NEAREST; GLenum mag_filter = GL_LINEAR; - // The number of active locks on this texture. If this texture has any active - // locks, we need to disallow modifying or destroying the texture as it may - // be accessed by other threads where modifications could lead to races. - int32_t locked = 0; - // When used as an attachment of a framebuffer, rendering to the texture - // behaves as if it is located at the given offset such that the offset is - // subtracted from all transformed vertexes after the viewport is applied. - IntPoint offset; enum FLAGS { - // If the buffer is internally-allocated by SWGL SHOULD_FREE = 1 << 1, - // If the buffer has been cleared to initialize it. Currently this is only - // utilized by depth buffers which need to know when depth runs have reset - // to a valid row state. When unset, the depth runs may contain garbage. - CLEARED = 1 << 2, }; int flags = SHOULD_FREE; bool should_free() const { return bool(flags & SHOULD_FREE); } - bool cleared() const { return bool(flags & CLEARED); } void set_flag(int flag, bool val) { if (val) { @@ -398,14 +242,7 @@ struct Texture { flags &= ~flag; } } - void set_should_free(bool val) { - // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we - // might accidentally mistakenly realloc an externally allocated buffer as - // if it were an internally allocated one. - assert(!buf); - set_flag(SHOULD_FREE, val); - } - void set_cleared(bool val) { set_flag(CLEARED, val); } + void set_should_free(bool val) { set_flag(SHOULD_FREE, val); } // Delayed-clearing state. 
When a clear of an FB is requested, we don't // immediately clear each row, as the rows may be subsequently overwritten @@ -418,9 +255,6 @@ struct Texture { uint32_t clear_val = 0; uint32_t* cleared_rows = nullptr; - void init_depth_runs(uint32_t z); - void fill_depth_runs(uint32_t z, const IntRect& scissor); - void enable_delayed_clear(uint32_t val) { delay_clear = height; clear_val = val; @@ -441,88 +275,40 @@ struct Texture { } } - int bpp() const { return buf_bpp; } - void set_bpp() { buf_bpp = bytes_for_internal_format(internal_format); } + int bpp() const { return bytes_for_internal_format(internal_format); } - size_t stride() const { return buf_stride; } - void set_stride() { buf_stride = aligned_stride(buf_bpp * width); } - - // Set an external backing buffer of this texture. - void set_buffer(void* new_buf, size_t new_stride) { - assert(!should_free()); - // Ensure that the supplied stride is at least as big as the row data and - // is aligned to the smaller of either the BPP or word-size. We need to at - // least be able to sample data from within a row and sample whole pixels - // of smaller formats without risking unaligned access. - set_bpp(); - set_stride(); - assert(new_stride >= size_t(bpp() * width) && - new_stride % min(bpp(), sizeof(uint32_t)) == 0); + size_t stride(int b = 0, int min_width = 0) const { + return aligned_stride((b ? b : bpp()) * max(width, min_width)); + } - buf = (char*)new_buf; - buf_size = 0; - buf_stride = new_stride; + size_t layer_stride(int b = 0, int min_width = 0, int min_height = 0) const { + return stride(b ? b : bpp(), min_width) * max(height, min_height); } bool allocate(bool force = false, int min_width = 0, int min_height = 0) { - assert(!locked); // Locked textures shouldn't be reallocated - // If we get here, some GL API call that invalidates the texture was used. - // Mark the buffer as not-cleared to signal this. - set_cleared(false); - // Check if there is either no buffer currently or if we forced validation - // of the buffer size because some dimension might have changed. if ((!buf || force) && should_free()) { - // Initialize the buffer's BPP and stride, since they may have changed. - set_bpp(); - set_stride(); - // Compute new size based on the maximum potential stride, rather than - // the current stride, to hopefully avoid reallocations when size would - // otherwise change too much... - size_t max_stride = max(buf_stride, aligned_stride(buf_bpp * min_width)); - size_t size = max_stride * max(height, min_height); - if ((!buf && size > 0) || size > buf_size) { + size_t size = layer_stride(bpp(), min_width, min_height) * max(depth, 1); + if (!buf || size > buf_size) { // Allocate with a SIMD register-sized tail of padding at the end so we // can safely read or write past the end of the texture with SIMD ops. - // Currently only the flat Z-buffer texture needs this padding due to - // full-register loads and stores in check_depth and discard_depth. In - // case some code in the future accidentally uses a linear filter on a - // texture with less than 2 pixels per row, we also add this padding - // just to be safe. All other texture types and use-cases should be - // safe to omit padding. - size_t padding = - internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2 - ? sizeof(Float) - : 0; - char* new_buf = (char*)realloc(buf, size + padding); + char* new_buf = (char*)realloc(buf, size + sizeof(Float)); assert(new_buf); if (new_buf) { - // Successfully reallocated the buffer, so go ahead and set it. 
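// Hedged sketch of the external-stride validation above. aligned_stride is
// defined elsewhere in SWGL, so a simple word-aligned stand-in is used here
// to keep the example self-contained; both helper names are illustrative. An
// externally supplied stride only has to cover one full row of pixels and be
// aligned to the smaller of the pixel size and a 32-bit word, so whole pixels
// of the narrow formats can always be loaded without unaligned access.
static inline unsigned word_aligned_stride(unsigned row_bytes) {
  return (row_bytes + 3u) & ~3u;  // stand-in for SWGL's aligned_stride
}
static inline bool external_stride_ok(unsigned stride, unsigned bpp,
                                      unsigned width) {
  unsigned align = bpp < 4 ? bpp : 4;  // min(bpp, sizeof(uint32_t))
  return stride >= bpp * width && stride % align == 0;
}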
buf = new_buf; buf_size = size; return true; } - // Allocation failed, so ensure we don't leave stale buffer state. cleanup(); } } - // Nothing changed... return false; } void cleanup() { - assert(!locked); // Locked textures shouldn't be destroyed - if (buf) { - // If we need to toggle SHOULD_FREE state, ensure that buf is nulled out, - // regardless of whether we internally allocated it. This will prevent us - // from wrongly treating buf as having been internally allocated for when - // we go to realloc if it actually was externally allocted. - if (should_free()) { - free(buf); - } + if (buf && should_free()) { + free(buf); buf = nullptr; buf_size = 0; - buf_bpp = 0; - buf_stride = 0; } disable_delayed_clear(); } @@ -530,41 +316,44 @@ struct Texture { ~Texture() { cleanup(); } IntRect bounds() const { return IntRect{0, 0, width, height}; } - IntRect offset_bounds() const { return bounds() + offset; } // Find the valid sampling bounds relative to the requested region IntRect sample_bounds(const IntRect& req, bool invertY = false) const { - IntRect bb = bounds().intersect(req) - req.origin(); + IntRect bb = bounds().intersect(req).offset(-req.x0, -req.y0); if (invertY) bb.invert_y(req.height()); return bb; } // Get a pointer for sampling at the given offset - char* sample_ptr(int x, int y) const { - return buf + y * stride() + x * bpp(); + char* sample_ptr(int x, int y, int z, int bpp, size_t stride) const { + return buf + (height * z + y) * stride + x * bpp; + } + + char* sample_ptr(int x, int y, int z, int bpp) const { + return sample_ptr(x, y, z, bpp, stride(bpp)); + } + + char* sample_ptr(int x, int y, int z) const { + return sample_ptr(x, y, z, bpp()); } // Get a pointer for sampling the requested region and limit to the provided // sampling bounds - char* sample_ptr(const IntRect& req, const IntRect& bounds, + char* sample_ptr(const IntRect& req, const IntRect& bounds, int z, bool invertY = false) const { // Offset the sample pointer by the clamped bounds int x = req.x0 + bounds.x0; // Invert the Y offset if necessary int y = invertY ? req.y1 - 1 - bounds.y0 : req.y0 + bounds.y0; - return sample_ptr(x, y); + return sample_ptr(x, y, z); } }; -// The last vertex attribute is reserved as a null attribute in case a vertex -// attribute is used without being set. -#define MAX_ATTRIBS 17 -#define NULL_ATTRIB 16 +#define MAX_ATTRIBS 16 +#define NULL_ATTRIB 15 struct VertexArray { VertexAttrib attribs[MAX_ATTRIBS]; int max_attrib = -1; - // The GL spec defines element array buffer binding to be part of VAO state. - GLuint element_array_buffer_binding = 0; void validate(); }; @@ -580,67 +369,33 @@ struct Program { FragmentShaderImpl* frag_impl = nullptr; bool deleted = false; - ~Program() { delete impl; } + ~Program() { + delete impl; + } }; -// clang-format off -// Fully-expand GL defines while ignoring more than 4 suffixes +// for GL defines to fully expand #define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w -// Generate a blend key enum symbol -#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0) -#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_BLEND_KEY(...) CONCAT_KEY(AA_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_MASK_BLEND_KEY(...) CONCAT_KEY(AA_MASK_BLEND_, __VA_ARGS__, 0, 0, 0) - -// Utility macro to easily generate similar code for all implemented blend modes +#define BLEND_KEY(...) 
CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0) #define FOR_EACH_BLEND_KEY(macro) \ - macro(GL_ONE, GL_ZERO, 0, 0) \ - macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE, 0, 0) \ - macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ - macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \ - macro(GL_MIN, 0, 0, 0) \ - macro(GL_MAX, 0, 0, 0) \ - macro(GL_MULTIPLY_KHR, 0, 0, 0) \ - macro(GL_SCREEN_KHR, 0, 0, 0) \ - macro(GL_OVERLAY_KHR, 0, 0, 0) \ - macro(GL_DARKEN_KHR, 0, 0, 0) \ - macro(GL_LIGHTEN_KHR, 0, 0, 0) \ - macro(GL_COLORDODGE_KHR, 0, 0, 0) \ - macro(GL_COLORBURN_KHR, 0, 0, 0) \ - macro(GL_HARDLIGHT_KHR, 0, 0, 0) \ - macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \ - macro(GL_DIFFERENCE_KHR, 0, 0, 0) \ - macro(GL_EXCLUSION_KHR, 0, 0, 0) \ - macro(GL_HSL_HUE_KHR, 0, 0, 0) \ - macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \ - macro(GL_HSL_COLOR_KHR, 0, 0, 0) \ - macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) \ - macro(SWGL_BLEND_DROP_SHADOW, 0, 0, 0) \ - macro(SWGL_BLEND_SUBPIXEL_TEXT, 0, 0, 0) + macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE) \ + macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) macro( \ + GL_ZERO, GL_SRC_COLOR, 0, 0) macro(GL_ONE, GL_ONE, 0, 0) \ + macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ + macro(GL_ONE, GL_ZERO, 0, 0) macro( \ + GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ + macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, \ + 0, 0) \ + macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) #define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__), -#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_BLEND_KEY(...) AA_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_MASK_BLEND_KEY(...) AA_MASK_BLEND_KEY(__VA_ARGS__), enum BlendKey : uint8_t { + BLEND_KEY_NONE = 0, FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_MASK_BLEND_KEY) - BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO), - MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO), - AA_BLEND_KEY_NONE = AA_BLEND_KEY(GL_ONE, GL_ZERO), - AA_MASK_BLEND_KEY_NONE = AA_MASK_BLEND_KEY(GL_ONE, GL_ZERO), }; -// clang-format on const size_t MAX_TEXTURE_UNITS = 16; @@ -704,10 +459,8 @@ struct ObjectStore { O* find(size_t i) const { return i < size ? objects[i] : nullptr; } - template <typename T> - void on_erase(T*, ...) {} - template <typename T> - void on_erase(T* o, decltype(&T::on_erase)) { + template <typename T> void on_erase(T*, ...) 
{} + template <typename T> void on_erase(T* o, decltype(&T::on_erase)) { o->on_erase(); } @@ -727,8 +480,6 @@ struct ObjectStore { }; struct Context { - int32_t references = 1; - ObjectStore<Query> queries; ObjectStore<Buffer> buffers; ObjectStore<Texture> textures; @@ -756,7 +507,7 @@ struct Context { bool scissortest = false; IntRect scissor = {0, 0, 0, 0}; - GLfloat clearcolor[4] = {0, 0, 0, 0}; + uint32_t clearcolor = 0; GLdouble cleardepth = 1; int unpack_row_length = 0; @@ -766,10 +517,14 @@ struct Context { struct TextureUnit { GLuint texture_2d_binding = 0; + GLuint texture_3d_binding = 0; + GLuint texture_2d_array_binding = 0; GLuint texture_rectangle_binding = 0; void unlink(GLuint n) { ::unlink(texture_2d_binding, n); + ::unlink(texture_3d_binding, n); + ::unlink(texture_2d_array_binding, n); ::unlink(texture_rectangle_binding, n); } }; @@ -784,6 +539,7 @@ struct Context { GLuint pixel_pack_buffer_binding = 0; GLuint pixel_unpack_buffer_binding = 0; GLuint array_buffer_binding = 0; + GLuint element_array_buffer_binding = 0; GLuint time_elapsed_query = 0; GLuint samples_passed_query = 0; GLuint renderbuffer_binding = 0; @@ -800,9 +556,13 @@ struct Context { case GL_ARRAY_BUFFER: return array_buffer_binding; case GL_ELEMENT_ARRAY_BUFFER: - return vertex_arrays[current_vertex_array].element_array_buffer_binding; + return element_array_buffer_binding; case GL_TEXTURE_2D: return texture_units[active_texture_unit].texture_2d_binding; + case GL_TEXTURE_2D_ARRAY: + return texture_units[active_texture_unit].texture_2d_array_binding; + case GL_TEXTURE_3D: + return texture_units[active_texture_unit].texture_3d_binding; case GL_TEXTURE_RECTANGLE: return texture_units[active_texture_unit].texture_rectangle_binding; case GL_TIME_ELAPSED: @@ -830,17 +590,16 @@ struct Context { return textures[texture_units[unit].texture_2d_binding]; } - Texture& get_texture(sampler2DRect, int unit) { - return textures[texture_units[unit].texture_rectangle_binding]; + Texture& get_texture(sampler2DArray, int unit) { + return textures[texture_units[unit].texture_2d_array_binding]; } - IntRect apply_scissor(IntRect bb, - const IntPoint& origin = IntPoint(0, 0)) const { - return scissortest ? bb.intersect(scissor - origin) : bb; + Texture& get_texture(sampler2DRect, int unit) { + return textures[texture_units[unit].texture_rectangle_binding]; } - IntRect apply_scissor(const Texture& t) const { - return apply_scissor(t.bounds(), t.offset); + IntRect apply_scissor(IntRect bb) const { + return scissortest ? bb.intersect(scissor) : bb; } }; static Context* ctx = nullptr; @@ -851,12 +610,14 @@ static BlendKey blend_key = BLEND_KEY_NONE; static void prepare_texture(Texture& t, const IntRect* skip = nullptr); template <typename S> +static inline void init_depth(S* s, Texture& t) { + s->depth = max(t.depth, 1); + s->height_stride = s->stride * t.height; +} + +template <typename S> static inline void init_filter(S* s, Texture& t) { - // If the width is not at least 2 pixels, then we can't safely sample the end - // of the row with a linear filter. In that case, just punt to using nearest - // filtering instead. - s->filter = t.width >= 2 ? 
gl_filter_to_texture_filter(t.mag_filter) - : TextureFilter::NEAREST; + s->filter = gl_filter_to_texture_filter(t.mag_filter); } template <typename S> @@ -864,44 +625,20 @@ static inline void init_sampler(S* s, Texture& t) { prepare_texture(t); s->width = t.width; s->height = t.height; - s->stride = t.stride(); int bpp = t.bpp(); - if (bpp >= 4) - s->stride /= 4; - else if (bpp == 2) - s->stride /= 2; - else - assert(bpp == 1); - // Use uint32_t* for easier sampling, but need to cast to uint8_t* or - // uint16_t* for formats with bpp < 4. + s->stride = t.stride(bpp); + if (bpp >= 4) s->stride /= 4; + // Use uint32_t* for easier sampling, but need to cast to uint8_t* for formats + // with bpp < 4. s->buf = (uint32_t*)t.buf; s->format = gl_format_to_texture_format(t.internal_format); } template <typename S> -static inline void null_sampler(S* s) { - // For null texture data, just make the sampler provide a 1x1 buffer that is - // transparent black. Ensure buffer holds at least a SIMD vector of zero data - // for SIMD padding of unaligned loads. - static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0}; - s->width = 1; - s->height = 1; - s->stride = s->width; - s->buf = (uint32_t*)zeroBuf; - s->format = TextureFormat::RGBA8; -} - -template <typename S> -static inline void null_filter(S* s) { - s->filter = TextureFilter::NEAREST; -} - -template <typename S> S* lookup_sampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); - null_filter(s); + *s = S(); } else { init_sampler(s, t); init_filter(s, t); @@ -913,13 +650,26 @@ template <typename S> S* lookup_isampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); + *s = S(); } else { init_sampler(s, t); } return s; } +template <typename S> +S* lookup_sampler_array(S* s, int texture) { + Texture& t = ctx->get_texture(s, texture); + if (!t.buf) { + *s = S(); + } else { + init_sampler(s, t); + init_depth(s, t); + init_filter(s, t); + } + return s; +} + int bytes_per_type(GLenum type) { switch (type) { case GL_INT: @@ -983,40 +733,21 @@ void load_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance, attrib = T(load_attrib_scalar<scalar_type>(va, src)); } else { // Specialized for WR's primitive vertex order/winding. + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. + // Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so that the + // points form a convex path that can be traversed by the rasterizer. if (!count) return; - assert(count >= 2 && count <= 4); + assert(count == 3 || count == 4); char* src = (char*)va.buf + va.stride * start + va.offset; - switch (count) { - case 2: { - // Lines must be indexed at offsets 0, 1. - // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0. - scalar_type lanes[2] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride)}; - attrib = (T){lanes[0], lanes[1], lanes[1], lanes[0]}; - break; - } - case 3: { - // Triangles must be indexed at offsets 0, 1, 2. - // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. 
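// Sketch of the lane ordering described in the comments above; the mapping is
// taken from those comments and from the loads below, while the helper itself
// is illustrative. Each of the 4 vertex-shader SIMD lanes is filled from one
// primitive vertex so that the lanes always trace a convex path.
static inline void primitive_lane_order(int count, int lanes[4]) {
  switch (count) {
    case 2:  // line: endpoints mirrored into the upper lanes
      lanes[0] = 0; lanes[1] = 1; lanes[2] = 1; lanes[3] = 0;
      break;
    case 3:  // triangle: last vertex repeated
      lanes[0] = 0; lanes[1] = 1; lanes[2] = 2; lanes[3] = 2;
      break;
    default: // quad indexed 0,1,2,2,1,3: corners visited as 0,1,3,2
      lanes[0] = 0; lanes[1] = 1; lanes[2] = 3; lanes[3] = 2;
      break;
  }
}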
- scalar_type lanes[3] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - attrib = (T){lanes[0], lanes[1], lanes[2], lanes[2]}; - break; - } - default: - // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, - // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so - // that the points form a convex path that can be traversed by the - // rasterizer. - attrib = (T){load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 3), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - break; - } + attrib = (T){ + load_attrib_scalar<scalar_type>(va, src), + load_attrib_scalar<scalar_type>(va, src + va.stride), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2 + + (count > 3 ? va.stride : 0)), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2) + }; } } @@ -1076,6 +807,7 @@ void Enable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = true; + blend_key = ctx->blend_key; break; case GL_DEPTH_TEST: ctx->depthtest = true; @@ -1090,6 +822,7 @@ void Disable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = false; + blend_key = BLEND_KEY_NONE; break; case GL_DEPTH_TEST: ctx->depthtest = false; @@ -1103,18 +836,10 @@ void Disable(GLenum cap) { GLenum GetError() { return GL_NO_ERROR; } static const char* const extensions[] = { - "GL_ARB_blend_func_extended", - "GL_ARB_clear_texture", - "GL_ARB_copy_image", - "GL_ARB_draw_instanced", - "GL_ARB_explicit_attrib_location", - "GL_ARB_instanced_arrays", - "GL_ARB_invalidate_subdata", - "GL_ARB_texture_storage", - "GL_EXT_timer_query", - "GL_KHR_blend_equation_advanced", - "GL_KHR_blend_equation_advanced_coherent", - "GL_APPLE_rgb_422", + "GL_ARB_blend_func_extended", "GL_ARB_copy_image", + "GL_ARB_draw_instanced", "GL_ARB_explicit_attrib_location", + "GL_ARB_instanced_arrays", "GL_ARB_invalidate_subdata", + "GL_ARB_texture_storage", "GL_EXT_timer_query", }; void GetIntegerv(GLenum pname, GLint* params) { @@ -1128,7 +853,7 @@ void GetIntegerv(GLenum pname, GLint* params) { params[0] = 1 << 15; break; case GL_MAX_ARRAY_TEXTURE_LAYERS: - params[0] = 0; + params[0] = 1 << 15; break; case GL_READ_FRAMEBUFFER_BINDING: params[0] = ctx->read_framebuffer_binding; @@ -1145,12 +870,6 @@ void GetIntegerv(GLenum pname, GLint* params) { case GL_NUM_EXTENSIONS: params[0] = sizeof(extensions) / sizeof(extensions[0]); break; - case GL_MAJOR_VERSION: - params[0] = 3; - break; - case GL_MINOR_VERSION: - params[0] = 2; - break; default: debugf("unhandled glGetIntegerv parameter %x\n", pname); assert(false); @@ -1177,8 +896,6 @@ const char* GetString(GLenum name) { return "Software WebRender"; case GL_VERSION: return "3.2"; - case GL_SHADING_LANGUAGE_VERSION: - return "1.50"; default: debugf("unhandled glGetString parameter %x\n", name); assert(false); @@ -1254,23 +971,17 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) { return a; } -// Generate a hashed blend key based on blend func and equation state. This -// allows all the blend state to be processed down to a blend key that can be -// dealt with inside a single switch statement. 
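// Illustrative sketch of the key layout used by hash_blend_key below; the
// macro mirrors HASH_BLEND_KEY and the helper name is invented. Unsigned
// arithmetic is used here purely to keep the example well defined; the real
// code builds the same bit pattern from the GL enum values directly.
#define SKETCH_HASH_BLEND_KEY(x, y, z, w) \
  (((x) << 4) | (y) | ((z) << 24) | ((w) << 20))
static inline unsigned sketch_blend_hash(unsigned srgb, unsigned drgb,
                                         unsigned sa, unsigned da,
                                         unsigned equation,
                                         unsigned func_add /* GL_FUNC_ADD */) {
  // Common case: a single non-separate func pair.
  unsigned hash = SKETCH_HASH_BLEND_KEY(srgb, drgb, 0u, 0u);
  // Separate alpha funcs fold into the upper bits of the key.
  if (srgb != sa || drgb != da) hash |= SKETCH_HASH_BLEND_KEY(0u, 0u, sa, da);
  // Any equation other than FUNC_ADD replaces the key with a one-arg hash.
  if (equation != func_add) hash = SKETCH_HASH_BLEND_KEY(equation, 0u, 0u, 0u);
  return hash;
}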
-static void hash_blend_key() { - GLenum srgb = ctx->blendfunc_srgb; - GLenum drgb = ctx->blendfunc_drgb; - GLenum sa = ctx->blendfunc_sa; - GLenum da = ctx->blendfunc_da; - GLenum equation = ctx->blend_equation; +void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { + ctx->blendfunc_srgb = srgb; + ctx->blendfunc_drgb = drgb; + sa = remap_blendfunc(srgb, sa); + da = remap_blendfunc(drgb, da); + ctx->blendfunc_sa = sa; + ctx->blendfunc_da = da; + #define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) - // Basic non-separate blend funcs used the two argument form int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); - // Separate alpha blend funcs use the 4 argument hash if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); - // Any other blend equation than the default func_add ignores the func and - // instead generates a one-argument hash based on the equation - if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0); switch (hash) { #define MAP_BLEND_KEY(...) \ case HASH_BLEND_KEY(__VA_ARGS__): \ @@ -1278,22 +989,14 @@ static void hash_blend_key() { break; FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) default: - debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb, - sa, da, equation); + debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da); assert(false); break; } -} -void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { - ctx->blendfunc_srgb = srgb; - ctx->blendfunc_drgb = drgb; - sa = remap_blendfunc(srgb, sa); - da = remap_blendfunc(drgb, da); - ctx->blendfunc_sa = sa; - ctx->blendfunc_da = da; - - hash_blend_key(); + if (ctx->blend) { + blend_key = ctx->blend_key; + } } void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { @@ -1302,12 +1005,8 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { } void BlendEquation(GLenum mode) { - assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX || - (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR)); - if (mode != ctx->blend_equation) { - ctx->blend_equation = mode; - hash_blend_key(); - } + assert(mode == GL_FUNC_ADD); + ctx->blend_equation = mode; } void DepthMask(GLboolean flag) { ctx->depthmask = flag; } @@ -1328,10 +1027,8 @@ void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) { } void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - ctx->clearcolor[0] = r; - ctx->clearcolor[1] = g; - ctx->clearcolor[2] = b; - ctx->clearcolor[3] = a; + I32 c = round_pixel((Float){b, g, r, a}); + ctx->clearcolor = bit_cast<uint32_t>(CONVERT(c, U8)); } void ClearDepth(GLdouble depth) { ctx->cleardepth = depth; } @@ -1369,6 +1066,7 @@ void DeleteBuffer(GLuint n) { unlink(ctx->pixel_pack_buffer_binding, n); unlink(ctx->pixel_unpack_buffer_binding, n); unlink(ctx->array_buffer_binding, n); + unlink(ctx->element_array_buffer_binding, n); } } @@ -1434,45 +1132,26 @@ void DeleteProgram(GLuint n) { void LinkProgram(GLuint program) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } assert(p.impl->interpolants_size() <= sizeof(Interpolants)); if (!p.vert_impl) p.vert_impl = p.impl->get_vertex_shader(); if (!p.frag_impl) p.frag_impl = p.impl->get_fragment_shader(); } -GLint GetLinkStatus(GLuint program) { - if (auto* p = ctx->programs.find(program)) { - return p->impl ? 
1 : 0; - } - return 0; -} - void BindAttribLocation(GLuint program, GLuint index, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } p.impl->bind_attrib(name, index); } GLint GetAttribLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } return p.impl->get_attrib(name); } GLint GetUniformLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } GLint loc = p.impl->get_uniform(name); // debugf("location: %d\n", loc); return loc; @@ -1482,15 +1161,7 @@ static uint64_t get_time_value() { #ifdef __MACH__ return mach_absolute_time(); #elif defined(_WIN32) - LARGE_INTEGER time; - static bool have_frequency = false; - static LARGE_INTEGER frequency; - if (!have_frequency) { - QueryPerformanceFrequency(&frequency); - have_frequency = true; - } - QueryPerformanceCounter(&time); - return time.QuadPart * 1000000000ULL / frequency.QuadPart; + return uint64_t(clock()) * (1000000000ULL / CLOCKS_PER_SEC); #else return ({ struct timespec tp; @@ -1583,113 +1254,60 @@ void PixelStorei(GLenum name, GLint param) { static GLenum remap_internal_format(GLenum format) { switch (format) { case GL_DEPTH_COMPONENT: - return GL_DEPTH_COMPONENT24; + return GL_DEPTH_COMPONENT16; case GL_RGBA: return GL_RGBA8; case GL_RED: return GL_R8; - case GL_RG: - return GL_RG8; - case GL_RGB_422_APPLE: - return GL_RGB_RAW_422_APPLE; default: return format; } } -} // extern "C" - -static bool format_requires_conversion(GLenum external_format, - GLenum internal_format) { - switch (external_format) { - case GL_RGBA: - return internal_format == GL_RGBA8; - default: - return false; - } -} - -static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src, - int width) { - for (; width >= 4; width -= 4, dest += 4, src += 4) { - U32 p = unaligned_load<U32>(src); - U32 rb = p & 0x00FF00FF; - unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); - } - for (; width > 0; width--, dest++, src++) { - uint32_t p = *src; - uint32_t rb = p & 0x00FF00FF; - *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); - } -} - -static void convert_copy(GLenum external_format, GLenum internal_format, - uint8_t* dst_buf, size_t dst_stride, - const uint8_t* src_buf, size_t src_stride, - size_t width, size_t height) { - switch (external_format) { - case GL_RGBA: - if (internal_format == GL_RGBA8) { - for (; height; height--) { - copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf, - width); - dst_buf += dst_stride; - src_buf += src_stride; - } - return; - } - break; - default: - break; - } - size_t row_bytes = width * bytes_for_internal_format(internal_format); - for (; height; height--) { - memcpy(dst_buf, src_buf, row_bytes); - dst_buf += dst_stride; - src_buf += src_stride; +void TexStorage3D(GLenum target, GLint levels, GLenum internal_format, + GLsizei width, GLsizei height, GLsizei depth) { + assert(levels == 1); + Texture& t = ctx->textures[ctx->get_binding(target)]; + internal_format = remap_internal_format(internal_format); + bool changed = false; + if (t.width != width || t.height != height || t.depth != depth || + t.internal_format != internal_format) { + changed = true; + t.internal_format = internal_format; + t.width = width; + t.height = height; + t.depth = depth; } + t.disable_delayed_clear(); + t.allocate(changed); } -static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width, - GLsizei height, void* buf = 
nullptr, - GLsizei stride = 0, GLsizei min_width = 0, - GLsizei min_height = 0) { - GLenum internal_format = remap_internal_format(external_format); +static void set_tex_storage(Texture& t, GLenum internal_format, + GLsizei width, GLsizei height, + bool should_free = true, void* buf = nullptr, + GLsizei min_width = 0, GLsizei min_height = 0) { + internal_format = remap_internal_format(internal_format); bool changed = false; - if (t.width != width || t.height != height || + if (t.width != width || t.height != height || t.depth != 0 || t.internal_format != internal_format) { changed = true; t.internal_format = internal_format; t.width = width; t.height = height; + t.depth = 0; } - // If we are changed from an internally managed buffer to an externally - // supplied one or vice versa, ensure that we clean up old buffer state. - // However, if we have to convert the data from a non-native format, then - // always treat it as internally managed since we will need to copy to an - // internally managed native format buffer. - bool should_free = buf == nullptr || format_requires_conversion( - external_format, internal_format); - if (t.should_free() != should_free) { - changed = true; - t.cleanup(); + if (t.should_free() != should_free || buf != nullptr) { + if (t.should_free()) { + t.cleanup(); + } t.set_should_free(should_free); - } - // If now an external buffer, explicitly set it... - if (!should_free) { - t.set_buffer(buf, stride); + t.buf = (char*)buf; + t.buf_size = 0; } t.disable_delayed_clear(); t.allocate(changed, min_width, min_height); - // If we have a buffer that needs format conversion, then do that now. - if (buf && should_free) { - convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(), - (const uint8_t*)buf, stride, width, height); - } } -extern "C" { - void TexStorage2D(GLenum target, GLint levels, GLenum internal_format, GLsizei width, GLsizei height) { assert(levels == 1); @@ -1701,19 +1319,12 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { if (format == GL_RED && ty == GL_UNSIGNED_BYTE) { return GL_R8; } else if ((format == GL_RGBA || format == GL_BGRA) && - (ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV)) { + ty == GL_UNSIGNED_BYTE) { return GL_RGBA8; } else if (format == GL_RGBA && ty == GL_FLOAT) { return GL_RGBA32F; } else if (format == GL_RGBA_INTEGER && ty == GL_INT) { return GL_RGBA32I; - } else if (format == GL_RG && ty == GL_UNSIGNED_BYTE) { - return GL_RG8; - } else if (format == GL_RGB_422_APPLE && - ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) { - return GL_RGB_RAW_422_APPLE; - } else if (format == GL_RED && ty == GL_UNSIGNED_SHORT) { - return GL_R16; } else { debugf("unknown internal format for format %x, type %x\n", format, ty); assert(false); @@ -1721,6 +1332,20 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { } } +static inline void copy_bgra8_to_rgba8(uint32_t* dest, uint32_t* src, + int width) { + for (; width >= 4; width -= 4, dest += 4, src += 4) { + U32 p = unaligned_load<U32>(src); + U32 rb = p & 0x00FF00FF; + unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); + } + for (; width > 0; width--, dest++, src++) { + uint32_t p = *src; + uint32_t rb = p & 0x00FF00FF; + *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); + } +} + static Buffer* get_pixel_pack_buffer() { return ctx->pixel_pack_buffer_binding ? 
&ctx->buffers[ctx->pixel_pack_buffer_binding] @@ -1750,10 +1375,7 @@ static void* get_pixel_unpack_buffer_data(void* data) { void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } data = get_pixel_unpack_buffer_data(data); if (!data) return; Texture& t = ctx->textures[ctx->get_binding(target)]; @@ -1765,33 +1387,84 @@ void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei row_length = ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width; assert(t.internal_format == internal_format_for_data(format, ty)); - int src_bpp = format_requires_conversion(format, t.internal_format) - ? bytes_for_internal_format(format) - : t.bpp(); - if (!src_bpp || !t.buf) return; - convert_copy(format, t.internal_format, - (uint8_t*)t.sample_ptr(xoffset, yoffset), t.stride(), - (const uint8_t*)data, row_length * src_bpp, width, height); + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + size_t dest_stride = t.stride(bpp); + char* dest = t.sample_ptr(xoffset, yoffset, 0, bpp, dest_stride); + char* src = (char*)data; + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } } void TexImage2D(GLenum target, GLint level, GLint internal_format, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } assert(border == 0); TexStorage2D(target, 1, internal_format, width, height); TexSubImage2D(target, 0, 0, 0, width, height, format, ty, data); } +void TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + data = get_pixel_unpack_buffer_data(data); + if (!data) return; + Texture& t = ctx->textures[ctx->get_binding(target)]; + prepare_texture(t); + assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width); + GLsizei row_length = + ctx->unpack_row_length != 0 ? 
ctx->unpack_row_length : width; + if (format == GL_BGRA) { + assert(ty == GL_UNSIGNED_BYTE); + assert(t.internal_format == GL_RGBA8); + } else { + assert(t.internal_format == internal_format_for_data(format, ty)); + } + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + char* src = (char*)data; + assert(xoffset + width <= t.width); + assert(yoffset + height <= t.height); + assert(zoffset + depth <= t.depth); + size_t dest_stride = t.stride(bpp); + for (int z = 0; z < depth; z++) { + char* dest = t.sample_ptr(xoffset, yoffset, zoffset + z, bpp, dest_stride); + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } + } +} + +void TexImage3D(GLenum target, GLint level, GLint internal_format, + GLsizei width, GLsizei height, GLsizei depth, GLint border, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + assert(border == 0); + TexStorage3D(target, 1, internal_format, width, height, depth); + TexSubImage3D(target, 0, 0, 0, 0, width, height, depth, format, ty, data); +} + void GenerateMipmap(UNUSED GLenum target) { // TODO: support mipmaps } -void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { - Texture& t = ctx->textures[texid]; +void TexParameteri(GLenum target, GLenum pname, GLint param) { + Texture& t = ctx->textures[ctx->get_binding(target)]; switch (pname) { case GL_TEXTURE_WRAP_S: assert(param == GL_CLAMP_TO_EDGE); @@ -1810,10 +1483,6 @@ void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { } } -void TexParameteri(GLenum target, GLenum pname, GLint param) { - SetTextureParameter(ctx->get_binding(target), pname, param); -} - void GenTextures(int n, GLuint* result) { for (int i = 0; i < n; i++) { Texture t; @@ -1839,7 +1508,9 @@ void GenRenderbuffers(int n, GLuint* result) { void Renderbuffer::on_erase() { for (auto* fb : ctx->framebuffers) { if (fb) { - unlink(fb->color_attachment, texture); + if (unlink(fb->color_attachment, texture)) { + fb->layer = 0; + } unlink(fb->depth_attachment, texture); } } @@ -1875,11 +1546,10 @@ void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width, } switch (internal_format) { case GL_DEPTH_COMPONENT: - case GL_DEPTH_COMPONENT16: case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: - // Force depth format to 24 bits... - internal_format = GL_DEPTH_COMPONENT24; + // Force depth format to 16 bits... 
+ internal_format = GL_DEPTH_COMPONENT16; break; } set_tex_storage(ctx->textures[r.texture], internal_format, width, height); @@ -1963,8 +1633,7 @@ void VertexAttribDivisor(GLuint index, GLuint divisor) { va.divisor = divisor; } -void BufferData(GLenum target, GLsizeiptr size, void* data, - UNUSED GLenum usage) { +void BufferData(GLenum target, GLsizeiptr size, void* data, UNUSED GLenum usage) { Buffer& b = ctx->buffers[ctx->get_binding(target)]; if (b.allocate(size)) { ctx->validate_vertex_array = true; @@ -2004,23 +1673,17 @@ GLboolean UnmapBuffer(GLenum target) { void Uniform1i(GLint location, GLint V0) { // debugf("tex: %d\n", (int)ctx->textures.size); - if (vertex_shader) { - vertex_shader->set_uniform_1i(location, V0); - } + vertex_shader->set_uniform_1i(location, V0); } void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) { assert(count == 1); - if (vertex_shader) { - vertex_shader->set_uniform_4fv(location, v); - } + vertex_shader->set_uniform_4fv(location, v); } void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat* value) { assert(count == 1); assert(!transpose); - if (vertex_shader) { - vertex_shader->set_uniform_matrix4fv(location, value); - } + vertex_shader->set_uniform_matrix4fv(location, value); } void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, @@ -2031,7 +1694,24 @@ void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = texture; + fb.layer = 0; + } else if (attachment == GL_DEPTH_ATTACHMENT) { + fb.depth_attachment = texture; + } else { + assert(0); + } +} + +void FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture, + GLint level, GLint layer) { + assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + assert(level == 0); + Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; + if (attachment == GL_COLOR_ATTACHMENT0) { + fb.color_attachment = texture; + fb.layer = layer; } else if (attachment == GL_DEPTH_ATTACHMENT) { + assert(layer == 0); fb.depth_attachment = texture; } else { assert(0); @@ -2046,6 +1726,7 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, Renderbuffer& rb = ctx->renderbuffers[renderbuffer]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = rb.texture; + fb.layer = 0; } else if (attachment == GL_DEPTH_ATTACHMENT) { fb.depth_attachment = rb.texture; } else { @@ -2055,18 +1736,11 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, } // extern "C" -static inline Framebuffer* get_framebuffer(GLenum target, - bool fallback = false) { +static inline Framebuffer* get_framebuffer(GLenum target) { if (target == GL_FRAMEBUFFER) { target = GL_DRAW_FRAMEBUFFER; } - Framebuffer* fb = ctx->framebuffers.find(ctx->get_binding(target)); - if (fallback && !fb) { - // If the specified framebuffer isn't found and a fallback is requested, - // use the default framebuffer. 
- fb = &ctx->framebuffers[0]; - } - return fb; + return ctx->framebuffers.find(ctx->get_binding(target)); } template <typename T> @@ -2092,7 +1766,9 @@ static inline uint32_t clear_chunk(uint16_t value) { return uint32_t(value) | (uint32_t(value) << 16); } -static inline uint32_t clear_chunk(uint32_t value) { return value; } +static inline uint32_t clear_chunk(uint32_t value) { + return value; +} template <typename T> static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { @@ -2115,22 +1791,20 @@ static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { } template <typename T> -static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, - int skip_end = 0) { +static void clear_buffer(Texture& t, T value, int layer, IntRect bb, + int skip_start = 0, int skip_end = 0) { if (!t.buf) return; skip_start = max(skip_start, bb.x0); skip_end = max(skip_end, skip_start); assert(sizeof(T) == t.bpp()); - size_t stride = t.stride(); - // When clearing multiple full-width rows, collapse them into a single large - // "row" to avoid redundant setup from clearing each row individually. Note - // that we can only safely do this if the stride is tightly packed. - if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end && - (t.should_free() || stride == t.width * sizeof(T))) { + size_t stride = t.stride(sizeof(T)); + // When clearing multiple full-width rows, collapse them into a single + // large "row" to avoid redundant setup from clearing each row individually. + if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end) { bb.x1 += (stride / sizeof(T)) * (bb.height() - 1); bb.y1 = bb.y0 + 1; } - T* buf = (T*)t.sample_ptr(bb.x0, bb.y0); + T* buf = (T*)t.sample_ptr(bb.x0, bb.y0, layer, sizeof(T), stride); uint32_t chunk = clear_chunk(value); for (int rows = bb.height(); rows > 0; rows--) { if (bb.x0 < skip_start) { @@ -2144,12 +1818,20 @@ static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, } template <typename T> +static inline void clear_buffer(Texture& t, T value, int layer = 0) { + IntRect bb = ctx->apply_scissor(t.bounds()); + if (bb.width() > 0) { + clear_buffer<T>(t, value, layer, bb); + } +} + +template <typename T> static inline void force_clear_row(Texture& t, int y, int skip_start = 0, int skip_end = 0) { assert(t.buf != nullptr); assert(sizeof(T) == t.bpp()); assert(skip_start <= skip_end); - T* buf = (T*)t.sample_ptr(0, y); + T* buf = (T*)t.sample_ptr(0, y, 0, sizeof(T)); uint32_t chunk = clear_chunk((T)t.clear_val); if (skip_start > 0) { clear_row<T>(buf, skip_start, t.clear_val, chunk); @@ -2188,9 +1870,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { while (mask) { int count = __builtin_ctz(mask); if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; start += count; mask >>= count; @@ -2201,9 +1883,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { } int count = (i + 1) * 32 - start; if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; } } @@ -2220,7 +1902,7 @@ static void prepare_texture(Texture& t, const IntRect* skip) { case GL_R8: 
force_clear<uint8_t>(t, skip); break; - case GL_RG8: + case GL_DEPTH_COMPONENT16: force_clear<uint16_t>(t, skip); break; default: @@ -2230,53 +1912,31 @@ static void prepare_texture(Texture& t, const IntRect* skip) { } } -// Setup a clear on a texture. This may either force an immediate clear or -// potentially punt to a delayed clear, if applicable. -template <typename T> -static void request_clear(Texture& t, T value, const IntRect& scissor) { - // If the clear would require a scissor, force clear anything outside - // the scissor, and then immediately clear anything inside the scissor. - if (!scissor.contains(t.offset_bounds())) { - IntRect skip = scissor - t.offset; - force_clear<T>(t, &skip); - clear_buffer<T>(t, value, skip.intersection(t.bounds())); - } else { - // Do delayed clear for 2D texture without scissor. - t.enable_delayed_clear(value); - } -} - -template <typename T> -static inline void request_clear(Texture& t, T value) { - // If scissoring is enabled, use the scissor rect. Otherwise, just scissor to - // the entire texture bounds. - request_clear(t, value, ctx->scissortest ? ctx->scissor : t.offset_bounds()); -} - extern "C" { -void InitDefaultFramebuffer(int x, int y, int width, int height, int stride, - void* buf) { +void InitDefaultFramebuffer(int width, int height) { Framebuffer& fb = ctx->framebuffers[0]; if (!fb.color_attachment) { GenTextures(1, &fb.color_attachment); + fb.layer = 0; } - // If the dimensions or buffer properties changed, we need to reallocate - // the underlying storage for the color buffer texture. Texture& colortex = ctx->textures[fb.color_attachment]; - set_tex_storage(colortex, GL_RGBA8, width, height, buf, stride); - colortex.offset = IntPoint(x, y); + if (colortex.width != width || colortex.height != height) { + colortex.cleanup(); + set_tex_storage(colortex, GL_RGBA8, width, height); + } if (!fb.depth_attachment) { GenTextures(1, &fb.depth_attachment); } - // Ensure dimensions of the depth buffer match the color buffer. Texture& depthtex = ctx->textures[fb.depth_attachment]; - set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height); - depthtex.offset = IntPoint(x, y); + if (depthtex.width != width || depthtex.height != height) { + depthtex.cleanup(); + set_tex_storage(depthtex, GL_DEPTH_COMPONENT16, width, height); + } } void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, - int32_t* height, int32_t* stride) { + int32_t* height) { Framebuffer* fb = ctx->framebuffers.find(fbo); if (!fb || !fb->color_attachment) { return nullptr; @@ -2285,33 +1945,16 @@ void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, if (flush) { prepare_texture(colortex); } - assert(colortex.offset == IntPoint(0, 0)); - if (width) { - *width = colortex.width; - } - if (height) { - *height = colortex.height; - } - if (stride) { - *stride = colortex.stride(); - } - return colortex.buf ? colortex.sample_ptr(0, 0) : nullptr; -} - -void ResolveFramebuffer(GLuint fbo) { - Framebuffer* fb = ctx->framebuffers.find(fbo); - if (!fb || !fb->color_attachment) { - return; - } - Texture& colortex = ctx->textures[fb->color_attachment]; - prepare_texture(colortex); + *width = colortex.width; + *height = colortex.height; + return colortex.buf ? 
colortex.sample_ptr(0, 0, fb->layer) : nullptr; } void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width, - GLsizei height, GLsizei stride, void* buf, - GLsizei min_width, GLsizei min_height) { + GLsizei height, void* buf, GLsizei min_width, + GLsizei min_height) { Texture& t = ctx->textures[texid]; - set_tex_storage(t, internal_format, width, height, buf, stride, min_width, + set_tex_storage(t, internal_format, width, height, !buf, buf, min_width, min_height); } @@ -2323,170 +1966,57 @@ GLenum CheckFramebufferStatus(GLenum target) { return GL_FRAMEBUFFER_COMPLETE; } -void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, - GLint zoffset, GLsizei width, GLsizei height, - GLsizei depth, GLenum format, GLenum type, - const void* data) { - if (level != 0) { - assert(false); - return; - } - Texture& t = ctx->textures[texture]; - assert(!t.locked); - if (width <= 0 || height <= 0 || depth <= 0) { - return; - } - assert(zoffset == 0 && depth == 1); - IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height}; - if (t.internal_format == GL_DEPTH_COMPONENT24) { - uint32_t value = 0xFFFFFF; - switch (format) { - case GL_DEPTH_COMPONENT: - switch (type) { - case GL_DOUBLE: - value = uint32_t(*(const GLdouble*)data * 0xFFFFFF); - break; - case GL_FLOAT: - value = uint32_t(*(const GLfloat*)data * 0xFFFFFF); - break; - default: - assert(false); - break; - } - break; - default: - assert(false); - break; - } - if (t.cleared() && !scissor.contains(t.offset_bounds())) { - // If we need to scissor the clear and the depth buffer was already - // initialized, then just fill runs for that scissor area. - t.fill_depth_runs(value, scissor); - } else { - // Otherwise, the buffer is either uninitialized or the clear would - // encompass the entire buffer. If uninitialized, we can safely fill - // the entire buffer with any value and thus ignore any scissoring. - t.init_depth_runs(value); - } - return; - } - - uint32_t color = 0xFF000000; - switch (type) { - case GL_FLOAT: { - const GLfloat* f = (const GLfloat*)data; - Float v = {0.0f, 0.0f, 0.0f, 1.0f}; - switch (format) { - case GL_RGBA: - v.w = f[3]; // alpha - FALLTHROUGH; - case GL_RGB: - v.z = f[2]; // blue - FALLTHROUGH; - case GL_RG: - v.y = f[1]; // green - FALLTHROUGH; - case GL_RED: - v.x = f[0]; // red - break; - default: - assert(false); - break; - } - color = bit_cast<uint32_t>(CONVERT(round_pixel(v), U8)); - break; - } - case GL_UNSIGNED_BYTE: { - const GLubyte* b = (const GLubyte*)data; - switch (format) { - case GL_RGBA: - color = (color & ~0xFF000000) | (uint32_t(b[3]) << 24); // alpha - FALLTHROUGH; - case GL_RGB: - color = (color & ~0x00FF0000) | (uint32_t(b[2]) << 16); // blue - FALLTHROUGH; - case GL_RG: - color = (color & ~0x0000FF00) | (uint32_t(b[1]) << 8); // green - FALLTHROUGH; - case GL_RED: - color = (color & ~0x000000FF) | uint32_t(b[0]); // red - break; - default: - assert(false); - break; - } - break; - } - default: - assert(false); - break; - } - - switch (t.internal_format) { - case GL_RGBA8: - // Clear color needs to swizzle to BGRA. 
- request_clear<uint32_t>(t, - (color & 0xFF00FF00) | - ((color << 16) & 0xFF0000) | - ((color >> 16) & 0xFF), - scissor); - break; - case GL_R8: - request_clear<uint8_t>(t, uint8_t(color & 0xFF), scissor); - break; - case GL_RG8: - request_clear<uint16_t>(t, uint16_t(color & 0xFFFF), scissor); - break; - default: - assert(false); - break; - } -} - -void ClearTexImage(GLuint texture, GLint level, GLenum format, GLenum type, - const void* data) { - Texture& t = ctx->textures[texture]; - IntRect scissor = t.offset_bounds(); - ClearTexSubImage(texture, level, scissor.x0, scissor.y0, 0, scissor.width(), - scissor.height(), 1, format, type, data); +static inline bool clear_requires_scissor(Texture& t) { + return ctx->scissortest && !ctx->scissor.contains(t.bounds()); } void Clear(GLbitfield mask) { - Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER); if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) { Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = ctx->scissortest - ? ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - ctx->clearcolor); + if (t.internal_format == GL_RGBA8) { + uint32_t color = ctx->clearcolor; + // If the clear would require a scissor, force clear anything outside + // the scissor, and then immediately clear anything inside the scissor. + if (clear_requires_scissor(t)) { + force_clear<uint32_t>(t, &ctx->scissor); + clear_buffer<uint32_t>(t, color, fb.layer); + } else if (t.depth > 1) { + // Delayed clear is not supported on texture arrays. + t.disable_delayed_clear(); + clear_buffer<uint32_t>(t, color, fb.layer); + } else { + // Do delayed clear for 2D texture without scissor. + t.enable_delayed_clear(color); + } + } else if (t.internal_format == GL_R8) { + uint8_t color = uint8_t((ctx->clearcolor >> 16) & 0xFF); + if (clear_requires_scissor(t)) { + force_clear<uint8_t>(t, &ctx->scissor); + clear_buffer<uint8_t>(t, color, fb.layer); + } else if (t.depth > 1) { + t.disable_delayed_clear(); + clear_buffer<uint8_t>(t, color, fb.layer); + } else { + t.enable_delayed_clear(color); + } + } else { + assert(false); + } } if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) { Texture& t = ctx->textures[fb.depth_attachment]; - IntRect scissor = ctx->scissortest - ? 
ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.depth_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_DEPTH_COMPONENT, - GL_DOUBLE, &ctx->cleardepth); + assert(t.internal_format == GL_DEPTH_COMPONENT16); + uint16_t depth = uint16_t(0xFFFF * ctx->cleardepth) - 0x8000; + if (clear_requires_scissor(t)) { + force_clear<uint16_t>(t, &ctx->scissor); + clear_buffer<uint16_t>(t, depth); + } else { + t.enable_delayed_clear(depth); + } } } -void ClearColorRect(GLuint fbo, GLint xoffset, GLint yoffset, GLsizei width, - GLsizei height, GLfloat r, GLfloat g, GLfloat b, - GLfloat a) { - GLfloat color[] = {r, g, b, a}; - Framebuffer& fb = ctx->framebuffers[fbo]; - Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = - IntRect{xoffset, yoffset, xoffset + width, yoffset + height}.intersection( - t.offset_bounds()); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - color); -} - void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, const GLenum* attachments) { Framebuffer* fb = get_framebuffer(target); @@ -2497,7 +2027,7 @@ void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, switch (attachments[i]) { case GL_DEPTH_ATTACHMENT: { Texture& t = ctx->textures[fb->depth_attachment]; - t.set_cleared(false); + t.disable_delayed_clear(); break; } case GL_COLOR_ATTACHMENT0: { @@ -2516,58 +2046,40 @@ void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER || - format == GL_BGRA || format == GL_RG); + format == GL_BGRA); Texture& t = ctx->textures[fb->color_attachment]; if (!t.buf) return; prepare_texture(t); // debugf("read pixels %d, %d, %d, %d from fb %d with format %x\n", x, y, // width, height, ctx->read_framebuffer_binding, t.internal_format); - x -= t.offset.x; - y -= t.offset.y; - assert(x >= 0 && y >= 0); assert(x + width <= t.width); assert(y + height <= t.height); if (internal_format_for_data(format, type) != t.internal_format) { debugf("mismatched format for read pixels: %x vs %x\n", t.internal_format, internal_format_for_data(format, type)); assert(false); - return; - } - // Only support readback conversions that are reversible - assert(!format_requires_conversion(format, t.internal_format) || - bytes_for_internal_format(format) == t.bpp()); - uint8_t* dest = (uint8_t*)data; - size_t destStride = width * t.bpp(); - if (y < 0) { - dest += -y * destStride; - height += y; - y = 0; - } - if (y + height > t.height) { - height = t.height - y; - } - if (x < 0) { - dest += -x * t.bpp(); - width += x; - x = 0; } - if (x + width > t.width) { - width = t.width - x; - } - if (width <= 0 || height <= 0) { - return; + int bpp = t.bpp(); + char* dest = (char*)data; + size_t src_stride = t.stride(bpp); + char* src = t.sample_ptr(x, y, fb->layer, bpp, src_stride); + for (; height > 0; height--) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += width * bpp; + src += src_stride; } - convert_copy(format, t.internal_format, dest, destStride, - (const uint8_t*)t.sample_ptr(x, y), t.stride(), width, height); } void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint 
dstName, - GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, - GLint dstY, GLint dstZ, GLsizei srcWidth, - GLsizei srcHeight, GLsizei srcDepth) { + GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, GLint dstY, + GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, + GLsizei srcDepth) { assert(srcLevel == 0 && dstLevel == 0); - assert(srcZ == 0 && srcDepth == 1 && dstZ == 0); if (srcTarget == GL_RENDERBUFFER) { Renderbuffer& rb = ctx->renderbuffers[srcName]; srcName = rb.texture; @@ -2581,44 +2093,532 @@ void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, prepare_texture(srctex); Texture& dsttex = ctx->textures[dstName]; if (!dsttex.buf) return; - assert(!dsttex.locked); IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; prepare_texture(dsttex, &skip); assert(srctex.internal_format == dsttex.internal_format); assert(srcWidth >= 0); assert(srcHeight >= 0); + assert(srcDepth >= 0); assert(srcX + srcWidth <= srctex.width); assert(srcY + srcHeight <= srctex.height); + assert(srcZ + srcDepth <= max(srctex.depth, 1)); assert(dstX + srcWidth <= dsttex.width); assert(dstY + srcHeight <= dsttex.height); + assert(dstZ + srcDepth <= max(dsttex.depth, 1)); int bpp = srctex.bpp(); - int src_stride = srctex.stride(); - int dest_stride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstX, dstY); - char* src = srctex.sample_ptr(srcX, srcY); - for (int y = 0; y < srcHeight; y++) { - memcpy(dest, src, srcWidth * bpp); - dest += dest_stride; - src += src_stride; + int src_stride = srctex.stride(bpp); + int dest_stride = dsttex.stride(bpp); + for (int z = 0; z < srcDepth; z++) { + char* dest = dsttex.sample_ptr(dstX, dstY, dstZ + z, bpp, dest_stride); + char* src = srctex.sample_ptr(srcX, srcY, srcZ + z, bpp, src_stride); + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } } } -void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, - GLint yoffset, GLint x, GLint y, GLsizei width, +void CopyTexSubImage3D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height) { assert(level == 0); Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; - CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D, 0, x, y, 0, - ctx->get_binding(target), GL_TEXTURE_2D, 0, xoffset, yoffset, - 0, width, height, 1); + CopyImageSubData(fb->color_attachment, GL_TEXTURE_3D, 0, x, y, fb->layer, + ctx->get_binding(target), GL_TEXTURE_3D, 0, xoffset, yoffset, + zoffset, width, height, 1); +} + +void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint x, GLint y, GLsizei width, GLsizei height) { + assert(level == 0); + Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!fb) return; + CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D_ARRAY, 0, x, y, + fb->layer, ctx->get_binding(target), GL_TEXTURE_2D_ARRAY, 0, + xoffset, yoffset, 0, width, height, 1); } } // extern "C" -#include "blend.h" -#include "composite.h" -#include "swgl_ext.h" +using PackedRGBA8 = V16<uint8_t>; +using WideRGBA8 = V16<uint16_t>; +using HalfRGBA8 = V8<uint16_t>; + +static inline WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } + +static inline PackedRGBA8 pack(WideRGBA8 p) { +#if USE_SSE2 + return _mm_packus_epi16(lowHalf(p), highHalf(p)); +#elif USE_NEON + return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p))); +#else + return CONVERT(p, PackedRGBA8); +#endif +} + 
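// Editorial sketch (illustrative only, not part of this patch): unpack()
// widens each 8-bit channel of a 4-pixel RGBA8 chunk into a 16-bit lane so
// the blend math below has headroom, and pack() narrows back down with
// unsigned saturation, as _mm_packus_epi16 / vqmovn_u16 do. A hypothetical
// per-channel scalar model, assuming the in-range values produced by the
// blend helpers:
UNUSED static inline uint16_t scalar_unpack_channel(uint8_t packed) {
  return packed;  // zero-extend an 8-bit channel into a 16-bit lane
}
UNUSED static inline uint8_t scalar_pack_channel(uint16_t wide) {
  return wide > 255 ? 255 : uint8_t(wide);  // saturate back down to 8 bits
}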
+static inline HalfRGBA8 packRGBA8(I32 a, I32 b) { +#if USE_SSE2 + return _mm_packs_epi32(a, b); +#elif USE_NEON + return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); +#else + return CONVERT(combine(a, b), HalfRGBA8); +#endif +} + +using PackedR8 = V4<uint8_t>; +using WideR8 = V4<uint16_t>; + +static inline WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } + +static inline WideR8 packR8(I32 a) { +#if USE_SSE2 + return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovun_s32(a); +#else + return CONVERT(a, WideR8); +#endif +} + +static inline PackedR8 pack(WideR8 p) { +#if USE_SSE2 + auto m = expand(p); + auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m)); + return SHUFFLE(r, r, 0, 1, 2, 3); +#elif USE_NEON + return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p)))); +#else + return CONVERT(p, PackedR8); +#endif +} + +using ZMask4 = V4<int16_t>; +using ZMask8 = V8<int16_t>; + +static inline PackedRGBA8 unpack(ZMask4 mask, uint32_t*) { + return bit_cast<PackedRGBA8>(mask.xxyyzzww); +} + +static inline WideR8 unpack(ZMask4 mask, uint8_t*) { + return bit_cast<WideR8>(mask); +} + +#if USE_SSE2 +# define ZMASK_NONE_PASSED 0xFFFF +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask8 mask) { + return _mm_movemask_epi8(mask); +} +static inline uint32_t zmask_code(ZMask4 mask) { + return zmask_code(mask.xyzwxyzw); +} +#else +using ZMask4Code = V4<uint8_t>; +using ZMask8Code = V8<uint8_t>; +# define ZMASK_NONE_PASSED 0xFFFFFFFFU +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask4 mask) { + return bit_cast<uint32_t>(CONVERT(mask, ZMask4Code)); +} +static inline uint32_t zmask_code(ZMask8 mask) { + return zmask_code( + ZMask4((U16(lowHalf(mask)) >> 12) | (U16(highHalf(mask)) << 4))); +} +#endif + +template <int FUNC, bool MASK> +static ALWAYS_INLINE int check_depth8(uint16_t z, uint16_t* zbuf, + ZMask8& outmask) { + ZMask8 dest = unaligned_load<ZMask8>(zbuf); + ZMask8 src = int16_t(z); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask8 mask = FUNC == GL_LEQUAL ? + // GL_LEQUAL: Not(LessEqual) = Greater + ZMask8(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask8(src >= dest); + switch (zmask_code(mask)) { + case ZMASK_NONE_PASSED: + return 0; + case ZMASK_ALL_PASSED: + if (MASK) { + unaligned_store(zbuf, src); + } + return -1; + default: + if (MASK) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return 1; + } +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(ZMask4 src, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask4 mask = ctx->depthfunc == GL_LEQUAL + ? 
+ // GL_LEQUAL: Not(LessEqual) = Greater + ZMask4(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask4(src >= dest); + if (!FULL_SPANS) { + mask |= ZMask4(span) < ZMask4{1, 2, 3, 4}; + } + if (zmask_code(mask) == ZMASK_NONE_PASSED) { + return false; + } + if (!DISCARD && ctx->depthmask) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return true; +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(uint16_t z, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + return check_depth4<FULL_SPANS, DISCARD>(ZMask4(int16_t(z)), zbuf, outmask, + span); +} + +template <typename T> +static inline ZMask4 packZMask4(T a) { +#if USE_SSE2 + return lowHalf(bit_cast<ZMask8>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovn_s32(a); +#else + return CONVERT(a, ZMask4); +#endif +} + +static ALWAYS_INLINE ZMask4 packDepth() { + return packZMask4(cast(fragment_shader->gl_FragCoord.z * 0xFFFF) - 0x8000); +} + +static ALWAYS_INLINE void discard_depth(ZMask4 src, uint16_t* zbuf, + ZMask4 mask) { + if (ctx->depthmask) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + mask |= packZMask4(fragment_shader->isPixelDiscarded); + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } +} + +static ALWAYS_INLINE void discard_depth(uint16_t z, uint16_t* zbuf, + ZMask4 mask) { + discard_depth(ZMask4(int16_t(z)), zbuf, mask); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) { + ivec4 i = round_pixel(v); + HalfRGBA8 xz = packRGBA8(i.z, i.x); + HalfRGBA8 yw = packRGBA8(i.y, i.w); + HalfRGBA8 xy = zipLow(xz, yw); + HalfRGBA8 zw = zipHigh(xz, yw); + HalfRGBA8 lo = zip2Low(xy, zw); + HalfRGBA8 hi = zip2High(xy, zw); + return combine(lo, hi); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) { + I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}); + HalfRGBA8 c = packRGBA8(i, i); + return combine(c, c); +} + +static inline WideRGBA8 pack_pixels_RGBA8() { + return pack_pixels_RGBA8(fragment_shader->gl_FragColor); +} + +template <typename V> +static inline PackedRGBA8 pack_span(uint32_t*, const V& v) { + return pack(pack_pixels_RGBA8(v)); +} + +static inline PackedRGBA8 pack_span(uint32_t*) { + return pack(pack_pixels_RGBA8()); +} + +// (x*y + x) >> 8, cheap approximation of (x*y) / 255 +template <typename T> +static inline T muldiv255(T x, T y) { + return (x * y + x) >> 8; +} + +// Byte-wise addition for when x or y is a signed 8-bit value stored in the +// low byte of a larger type T only with zeroed-out high bits, where T is +// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used +// upon signed operands, using up all the precision in a 16 bit integer, and +// potentially losing the sign bit in the last >> 8 shift. Due to the +// properties of two's complement arithmetic, even though we've discarded the +// sign bit, we can still represent a negative number under addition (without +// requiring any extra sign bits), just that any negative number will behave +// like a large unsigned number under addition, generating a single carry bit +// on overflow that we need to discard. Thus, just doing a byte-wise add will +// overflow without the troublesome carry, giving us only the remaining 8 low +// bits we actually need while keeping the high bits at zero. 
+template <typename T> +static inline T addlow(T x, T y) { + typedef VectorType<uint8_t, sizeof(T)> bytes; + return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y)); +} + +static inline WideRGBA8 alphas(WideRGBA8 c) { + return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); +} + +static inline WideRGBA8 blend_pixels_RGBA8(PackedRGBA8 pdst, WideRGBA8 src) { + WideRGBA8 dst = unpack(pdst); + const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, + 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0}; + const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, + 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; + const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, + 0, 0, 0, 255, 0, 0, 0, 255}; + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE): + // dst + src.a*(src.rgb1 - dst.rgb0) + // use addlow for signed overflow + return addlow(dst, + muldiv255(alphas(src), (src | ALPHA_OPAQUE) - (dst & RGB_MASK))); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): + return dst - muldiv255(dst, src); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): + return dst - (muldiv255(dst, src) & RGB_MASK); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): + return dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - (muldiv255(dst, src) & ALPHA_MASK); + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + case BLEND_KEY(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): + // src*(1-dst.a) + dst*1 = src - src*dst.a + dst + return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); + case BLEND_KEY(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): + // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst) + // use addlow for signed overflow + return addlow(dst, + muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst)); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor); + return src + dst - muldiv255(dst, secondary); + } + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf, PackedRGBA8 mask) { + PackedRGBA8 dst = unaligned_load<PackedRGBA8>(buf); + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(dst, r); + if (DISCARD) mask |= bit_cast<PackedRGBA8>(fragment_shader->isPixelDiscarded); + unaligned_store(buf, (mask & dst) | (~mask & pack(r))); +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint32_t* buf) { + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), r); + unaligned_store(buf, pack(r)); +} + +static inline PackedRGBA8 span_mask_RGBA8(int span) { + return bit_cast<PackedRGBA8>(I32(span) < I32{1, 2, 3, 4}); +} + +static inline PackedRGBA8 span_mask(uint32_t*, int span) { + return span_mask_RGBA8(span); +} + +static inline WideR8 pack_pixels_R8(Float c) { + return packR8(round_pixel(c)); +} + +static inline WideR8 pack_pixels_R8() { + return pack_pixels_R8(fragment_shader->gl_FragColor.x); +} + 
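// Editorial sketch (illustrative only, not part of this patch): hypothetical
// scalar models of the muldiv255/addlow helpers above, assuming 8-bit color
// values. muldiv255 uses (x*y + x) >> 8 as a cheap stand-in for x*y/255 and
// is exact at the endpoints, e.g. muldiv255(37, 255) = (9435 + 37) >> 8 = 37.
// addlow adds byte-wise, so a blend term that went "negative" (say -3,
// stored as 0xFD in a 16-bit lane with its high byte zeroed) still produces
// the right low byte: 0xFD + 0x0A = 0x107, and dropping the carry leaves
// 0x07 = 10 - 3.
UNUSED static inline uint16_t scalar_muldiv255(uint16_t x, uint16_t y) {
  return (x * y + x) >> 8;  // approximates (x * y) / 255
}
UNUSED static inline uint16_t scalar_addlow(uint16_t x, uint16_t y) {
  // Sum each byte modulo 256 so carries never cross byte boundaries,
  // mirroring the byte-wise vector add in addlow.
  return uint16_t(((x + y) & 0xFF) | ((((x >> 8) + (y >> 8)) & 0xFF) << 8));
}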
+template <typename C> +static inline PackedR8 pack_span(uint8_t*, C c) { + return pack(pack_pixels_R8(c)); +} + +static inline PackedR8 pack_span(uint8_t*) { return pack(pack_pixels_R8()); } + +static inline WideR8 blend_pixels_R8(WideR8 dst, WideR8 src) { + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf, WideR8 mask) { + WideR8 dst = unpack(unaligned_load<PackedR8>(buf)); + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(dst, r); + if (DISCARD) mask |= packR8(fragment_shader->isPixelDiscarded); + unaligned_store(buf, pack((mask & dst) | (~mask & r))); +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint8_t* buf) { + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), r); + unaligned_store(buf, pack(r)); +} + +static inline WideR8 span_mask_R8(int span) { + return bit_cast<WideR8>(WideR8(span) < WideR8{1, 2, 3, 4}); +} + +static inline WideR8 span_mask(uint8_t*, int span) { + return span_mask_R8(span); +} + +template <bool DISCARD, bool W, typename P, typename M> +static inline void commit_output(P* buf, M mask) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf, mask); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf, int span) { + commit_output<DISCARD, W>(buf, span_mask(buf, span)); +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf) { + ZMask4 zmask; + if (check_depth4<true, DISCARD>(z, zbuf, zmask)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } else { + fragment_shader->skip<W>(); + } +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf, int span) { + ZMask4 zmask; + if (check_depth4<false, DISCARD>(z, zbuf, zmask, span)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } +} + +static inline void commit_span(uint32_t* buf, PackedRGBA8 r) { + if (blend_key) + r = pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), unpack(r))); + unaligned_store(buf, r); +} + +UNUSED static inline void commit_solid_span(uint32_t* buf, PackedRGBA8 r, + int len) { + if (blend_key) { + auto src = unpack(r); + for (uint32_t* end = &buf[len]; buf < end; buf += 4) { + unaligned_store( + buf, pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), src))); + } + } else { + fill_n(buf, len, bit_cast<U32>(r).x); + } +} + +UNUSED static inline void commit_texture_span(uint32_t* buf, uint32_t* src, + int len) { + if (blend_key) { + for (uint32_t* end = &buf[len]; buf < end; buf += 4, src += 4) { + PackedRGBA8 r = unaligned_load<PackedRGBA8>(src); + unaligned_store(buf, pack(blend_pixels_RGBA8( + unaligned_load<PackedRGBA8>(buf), unpack(r)))); + } + } else { + memcpy(buf, src, len * sizeof(uint32_t)); + } +} + +static inline void 
commit_span(uint8_t* buf, PackedR8 r) { + if (blend_key) + r = pack(blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), unpack(r))); + unaligned_store(buf, r); +} + +UNUSED static inline void commit_solid_span(uint8_t* buf, PackedR8 r, int len) { + if (blend_key) { + auto src = unpack(r); + for (uint8_t* end = &buf[len]; buf < end; buf += 4) { + unaligned_store(buf, pack(blend_pixels_R8( + unpack(unaligned_load<PackedR8>(buf)), src))); + } + } else { + fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(r)); + } +} + +#define DISPATCH_DRAW_SPAN(self, buf, len) do { \ + int drawn = self->draw_span(buf, len); \ + if (drawn) self->step_interp_inputs(drawn >> 2); \ + for (buf += drawn; drawn < len; drawn += 4, buf += 4) { \ + run(self); \ + commit_span(buf, pack_span(buf)); \ + } \ +} while (0) + +#include "texture.h" #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" @@ -2627,14 +2627,942 @@ void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, #pragma GCC diagnostic ignored "-Wunused-variable" #pragma GCC diagnostic ignored "-Wimplicit-fallthrough" #ifdef __clang__ -# pragma GCC diagnostic ignored "-Wunused-private-field" +#pragma GCC diagnostic ignored "-Wunused-private-field" #else -# pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" #endif #include "load_shader.h" #pragma GCC diagnostic pop -#include "rasterize.h" +typedef vec2_scalar Point2D; +typedef vec4_scalar Point3D; + +struct ClipRect { + float x0; + float y0; + float x1; + float y1; + + ClipRect(const IntRect& i) : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {} + ClipRect(Texture& t) : ClipRect(ctx->apply_scissor(t.bounds())) {} + + template <typename P> + bool overlaps(int nump, const P* p) const { + // Generate a mask of which side of the clip rect all of a polygon's points + // fall inside of. This is a cheap conservative estimate of whether the + // bounding box of the polygon might overlap the clip rect, rather than an + // exact test that would require multiple slower line intersections. + int sides = 0; + for (int i = 0; i < nump; i++) { + sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2; + sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8; + } + return sides == 0xF; + } +}; + +// Helper function for drawing 8-pixel wide chunks of a span with depth buffer. +// Using 8-pixel chunks maximizes use of 16-bit depth values in 128-bit wide +// SIMD register. However, since fragment shaders process only 4 pixels per +// invocation, we need to run fragment shader twice for every 8 pixel batch +// of results we get from the depth test. Perspective is not supported. +template <int FUNC, bool MASK, typename P> +static inline void draw_depth_span(uint16_t z, P* buf, uint16_t* depth, + int span) { + int skip = 0; + // Check if the fragment shader has an optimized draw specialization. + if (fragment_shader->has_draw_span(buf)) { + // The loop tries to accumulate runs of pixels that passed (len) and + // runs of pixels that failed (skip). This allows it to pass the largest + // possible span in between changes in depth pass or fail status to the + // fragment shader's draw specialer. + int len = 0; + do { + ZMask8 zmask; + // Process depth in 8-pixel chunks. + switch (check_depth8<FUNC, MASK>(z, depth, zmask)) { + case 0: // All pixels failed the depth test. + if (len) { + // Flush out passed pixels. + fragment_shader->draw_span(buf - len, len); + len = 0; + } + // Accumulate 2 skipped chunks. 
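// (Editorial note: skip counts 4-pixel fragment shader chunks, so each
// 8-pixel depth chunk that fails the test adds 2 to it.)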
+ skip += 2; + break; + case -1: // All pixels passed the depth test. + if (skip) { + // Flushed out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Accumulate 8 passed pixels. + len += 8; + break; + default: // Mixture of pass and fail results. + if (len) { + // Flush out any passed pixels. + fragment_shader->draw_span(buf - len, len); + len = 0; + } else if (skip) { + // Flush out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Run fragment shader on first 4 depth results. + commit_output<false, false>(buf, unpack(lowHalf(zmask), buf)); + // Run fragment shader on next 4 depth results. + commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf)); + break; + } + // Advance to next 8 pixels... + buf += 8; + depth += 8; + span -= 8; + } while (span >= 8); + // Flush out any remaining passed pixels. + if (len) { + fragment_shader->draw_span(buf - len, len); + } + } else { + // No draw specialization, so we can use a simpler loop here that just + // accumulates depth failures, but otherwise invokes fragment shader + // immediately on depth pass. + do { + ZMask8 zmask; + // Process depth in 8-pixel chunks. + switch (check_depth8<FUNC, MASK>(z, depth, zmask)) { + case 0: // All pixels failed the depth test. + // Accumulate 2 skipped chunks. + skip += 2; + break; + case -1: // All pixels passed the depth test. + if (skip) { + // Flush out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Run the fragment shader for two 4-pixel chunks. + commit_output<false, false>(buf); + commit_output<false, false>(buf + 4); + break; + default: // Mixture of pass and fail results. + if (skip) { + // Flush out any skipped chunks. + fragment_shader->skip(skip); + skip = 0; + } + // Run fragment shader on first 4 depth results. + commit_output<false, false>(buf, unpack(lowHalf(zmask), buf)); + // Run fragment shader on next 4 depth results. + commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf)); + break; + } + // Advance to next 8 pixels... + buf += 8; + depth += 8; + span -= 8; + } while (span >= 8); + } + // Flush out any remaining skipped chunks. + if (skip) { + fragment_shader->skip(skip); + } +} + +// Draw a simple span in 4-pixel wide chunks, optionally using depth. +template <bool DISCARD, bool W, typename P, typename Z> +static ALWAYS_INLINE void draw_span(P* buf, uint16_t* depth, int span, Z z) { + if (depth) { + // Depth testing is enabled. If perspective is used, Z values will vary + // across the span, we use packDepth to generate 16-bit Z values suitable + // for depth testing based on current values from gl_FragCoord.z. + // Otherwise, for the no-perspective case, we just use the provided Z. + // Process 4-pixel chunks first. + for (; span >= 4; span -= 4, buf += 4, depth += 4) { + commit_output<DISCARD, W>(buf, z(), depth); + } + // If there are any remaining pixels, do a partial chunk. + if (span > 0) { + commit_output<DISCARD, W>(buf, z(), depth, span); + } + } else { + // Process 4-pixel chunks first. + for (; span >= 4; span -= 4, buf += 4) { + commit_output<DISCARD, W>(buf); + } + // If there are any remaining pixels, do a partial chunk. + if (span > 0) { + commit_output<DISCARD, W>(buf, span); + } + } +} + +// Draw spans for each row of a given quad (or triangle) with a constant Z +// value. The quad is assumed convex. It is clipped to fall within the given +// clip rect. 
In short, this function rasterizes a quad by first finding a +// top most starting point and then from there tracing down the left and right +// sides of this quad until it hits the bottom, outputting a span between the +// current left and right positions at each row along the way. Points are +// assumed to be ordered in either CW or CCW to support this, but currently +// both orders (CW and CCW) are supported and equivalent. +template <typename P> +static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex, + const ClipRect& clipRect) { + // Only triangles and convex quads supported. + assert(nump == 3 || nump == 4); + Point2D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most (smallest Y) point from which + // rasterization can start. + int top = nump > 3 && p[3].y < p[2].y + ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) + : (p[1].y < p[3].y ? 1 : 3)) + : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) + : (p[1].y < p[2].y ? 1 : 2)); + // Helper to find next index in the points array, walking forward. +#define NEXT_POINT(idx) \ + ({ \ + int cur = (idx) + 1; \ + cur < nump ? cur : 0; \ + }) + // Helper to find the previous index in the points array, walking backward. +#define PREV_POINT(idx) \ + ({ \ + int cur = (idx)-1; \ + cur >= 0 ? cur : nump - 1; \ + }) + // Start looking for "left"-side and "right"-side descending edges starting + // from the determined top point. + int next = NEXT_POINT(top); + int prev = PREV_POINT(top); + if (p[top].y == p[next].y) { + // If the next point is on the same row as the top, then advance one more + // time to the next point and use that as the "left" descending edge. + l0i = next; + l1i = NEXT_POINT(next); + // Assume top and prev form a descending "right" edge, as otherwise this + // will be a collapsed polygon and harmlessly bail out down below. + r0i = top; + r1i = prev; + } else if (p[top].y == p[prev].y) { + // If the prev point is on the same row as the top, then advance to the + // prev again and use that as the "right" descending edge. + // Assume top and next form a non-empty descending "left" edge. + l0i = top; + l1i = next; + r0i = prev; + r1i = PREV_POINT(prev); + } else { + // Both next and prev are on distinct rows from top, so both "left" and + // "right" edges are non-empty/descending. + l0i = r0i = top; + l1i = next; + r1i = prev; + } + // Load the points from the indices. + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // End of left edge + l1 = p[l1i]; // Start of right edge + r1 = p[r1i]; // End of right edge + // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: + // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, + // r1.x, r1.y); + } + + struct Edge + { + float yScale; + float xSlope; + float x; + Interpolants interpSlope; + Interpolants interp; + + Edge(float y, const Point2D& p0, const Point2D& p1, + const Interpolants& i0, const Interpolants& i1) : + // Inverse Y scale for slope calculations. Avoid divide on 0-length edge. + // Later checks below ensure that Y <= p1.y, or otherwise we don't use + // this edge. We just need to guard against Y == p1.y == p0.y. In that + // case, Y - p0.y == 0 and will cancel out the slopes below, except if + // yScale is Inf for some reason (or worse, NaN), which 1/(p1.y-p0.y) + // might produce if we don't bound it. 
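// Editorial note (worked example, not part of this patch): an edge from
// p0 = (10, 2) to p1 = (16, 10) gets yScale = 1/8 and xSlope = 6 * 1/8 =
// 0.75, so starting at the first row center y = 2.5 the edge begins at
// x = 10 + (2.5 - 2) * 0.75 = 10.375 and then advances by 0.75 per row.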
+ yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), + // Calculate dX/dY slope + xSlope((p1.x - p0.x) * yScale), + // Initialize current X based on Y and slope + x(p0.x + (y - p0.y) * xSlope), + // Calculate change in interpolants per change in Y + interpSlope((i1 - i0) * yScale), + // Initialize current interpolants based on Y and slope + interp(i0 + (y - p0.y) * interpSlope) + {} + + void nextRow() { + // step current X and interpolants to next row from slope + x += xSlope; + interp += interpSlope; + } + }; + + // Vertex selection above should result in equal left and right start rows + assert(l0.y == r0.y); + // Find the start y, clip to within the clip rect, and round to row center. + float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f; + // Initialize left and right edges from end points and start Y + Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + // Get pointer to color buffer and depth buffer at current Y + P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P)); + uint16_t* fdepth = + (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t)); + // Loop along advancing Ys, rasterizing spans at each row + float checkY = min(min(l1.y, r1.y), clipRect.y1); + for (;;) { + // Check if we maybe passed edge ends or outside clip rect... + if (y > checkY) { + // If we're outside the clip rect, we're done. + if (y > clipRect.y1) break; + // Helper to find the next non-duplicate vertex that doesn't loop back. +#define STEP_EDGE(e0i, e0, e1i, e1, STEP_POINT, end) \ + for (;;) { \ + /* Set new start of edge to be end of old edge */ \ + e0i = e1i; \ + e0 = e1; \ + /* Set new end of edge to next point */ \ + e1i = STEP_POINT(e1i); \ + e1 = p[e1i]; \ + /* If the edge is descending, use it. */ \ + if (e1.y > e0.y) break; \ + /* If the edge is ascending or crossed the end, we're done. */ \ + if (e1.y < e0.y || e0i == end) return; \ + /* Otherwise, it's a duplicate, so keep searching. */ \ + } + // Check if Y advanced past the end of the left edge + if (y > l1.y) { + // Step to next left edge past Y and reset edge interpolants. + do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y); + left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + } + // Check if Y advanced past the end of the right edge + if (y > r1.y) { + // Step to next right edge past Y and reset edge interpolants. + do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y); + right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + } + // Reset check condition for next time around. + checkY = min(min(l1.y, r1.y), clipRect.y1); + } + // lx..rx form the bounds of the span. WR does not use backface culling, + // so we need to use min/max to support the span in either orientation. + // Clip the span to fall within the clip rect and then round to nearest + // column. + int startx = int(max(min(left.x, right.x), clipRect.x0) + 0.5f); + int endx = int(min(max(left.x, right.x), clipRect.x1) + 0.5f); + // Check if span is non-empty. + int span = endx - startx; + if (span > 0) { + ctx->shaded_rows++; + ctx->shaded_pixels += span; + // Advance color/depth buffer pointers to the start of the span. + P* buf = fbuf + startx; + // Check if the we will need to use depth-buffer or discard on this span. + uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr; + bool use_discard = fragment_shader->use_discard(); + if (depthtex.delay_clear) { + // Delayed clear is enabled for the depth buffer. 
Check if this row + // needs to be cleared. + int yi = int(y); + uint32_t& mask = depthtex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + // The depth buffer is unitialized on this row, but we know it will + // thus be cleared entirely to the clear value. This lets us quickly + // check the constant Z value of the quad against the clear Z to know + // if the entire span passes or fails the depth test all at once. + switch (ctx->depthfunc) { + case GL_LESS: + if (int16_t(z) < int16_t(depthtex.clear_val)) + break; + else + goto next_span; + case GL_LEQUAL: + if (int16_t(z) <= int16_t(depthtex.clear_val)) + break; + else + goto next_span; + } + // If we got here, we passed the depth test. + if (ctx->depthmask) { + // Depth writes are enabled, so we need to initialize depth. + mask |= 1 << (yi & 31); + depthtex.delay_clear--; + if (use_discard) { + // if discard is enabled, we don't know what pixels may be + // written to, so we have to clear the entire row. + force_clear_row<uint16_t>(depthtex, yi); + } else { + // Otherwise, we only need to clear the pixels that fall outside + // the current span on this row. + if (startx > 0 || endx < depthtex.width) { + force_clear_row<uint16_t>(depthtex, yi, startx, endx); + } + // Fill in the span's Z values with constant Z. + clear_buffer<uint16_t>(depthtex, z, 0, + IntRect{startx, yi, endx, yi + 1}); + // We already passed the depth test, so no need to test depth + // any more. + depth = nullptr; + } + } else { + // No depth writes, so don't clear anything, and no need to test. + depth = nullptr; + } + } + } + if (colortex.delay_clear) { + // Delayed clear is enabled for the color buffer. Check if needs clear. + int yi = int(y); + uint32_t& mask = colortex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + mask |= 1 << (yi & 31); + colortex.delay_clear--; + if (depth || blend_key || use_discard) { + // If depth test, blending, or discard is used, old color values + // might be sampled, so we need to clear the entire row to fill it. + force_clear_row<P>(colortex, yi); + } else if (startx > 0 || endx < colortex.width) { + // Otherwise, we only need to clear the row outside of the span. + // The fragment shader will fill the row within the span itself. + force_clear_row<P>(colortex, yi, startx, endx); + } + } + } + // Initialize fragment shader interpolants to current span position. + fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1); + fragment_shader->gl_FragCoord.y = y; + { + // Change in interpolants is difference between current right and left + // edges per the change in right and left X. + Interpolants step = + (right.interp - left.interp) * (1.0f / (right.x - left.x)); + // Advance current interpolants to X at start of span. + Interpolants o = left.interp + step * (startx + 0.5f - left.x); + fragment_shader->init_span(&o, &step, 4.0f); + } + if (!use_discard) { + // Fast paths for the case where fragment discard is not used. + if (depth) { + // If depth is used, we want to process spans in 8-pixel chunks to + // maximize sampling and testing 16-bit depth values within the 128- + // bit width of a SIMD register. + if (span >= 8) { + // Specializations for supported depth functions depending on + // whether depth writes are enabled. 
+ if (ctx->depthfunc == GL_LEQUAL) { + if (ctx->depthmask) + draw_depth_span<GL_LEQUAL, true>(z, buf, depth, span); + else + draw_depth_span<GL_LEQUAL, false>(z, buf, depth, span); + } else { + if (ctx->depthmask) + draw_depth_span<GL_LESS, true>(z, buf, depth, span); + else + draw_depth_span<GL_LESS, false>(z, buf, depth, span); + } + // Advance buffers past processed chunks. + buf += span & ~7; + depth += span & ~7; + span &= 7; + } + } else { + // Check if the fragment shader has an optimized draw specialization. + if (span >= 4 && fragment_shader->has_draw_span(buf)) { + // Draw specialization expects 4-pixel chunks. + int len = span & ~3; + fragment_shader->draw_span(buf, len); + buf += len; + span &= 3; + } + } + draw_span<false, false>(buf, depth, span, [=]{ return z; }); + } else { + // If discard is used, then use slower fallbacks. This should be rare. + // Just needs to work, doesn't need to be too fast yet... + draw_span<true, false>(buf, depth, span, [=]{ return z; }); + } + } + next_span: + // Advance Y and edge interpolants to next row. + y++; + left.nextRow(); + right.nextRow(); + // Advance buffers to next row. + fbuf += colortex.stride(sizeof(P)) / sizeof(P); + fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t); + } +} + +// Draw perspective-correct spans for a convex quad that has been clipped to +// the near and far Z planes, possibly producing a clipped convex polygon with +// more than 4 sides. This assumes the Z value will vary across the spans and +// requires interpolants to factor in W values. This tends to be slower than +// the simpler 2D draw_quad_spans above, especially since we can't optimize the +// depth test easily when Z values, and should be used only rarely if possible. +template <typename P> +static inline void draw_perspective_spans(int nump, Point3D* p, + Interpolants* interp_outs, + Texture& colortex, int layer, + Texture& depthtex, + const ClipRect& clipRect) { + Point3D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most point (smallest Y) from which + // rasterization can start. + int top = 0; + for (int i = 1; i < nump; i++) { + if (p[i].y < p[top].y) { + top = i; + } + } + // Find left-most top point, the start of the left descending edge. + // Advance forward in the points array, searching at most nump points + // in case the polygon is flat. + l0i = top; + for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) { + l0i = i; + } + if (l0i == nump - 1) { + for (int i = 0; i <= top && p[i].y == p[top].y; i++) { + l0i = i; + } + } + // Find right-most top point, the start of the right descending edge. + // Advance backward in the points array, searching at most nump points. + r0i = top; + for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) { + r0i = i; + } + if (r0i == 0) { + for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) { + r0i = i; + } + } + // End of left edge is next point after left edge start. + l1i = NEXT_POINT(l0i); + // End of right edge is prev point after right edge start. + r1i = PREV_POINT(r0i); + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // End of left edge + l1 = p[l1i]; // Start of right edge + r1 = p[r1i]; // End of right edge + } + + struct Edge + { + float yScale; + // Current coordinates for edge. Where in the 2D case of draw_quad_spans, + // it is enough to just track the X coordinate as we advance along the rows, + // for the perspective case we also need to keep track of Z and W. 
For + // simplicity, we just use the full 3D point to track all these coordinates. + Point3D pSlope; + Point3D p; + Interpolants interpSlope; + Interpolants interp; + + Edge(float y, const Point3D& p0, const Point3D& p1, + const Interpolants& i0, const Interpolants& i1) : + // Inverse Y scale for slope calculations. Avoid divide on 0-length edge. + yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), + // Calculate dX/dY slope + pSlope((p1 - p0) * yScale), + // Initialize current coords based on Y and slope + p(p0 + (y - p0.y) * pSlope), + // Crucially, these interpolants must be scaled by the point's 1/w value, + // which allows linear interpolation in a perspective-correct manner. + // This will be canceled out inside the fragment shader later. + // Calculate change in interpolants per change in Y + interpSlope((i1 * p1.w - i0 * p0.w) * yScale), + // Initialize current interpolants based on Y and slope + interp(i0 * p0.w + (y - p0.y) * interpSlope) + {} + + float x() const { return p.x; } + vec2_scalar zw() const { return {p.z, p.w}; } + + void nextRow() { + // step current coords and interpolants to next row from slope + p += pSlope; + interp += interpSlope; + } + }; + + // Vertex selection above should result in equal left and right start rows + assert(l0.y == r0.y); + // Find the start y, clip to within the clip rect, and round to row center. + float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f; + // Initialize left and right edges from end points and start Y + Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + // Get pointer to color buffer and depth buffer at current Y + P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P)); + uint16_t* fdepth = + (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t)); + // Loop along advancing Ys, rasterizing spans at each row + float checkY = min(min(l1.y, r1.y), clipRect.y1); + for (;;) { + // Check if we maybe passed edge ends or outside clip rect... + if (y > checkY) { + // If we're outside the clip rect, we're done. + if (y > clipRect.y1) break; + // Check if Y advanced past the end of the left edge + if (y > l1.y) { + // Step to next left edge past Y and reset edge interpolants. + do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y); + left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]); + } + // Check if Y advanced past the end of the right edge + if (y > r1.y) { + // Step to next right edge past Y and reset edge interpolants. + do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y); + right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]); + } + // Reset check condition for next time around. + checkY = min(min(l1.y, r1.y), clipRect.y1); + } + // lx..rx form the bounds of the span. WR does not use backface culling, + // so we need to use min/max to support the span in either orientation. + // Clip the span to fall within the clip rect and then round to nearest + // column. + int startx = int(max(min(left.x(), right.x()), clipRect.x0) + 0.5f); + int endx = int(min(max(left.x(), right.x()), clipRect.x1) + 0.5f); + // Check if span is non-empty. + int span = endx - startx; + if (span > 0) { + ctx->shaded_rows++; + ctx->shaded_pixels += span; + // Advance color/depth buffer pointers to the start of the span. + P* buf = fbuf + startx; + // Check if the we will need to use depth-buffer or discard on this span. + uint16_t* depth = depthtex.buf != nullptr ? 
fdepth + startx : nullptr; + bool use_discard = fragment_shader->use_discard(); + if (depthtex.delay_clear) { + // Delayed clear is enabled for the depth buffer. Check if this row + // needs to be cleared. + int yi = int(y); + uint32_t& mask = depthtex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + mask |= 1 << (yi & 31); + depthtex.delay_clear--; + // Since Z varies across the span, it's easier to just clear the + // row and rely on later depth testing. If necessary, this could be + // optimized to test against the start and end Z values of the span + // here. + force_clear_row<uint16_t>(depthtex, yi); + } + } + if (colortex.delay_clear) { + // Delayed clear is enabled for the color buffer. Check if needs clear. + int yi = int(y); + uint32_t& mask = colortex.cleared_rows[yi / 32]; + if ((mask & (1 << (yi & 31))) == 0) { + mask |= 1 << (yi & 31); + colortex.delay_clear--; + if (depth || blend_key || use_discard) { + // If depth test, blending, or discard is used, old color values + // might be sampled, so we need to clear the entire row to fill it. + force_clear_row<P>(colortex, yi); + } else if (startx > 0 || endx < colortex.width) { + // Otherwise, we only need to clear the row outside of the span. + // The fragment shader will fill the row within the span itself. + force_clear_row<P>(colortex, yi, startx, endx); + } + } + } + // Initialize fragment shader interpolants to current span position. + fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1); + fragment_shader->gl_FragCoord.y = y; + { + // Calculate the fragment Z and W change per change in fragment X step. + vec2_scalar stepZW = + (right.zw() - left.zw()) * (1.0f / (right.x() - left.x())); + // Calculate initial Z and W values for span start. + vec2_scalar zw = left.zw() + stepZW * (startx + 0.5f - left.x()); + // Set fragment shader's Z and W values so that it can use them to + // cancel out the 1/w baked into the interpolants. + fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x); + fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y); + fragment_shader->stepZW = stepZW * 4.0f; + // Change in interpolants is difference between current right and left + // edges per the change in right and left X. The left and right + // interpolant values were previously multipled by 1/w, so the step and + // initial span values take this into account. + Interpolants step = + (right.interp - left.interp) * (1.0f / (right.x() - left.x())); + // Advance current interpolants to X at start of span. + Interpolants o = left.interp + step * (startx + 0.5f - left.x()); + fragment_shader->init_span<true>(&o, &step, 4.0f); + } + if (!use_discard) { + // No discard is used. Common case. + draw_span<false, true>(buf, depth, span, packDepth); + } else { + // Discard is used. Rare. + draw_span<true, true>(buf, depth, span, packDepth); + } + } + // Advance Y and edge interpolants to next row. + y++; + left.nextRow(); + right.nextRow(); + // Advance buffers to next row. + fbuf += colortex.stride(sizeof(P)) / sizeof(P); + fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t); + } +} + +// Clip a primitive against both sides of a view-frustum axis, producing +// intermediate vertexes with interpolated attributes that will no longer +// intersect the selected axis planes. This assumes the primitive is convex +// and should produce at most N+2 vertexes for each invocation (only in the +// worst case where one point falls outside on each of the opposite sides +// with the rest of the points inside). 
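// Editorial note (worked example, not part of this patch), for the crossing
// offset k computed below: when clipping against the upper plane of an axis
// (side = +1, i.e. C <= W), a previous point with C = 3, W = 1 lies at
// distance 3 - 1 = 2 outside, while a current point with C = 0, W = 1 lies
// at distance 0 - 1 = -1 inside, so k = 2 / (2 - (-1)) = 2/3 and the emitted
// vertex sits two thirds of the way from prev to cur, exactly where C == W
// (here C = W = 1).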
+template <XYZW AXIS> +static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP, + Interpolants* outInterp) { + int numClip = 0; + Point3D prev = p[nump - 1]; + Interpolants prevInterp = interp[nump - 1]; + float prevCoord = prev.select(AXIS); + // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and + // if so, remember which side it is outside of. + int prevSide = prevCoord < -prev.w ? -1 : (prevCoord > prev.w ? 1 : 0); + // Loop through points, finding edges that cross the planes by evaluating + // the side at each point. + for (int i = 0; i < nump; i++) { + Point3D cur = p[i]; + Interpolants curInterp = interp[i]; + float curCoord = cur.select(AXIS); + int curSide = curCoord < -cur.w ? -1 : (curCoord > cur.w ? 1 : 0); + // Check if the previous and current end points are on different sides. + if (curSide != prevSide) { + // One of the edge's end points is outside the plane with the other + // inside the plane. Find the offset where it crosses the plane and + // adjust the point and interpolants to there. + if (prevSide) { + // Edge that was previously outside crosses inside. + // Evaluate plane equation for previous and current end-point + // based on previous side and calculate relative offset. + assert(numClip < nump + 2); + float prevDist = prevCoord - prevSide * prev.w; + float curDist = curCoord - prevSide * cur.w; + float k = prevDist / (prevDist - curDist); + outP[numClip] = prev + (cur - prev) * k; + outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; + numClip++; + } + if (curSide) { + // Edge that was previously inside crosses outside. + // Evaluate plane equation for previous and current end-point + // based on current side and calculate relative offset. + assert(numClip < nump + 2); + float prevDist = prevCoord - curSide * prev.w; + float curDist = curCoord - curSide * cur.w; + float k = prevDist / (prevDist - curDist); + outP[numClip] = prev + (cur - prev) * k; + outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; + numClip++; + } + } + if (!curSide) { + // The current end point is inside the plane, so output point unmodified. + assert(numClip < nump + 2); + outP[numClip] = cur; + outInterp[numClip] = curInterp; + numClip++; + } + prev = cur; + prevInterp = curInterp; + prevCoord = curCoord; + prevSide = curSide; + } + return numClip; +} + +// Helper function to dispatch to perspective span drawing with points that +// have already been transformed and clipped. +static inline void draw_perspective_clipped(int nump, Point3D* p_clip, + Interpolants* interp_clip, + Texture& colortex, int layer, + Texture& depthtex) { + // If polygon is ouside clip rect, nothing to draw. + ClipRect clipRect(colortex); + if (!clipRect.overlaps(nump, p_clip)) { + return; + } + + // Finally draw perspective-correct spans for the polygon. + if (colortex.internal_format == GL_RGBA8) { + draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex, + layer, depthtex, clipRect); + } else if (colortex.internal_format == GL_R8) { + draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex, + layer, depthtex, clipRect); + } else { + assert(false); + } +} + +// Draws a perspective-correct 3D primitive with varying Z value, as opposed +// to a simple 2D planar primitive with a constant Z value that could be +// trivially Z rejected. This requires clipping the primitive against the near +// and far planes to ensure it stays within the valid Z-buffer range. 
The Z +// and W of each fragment of the primitives are interpolated across the +// generated spans and then depth-tested as appropriate. +// Additionally, vertex attributes must be interpolated with perspective- +// correction by dividing by W before interpolation, and then later multiplied +// by W again to produce the final correct attribute value for each fragment. +// This process is expensive and should be avoided if possible for primitive +// batches that are known ahead of time to not need perspective-correction. +static void draw_perspective(int nump, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex) { + // Convert output of vertex shader to screen space. + vec4 pos = vertex_shader->gl_Position; + vec3_scalar scale = + vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; + vec3_scalar offset = + vec3_scalar(ctx->viewport.x0, ctx->viewport.y0, 0.0f) + scale; + if (test_none(pos.z <= -pos.w || pos.z >= pos.w)) { + // No points cross the near or far planes, so no clipping required. + // Just divide coords by W and convert to viewport. + Float w = 1.0f / pos.w; + vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; + Point3D p[4] = { + {screen.x.x, screen.y.x, screen.z.x, w.x}, + {screen.x.y, screen.y.y, screen.z.y, w.y}, + {screen.x.z, screen.y.z, screen.z.z, w.z}, + {screen.x.w, screen.y.w, screen.z.w, w.w} + }; + draw_perspective_clipped(nump, p, interp_outs, colortex, layer, depthtex); + } else { + // Points cross the near or far planes, so we need to clip. + // Start with the original 3 or 4 points... + Point3D p[4] = { + {pos.x.x, pos.y.x, pos.z.x, pos.w.x}, + {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, + {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, + {pos.x.w, pos.y.w, pos.z.w, pos.w.w} + }; + // Clipping can expand the points by 1 for each of 6 view frustum planes. + Point3D p_clip[4 + 6]; + Interpolants interp_clip[4 + 6]; + // Clip against near and far Z planes. + nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip); + // If no points are left inside the view frustum, there's nothing to draw. + if (nump < 3) { + return; + } + // After clipping against only the near and far planes, we might still + // produce points where W = 0, exactly at the camera plane. OpenGL specifies + // that for clip coordinates, points must satisfy: + // -W <= X <= W + // -W <= Y <= W + // -W <= Z <= W + // When Z = W = 0, this is trivially satisfied, but when we transform and + // divide by W below it will produce a divide by 0. Usually we want to only + // clip Z to avoid the extra work of clipping X and Y. We can still project + // points that fall outside the view frustum X and Y so long as Z is valid. + // The span drawing code will then ensure X and Y are clamped to viewport + // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, + // will push W further inside the view frustum so that it is no longer 0, + // allowing us to finally proceed to projecting the points to the screen. + for (int i = 0; i < nump; i++) { + // Found an invalid W, so need to clip against X and Y... + if (p_clip[i].w <= 0.0f) { + // Ping-pong p_clip -> p_tmp -> p_clip. + Point3D p_tmp[4 + 6]; + Interpolants interp_tmp[4 + 6]; + nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp); + if (nump < 3) return; + nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip); + if (nump < 3) return; + // After clipping against X and Y planes, there's still points left + // to draw, so proceed to trying projection now... 
+ break; + } + } + // Divide coords by W and convert to viewport. + for (int i = 0; i < nump; i++) { + float w = 1.0f / p_clip[i].w; + p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w); + } + draw_perspective_clipped(nump, p_clip, interp_clip, colortex, layer, + depthtex); + } +} + +static void draw_quad(int nump, Texture& colortex, int layer, + Texture& depthtex) { + // Run vertex shader once for the primitive's vertices. + // Reserve space for 6 sets of interpolants, in case we need to clip against + // near and far planes in the perspective case. + Interpolants interp_outs[4]; + vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants)); + vec4 pos = vertex_shader->gl_Position; + // Check if any vertex W is different from another. If so, use perspective. + if (test_any(pos.w != pos.w.x)) { + draw_perspective(nump, interp_outs, colortex, layer, depthtex); + return; + } + + // Convert output of vertex shader to screen space. + // Divide coords by W and convert to viewport. + float w = 1.0f / pos.w.x; + vec2 screen = + (pos.sel(X, Y) * w + 1) * 0.5f * + vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) + + vec2_scalar(ctx->viewport.x0, ctx->viewport.y0); + Point2D p[4] = {{screen.x.x, screen.y.x}, + {screen.x.y, screen.y.y}, + {screen.x.z, screen.y.z}, + {screen.x.w, screen.y.w}}; + + // If quad is ouside clip rect, nothing to draw. + ClipRect clipRect(colortex); + if (!clipRect.overlaps(nump, p)) { + return; + } + + // Since the quad is assumed 2D, Z is constant across the quad. + float screenZ = (pos.z.x * w + 1) * 0.5f; + if (screenZ < 0 || screenZ > 1) { + // Z values would cross the near or far plane, so just bail. + return; + } + // Since Z doesn't need to be interpolated, just set the fragment shader's + // Z and W values here, once and for all fragment shader invocations. + // SSE2 does not support unsigned comparison, so bias Z to be negative. + uint16_t z = uint16_t(0xFFFF * screenZ) - 0x8000; + fragment_shader->gl_FragCoord.z = screenZ; + fragment_shader->gl_FragCoord.w = w; + + // Finally draw 2D spans for the quad. Currently only supports drawing to + // RGBA8 and R8 color buffers. + if (colortex.internal_format == GL_RGBA8) { + draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, layer, + depthtex, clipRect); + } else if (colortex.internal_format == GL_R8) { + draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, layer, depthtex, + clipRect); + } else { + assert(false); + } +} void VertexArray::validate() { int last_enabled = -1; @@ -2653,32 +3581,78 @@ void VertexArray::validate() { max_attrib = last_enabled; } +template <typename INDEX> +static inline void draw_elements(GLsizei count, GLsizei instancecount, + Buffer& indices_buf, size_t offset, + VertexArray& v, Texture& colortex, int layer, + Texture& depthtex) { + assert((offset & (sizeof(INDEX) - 1)) == 0); + INDEX* indices = (INDEX*)(indices_buf.buf + offset); + count = min(count, + (GLsizei)((indices_buf.size - offset) / sizeof(INDEX))); + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + if (count == 6 && indices[1] == indices[0] + 1 && + indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) { + assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1); + // Fast path - since there is only a single quad, we only load per-vertex + // attribs once for all instances, as they won't change across instances + // or within an instance. 
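For reference, the index layout this single-quad fast path keys on is two triangles sharing an edge, emitted as base+0,1,2 followed by base+2,1,3; the check above only inspects indices 1, 2 and 5 and asserts the middle two. A minimal standalone version of the pattern test, assuming a plain uint16_t index array rather than the element buffer used here:

#include <cassert>
#include <cstdint>

// Returns true if the six indices starting at `idx` describe a single quad
// as two triangles sharing an edge: base+0,1,2 followed by base+2,1,3.
static bool is_quad_pattern(const uint16_t* idx) {
  uint16_t base = idx[0];
  return idx[1] == base + 1 && idx[2] == base + 2 &&
         idx[3] == base + 2 && idx[4] == base + 1 && idx[5] == base + 3;
}

int main() {
  const uint16_t quad[6] = {8, 9, 10, 10, 9, 11};  // hits the fast path
  const uint16_t fan[6] = {8, 9, 10, 8, 10, 11};   // does not
  assert(is_quad_pattern(quad));
  assert(!is_quad_pattern(fan));
  return 0;
}

An index stream in any other order falls through to the general per-instance loop below, which itself only draws runs whose indices follow the same sequential pattern.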
+ vertex_shader->load_attribs(v.attribs, indices[0], 0, 4); + draw_quad(4, colortex, layer, depthtex); + for (GLsizei instance = 1; instance < instancecount; instance++) { + vertex_shader->load_attribs(v.attribs, indices[0], instance, 0); + draw_quad(4, colortex, layer, depthtex); + } + } else { + for (GLsizei instance = 0; instance < instancecount; instance++) { + for (GLsizei i = 0; i + 3 <= count; i += 3) { + if (indices[i + 1] != indices[i] + 1 || + indices[i + 2] != indices[i] + 2) { + continue; + } + int nump = 3; + if (i + 6 <= count && indices[i + 5] == indices[i] + 3) { + assert(indices[i + 3] == indices[i] + 2 && + indices[i + 4] == indices[i] + 1); + nump = 4; + i += 3; + } + vertex_shader->load_attribs(v.attribs, indices[i], instance, nump); + draw_quad(nump, colortex, layer, depthtex); + } + } + } +} + extern "C" { void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, - GLintptr offset, GLsizei instancecount) { - if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader || - !fragment_shader) { + void* indicesptr, GLsizei instancecount) { + assert(mode == GL_TRIANGLES); + assert(type == GL_UNSIGNED_SHORT || type == GL_UNSIGNED_INT); + if (count <= 0 || instancecount <= 0) { return; } - Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); - if (!fb.color_attachment) { - return; - } + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER); Texture& colortex = ctx->textures[fb.color_attachment]; if (!colortex.buf) { return; } - assert(!colortex.locked); assert(colortex.internal_format == GL_RGBA8 || colortex.internal_format == GL_R8); Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0]; if (depthtex.buf) { - assert(depthtex.internal_format == GL_DEPTH_COMPONENT24); + assert(depthtex.internal_format == GL_DEPTH_COMPONENT16); assert(colortex.width == depthtex.width && colortex.height == depthtex.height); - assert(colortex.offset == depthtex.offset); + } + + Buffer& indices_buf = ctx->buffers[ctx->element_array_buffer_binding]; + size_t offset = (size_t)indicesptr; + if (!indices_buf.buf || offset >= indices_buf.size) { + return; } // debugf("current_vertex_array %d\n", ctx->current_vertex_array); @@ -2689,8 +3663,8 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, v.validate(); } -#ifdef PRINT_TIMINGS - uint64_t start = get_time_value(); +#ifndef NDEBUG + // uint64_t start = get_time_value(); #endif ctx->shaded_rows = 0; @@ -2698,43 +3672,14 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, vertex_shader->init_batch(); - switch (type) { - case GL_UNSIGNED_SHORT: - assert(mode == GL_TRIANGLES); - draw_elements<uint16_t>(count, instancecount, offset, v, colortex, - depthtex); - break; - case GL_UNSIGNED_INT: - assert(mode == GL_TRIANGLES); - draw_elements<uint32_t>(count, instancecount, offset, v, colortex, - depthtex); - break; - case GL_NONE: - // Non-standard GL extension - if element type is GL_NONE, then we don't - // use any element buffer and behave as if DrawArrays was called instead. 
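One detail worth calling out in the new DrawElementsInstanced path above: the indices argument is reinterpreted as a byte offset into the bound element-array buffer, so the draw has to re-derive a typed pointer, check alignment, and clamp the element count to what the buffer actually holds past that offset. A small sketch of that bookkeeping, using hypothetical names and a plain std::vector in place of the Buffer object:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Clamp an element count to the indices that actually fit in the buffer past
// `offset`, mirroring the alignment assert and min() in draw_elements above.
template <typename INDEX>
static const INDEX* resolve_indices(const std::vector<uint8_t>& buf,
                                    size_t offset, int& count) {
  assert((offset & (sizeof(INDEX) - 1)) == 0);  // byte offset must be aligned
  if (offset >= buf.size()) {
    count = 0;
    return nullptr;
  }
  size_t avail = (buf.size() - offset) / sizeof(INDEX);
  count = (int)std::min<size_t>((size_t)count, avail);
  return reinterpret_cast<const INDEX*>(buf.data() + offset);
}

int main() {
  std::vector<uint8_t> buf(10 * sizeof(uint16_t));  // room for 10 indices
  int count = 12;                                    // caller asks for more
  const uint16_t* idx = resolve_indices<uint16_t>(buf, 4, count);
  assert(idx != nullptr && count == 8);  // 4-byte offset leaves 8 indices
  return 0;
}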
- for (GLsizei instance = 0; instance < instancecount; instance++) { - switch (mode) { - case GL_LINES: - for (GLsizei i = 0; i + 2 <= count; i += 2) { - vertex_shader->load_attribs(v.attribs, offset + i, instance, 2); - draw_quad(2, colortex, depthtex); - } - break; - case GL_TRIANGLES: - for (GLsizei i = 0; i + 3 <= count; i += 3) { - vertex_shader->load_attribs(v.attribs, offset + i, instance, 3); - draw_quad(3, colortex, depthtex); - } - break; - default: - assert(false); - break; - } - } - break; - default: - assert(false); - break; + if (type == GL_UNSIGNED_SHORT) { + draw_elements<uint16_t>(count, instancecount, indices_buf, offset, v, + colortex, fb.layer, depthtex); + } else if (type == GL_UNSIGNED_INT) { + draw_elements<uint32_t>(count, instancecount, indices_buf, offset, v, + colortex, fb.layer, depthtex); + } else { + assert(false); } if (ctx->samples_passed_query) { @@ -2742,66 +3687,329 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type, q.value += ctx->shaded_pixels; } -#ifdef PRINT_TIMINGS - uint64_t end = get_time_value(); - printf( - "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, " - "%fns/pixel)\n", - double(end - start) / (1000. * 1000.), - ctx->programs[ctx->current_program].impl->get_name(), instancecount, - ctx->shaded_pixels, ctx->shaded_rows, - double(ctx->shaded_pixels) / ctx->shaded_rows, - double(end - start) / max(ctx->shaded_pixels, 1)); +#ifndef NDEBUG + // uint64_t end = get_time_value(); + // debugf("draw(%d): %fms for %d pixels in %d rows (avg %f pixels/row, %f + // ns/pixel)\n", instancecount, double(end - start)/(1000.*1000.), + // ctx->shaded_pixels, ctx->shaded_rows, + // double(ctx->shaded_pixels)/ctx->shaded_rows, double(end - + // start)/max(ctx->shaded_pixels, 1)); #endif } -void Finish() { -#ifdef PRINT_TIMINGS - printf("Finish\n"); -#endif +} // extern "C" + +template <typename P> +static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth, + int span) { + int frac = 0; + for (P* end = dst + span; dst < end; dst++) { + *dst = *src; + // Step source according to width ratio. + for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) { + src++; + } + } } -void MakeCurrent(Context* c) { - if (ctx == c) { +static void scale_blit(Texture& srctex, const IntRect& srcReq, int srcZ, + Texture& dsttex, const IntRect& dstReq, int dstZ, + bool invertY) { + // Cache scaling ratios + int srcWidth = srcReq.width(); + int srcHeight = srcReq.height(); + int dstWidth = dstReq.width(); + int dstHeight = dstReq.height(); + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY); + // Compute valid source bounds + // Scale source to dest, rounding inward to avoid sampling outside source + IntRect srcBounds = srctex.sample_bounds(srcReq) + .scale(srcWidth, srcHeight, dstWidth, dstHeight, true); + // Limit dest sampling bounds to overlap source bounds + dstBounds.intersect(srcBounds); + // Check if sampling bounds are empty + if (dstBounds.is_empty()) { return; } - ctx = c; - setup_program(ctx ? 
ctx->current_program : 0); + // Compute final source bounds from clamped dest sampling bounds + srcBounds = IntRect(dstBounds) + .scale(dstWidth, dstHeight, srcWidth, srcHeight); + // Calculate source and dest pointers from clamped offsets + int bpp = srctex.bpp(); + int srcStride = srctex.stride(bpp); + int destStride = dsttex.stride(bpp); + char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY); + char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ); + // Inverted Y must step downward along dest rows + if (invertY) { + destStride = -destStride; + } + int span = dstBounds.width(); + int frac = 0; + for (int rows = dstBounds.height(); rows > 0; rows--) { + if (srcWidth == dstWidth) { + // No scaling, so just do a fast copy. + memcpy(dest, src, span * bpp); + } else { + // Do scaling with different source and dest widths. + switch (bpp) { + case 1: + scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span); + break; + case 2: + scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span); + break; + case 4: + scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span); + break; + default: + assert(false); + break; + } + } + dest += destStride; + // Step source according to height ratio. + for (frac += srcHeight; frac >= dstHeight; frac -= dstHeight) { + src += srcStride; + } + } +} + +static void linear_row(uint32_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, int srcZOffset, sampler2DArray sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset); + unaligned_store(dest, srcpx); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset); + auto mask = span_mask_RGBA8(span); + auto dstpx = unaligned_load<PackedRGBA8>(dest); + unaligned_store(dest, (mask & dstpx) | (~mask & srcpx)); + } } -Context* CreateContext() { return new Context; } +static void linear_row(uint8_t* dest, int span, const vec2_scalar& srcUV, + float srcDU, int srcZOffset, sampler2DArray sampler) { + vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f)); + for (; span >= 4; span -= 4) { + auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset); + unaligned_store(dest, pack(srcpx)); + dest += 4; + uv.x += 4 * srcDU; + } + if (span > 0) { + auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset); + auto mask = span_mask_R8(span); + auto dstpx = unpack(unaligned_load<PackedR8>(dest)); + unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx))); + } +} -void ReferenceContext(Context* c) { - if (!c) { +static void linear_blit(Texture& srctex, const IntRect& srcReq, int srcZ, + Texture& dsttex, const IntRect& dstReq, int dstZ, + bool invertY) { + assert(srctex.internal_format == GL_RGBA8 || + srctex.internal_format == GL_R8); + // Compute valid dest bounds + IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY); + // Check if sampling bounds are empty + if (dstBounds.is_empty()) { return; } - ++c->references; + // Initialize sampler for source texture + sampler2DArray_impl sampler; + init_sampler(&sampler, srctex); + init_depth(&sampler, srctex); + sampler.filter = TextureFilter::LINEAR; + // Compute source UVs + int srcZOffset = srcZ * sampler.height_stride; + vec2_scalar srcUV(srcReq.x0, srcReq.y0); + vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(), + float(srcReq.height()) / dstReq.height()); + // Skip to clamped source start + srcUV += srcDUV * 
vec2_scalar(dstBounds.x0, dstBounds.y0); + // Offset source UVs to texel centers and scale by lerp precision + srcUV = linearQuantize(srcUV + 0.5f, 128); + srcDUV *= 128.0f; + // Calculate dest pointer from clamped offsets + int bpp = dsttex.bpp(); + int destStride = dsttex.stride(bpp); + char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY); + // Inverted Y must step downward along dest rows + if (invertY) { + destStride = -destStride; + } + int span = dstBounds.width(); + for (int rows = dstBounds.height(); rows > 0; rows--) { + switch (bpp) { + case 1: + linear_row((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset, + &sampler); + break; + case 4: + linear_row((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset, + &sampler); + break; + default: + assert(false); + break; + } + dest += destStride; + srcUV.y += srcDUV.y; + } } -void DestroyContext(Context* c) { - if (!c) { +extern "C" { + +void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, + GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, + GLbitfield mask, GLenum filter) { + assert(mask == GL_COLOR_BUFFER_BIT); + Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!srcfb || srcfb->layer < 0) return; + Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER); + if (!dstfb || dstfb->layer < 0) return; + Texture& srctex = ctx->textures[srcfb->color_attachment]; + if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return; + Texture& dsttex = ctx->textures[dstfb->color_attachment]; + if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return; + if (srctex.internal_format != dsttex.internal_format) { + assert(false); return; } - assert(c->references > 0); - --c->references; - if (c->references > 0) { + // Force flipped Y onto dest coordinates + if (srcY1 < srcY0) { + swap(srcY0, srcY1); + swap(dstY0, dstY1); + } + bool invertY = dstY1 < dstY0; + if (invertY) { + swap(dstY0, dstY1); + } + IntRect srcReq = {srcX0, srcY0, srcX1, srcY1}; + IntRect dstReq = {dstX0, dstY0, dstX1, dstY1}; + if (srcReq.is_empty() || dstReq.is_empty()) { return; } - if (ctx == c) { - MakeCurrent(nullptr); + prepare_texture(srctex); + prepare_texture(dsttex, &dstReq); + if (!srcReq.same_size(dstReq) && filter == GL_LINEAR && + (srctex.internal_format == GL_RGBA8 || + srctex.internal_format == GL_R8)) { + linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer, + invertY); + } else { + scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer, + invertY); } - delete c; } -size_t ReportMemory(size_t (*size_of_op)(void*)) { - size_t size = 0; +void Finish() {} + +void MakeCurrent(void* ctx_ptr) { + ctx = (Context*)ctx_ptr; if (ctx) { - for (auto& t : ctx->textures) { - if (t && t->should_free()) { - size += size_of_op(t->buf); + setup_program(ctx->current_program); + blend_key = ctx->blend ? 
ctx->blend_key : BLEND_KEY_NONE; + } else { + setup_program(0); + blend_key = BLEND_KEY_NONE; + } +} + +void* CreateContext() { return new Context; } + +void DestroyContext(void* ctx_ptr) { + if (!ctx_ptr) { + return; + } + if (ctx == ctx_ptr) { + MakeCurrent(nullptr); + } + delete (Context*)ctx_ptr; +} + +void Composite(GLuint srcId, GLint srcX, GLint srcY, GLsizei srcWidth, + GLsizei srcHeight, GLint dstX, GLint dstY, GLboolean opaque, + GLboolean flip) { + Framebuffer& fb = ctx->framebuffers[0]; + if (!fb.color_attachment) { + return; + } + Texture& srctex = ctx->textures[srcId]; + if (!srctex.buf) return; + prepare_texture(srctex); + Texture& dsttex = ctx->textures[fb.color_attachment]; + if (!dsttex.buf) return; + assert(srctex.bpp() == 4); + const int bpp = 4; + size_t src_stride = srctex.stride(bpp); + size_t dest_stride = dsttex.stride(bpp); + if (srcY < 0) { + dstY -= srcY; + srcHeight += srcY; + srcY = 0; + } + if (dstY < 0) { + srcY -= dstY; + srcHeight += dstY; + dstY = 0; + } + if (srcY + srcHeight > srctex.height) { + srcHeight = srctex.height - srcY; + } + if (dstY + srcHeight > dsttex.height) { + srcHeight = dsttex.height - dstY; + } + IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; + prepare_texture(dsttex, &skip); + char* dest = dsttex.sample_ptr(dstX, flip ? dsttex.height - 1 - dstY : dstY, + fb.layer, bpp, dest_stride); + char* src = srctex.sample_ptr(srcX, srcY, 0, bpp, src_stride); + if (flip) { + dest_stride = -dest_stride; + } + if (opaque) { + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } + } else { + for (int y = 0; y < srcHeight; y++) { + char* end = src + srcWidth * bpp; + while (src + 4 * bpp <= end) { + WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); + WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); + PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))); + unaligned_store(dest, r); + src += 4 * bpp; + dest += 4 * bpp; } + if (src < end) { + WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src)); + WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest)); + U32 r = bit_cast<U32>( + pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)))); + unaligned_store(dest, r.x); + if (src + bpp < end) { + unaligned_store(dest + bpp, r.y); + if (src + 2 * bpp < end) { + unaligned_store(dest + 2 * bpp, r.z); + } + } + dest += end - src; + src = end; + } + dest += dest_stride - srcWidth * bpp; + src += src_stride - srcWidth * bpp; } } - return size; } + } // extern "C" diff --git a/third_party/webrender/swgl/src/gl_defs.h b/third_party/webrender/swgl/src/gl_defs.h index 22219366ecf..c7e87230a3d 100644 --- a/third_party/webrender/swgl/src/gl_defs.h +++ b/third_party/webrender/swgl/src/gl_defs.h @@ -15,27 +15,20 @@ typedef float GLfloat; typedef double GLdouble; typedef uint32_t GLenum; -typedef uint8_t GLboolean; +typedef int32_t GLboolean; typedef uint32_t GLbitfield; typedef int32_t GLsizei; typedef size_t GLsizeiptr; typedef intptr_t GLintptr; -#define GL_FALSE 0 -#define GL_TRUE 1 - -#define GL_NONE 0 - #define GL_NO_ERROR 0 #define GL_RGBA32F 0x8814 #define GL_RGBA8 0x8058 #define GL_R8 0x8229 -#define GL_R16 0x822A #define GL_RGBA32I 0x8D82 #define GL_BGRA8 0x93A1 -#define GL_RG8 0x822B #define GL_BYTE 0x1400 #define GL_UNSIGNED_BYTE 0x1401 @@ -44,7 +37,6 @@ typedef intptr_t GLintptr; #define GL_INT 0x1404 #define GL_UNSIGNED_INT 0x1405 #define GL_FLOAT 0x1406 -#define GL_DOUBLE 0x1408 #define GL_RED 0x1903 #define GL_GREEN 0x1904 
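Stepping back to the Composite() entry point added to gl.cc above: the non-opaque loop applies the premultiplied-alpha source-over operator, dst' = src + dst - dst * srcA / 255, four pixels at a time, where alphas() appears to broadcast the source alpha across the four channels. A per-channel scalar sketch of the same formula (hypothetical helpers, plain uint8_t channels instead of WideRGBA8 lanes):

#include <cassert>
#include <cstdint>

// Scalar stand-in for the muldiv255 helper called above: (x * y) / 255 with
// a cheap rounding correction so that 255 * 255 maps back to 255.
static uint8_t muldiv255(uint32_t x, uint32_t y) {
  uint32_t t = x * y + 128;
  return (uint8_t)((t + (t >> 8)) >> 8);
}

// Premultiplied source-over for one channel: dst' = src + dst - dst*srcA/255.
static uint8_t over_premul(uint8_t src, uint8_t dst, uint8_t srcA) {
  return (uint8_t)(src + dst - muldiv255(dst, srcA));
}

int main() {
  // A fully opaque source replaces the destination channel...
  assert(over_premul(200, 80, 255) == 200);
  // ...and a fully transparent premultiplied source (all zeroes) keeps it.
  assert(over_premul(0, 80, 0) == 80);
  return 0;
}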
@@ -54,7 +46,6 @@ typedef intptr_t GLintptr; #define GL_RGBA 0x1908 #define GL_RGBA_INTEGER 0x8D99 #define GL_BGRA 0x80E1 -#define GL_RG 0x8227 #define GL_DEPTH_COMPONENT 0x1902 #define GL_DEPTH_COMPONENT16 0x81A5 @@ -155,8 +146,6 @@ typedef intptr_t GLintptr; #define GL_ONE_MINUS_SRC1_ALPHA 0x88FB #define GL_FUNC_ADD 0x8006 -#define GL_MIN 0x8007 -#define GL_MAX 0x8008 #define GL_NEVER 0x0200 #define GL_LESS 0x0201 @@ -176,9 +165,6 @@ typedef intptr_t GLintptr; #define GL_VERSION 0x1F02 #define GL_EXTENSIONS 0x1F03 #define GL_NUM_EXTENSIONS 0x821D -#define GL_MINOR_VERSION 0x821C -#define GL_MAJOR_VERSION 0x821B -#define GL_SHADING_LANGUAGE_VERSION 0x8B8C #define GL_POINTS 0x0000 #define GL_LINES 0x0001 @@ -188,29 +174,3 @@ typedef intptr_t GLintptr; #define GL_TRIANGLE_STRIP 0x0005 #define GL_TRIANGLE_FAN 0x0006 #define GL_QUADS 0x0007 - -#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367 - -#define GL_RGB_422_APPLE 0x8A1F -#define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA -#define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB -#define GL_RGB_RAW_422_APPLE 0x8A51 - -#define GL_MULTIPLY_KHR 0x9294 -#define GL_SCREEN_KHR 0x9295 -#define GL_OVERLAY_KHR 0x9296 -#define GL_DARKEN_KHR 0x9297 -#define GL_LIGHTEN_KHR 0x9298 -#define GL_COLORDODGE_KHR 0x9299 -#define GL_COLORBURN_KHR 0x929A -#define GL_HARDLIGHT_KHR 0x929B -#define GL_SOFTLIGHT_KHR 0x929C -#define GL_DIFFERENCE_KHR 0x929E -#define GL_EXCLUSION_KHR 0x92A0 -#define GL_HSL_HUE_KHR 0x92AD -#define GL_HSL_SATURATION_KHR 0x92AE -#define GL_HSL_COLOR_KHR 0x92AF -#define GL_HSL_LUMINOSITY_KHR 0x92B0 - -#define SWGL_BLEND_DROP_SHADOW 0xB001 -#define SWGL_BLEND_SUBPIXEL_TEXT 0xB002 diff --git a/third_party/webrender/swgl/src/glsl.h b/third_party/webrender/swgl/src/glsl.h index bec63858b0d..cdedb43d567 100644 --- a/third_party/webrender/swgl/src/glsl.h +++ b/third_party/webrender/swgl/src/glsl.h @@ -2,45 +2,14 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +// Some of this is copied from Skia and is governed by a BSD-style license +// Every function in this file should be marked static and inline using SI. 
#define SI ALWAYS_INLINE static #include "vector_type.h" namespace glsl { -enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8, RG8, R16, YUV422 }; - -enum TextureFilter { NEAREST, LINEAR }; - -struct samplerCommon { - uint32_t* buf = nullptr; - uint32_t stride = 0; // in units of BPP if < 4, or dwords if BPP >= 4 - uint32_t height = 0; - uint32_t width = 0; - TextureFormat format = TextureFormat::RGBA8; -}; - -struct samplerFilter { - TextureFilter filter = TextureFilter::NEAREST; -}; - -struct sampler2D_impl : samplerCommon, samplerFilter {}; -typedef sampler2D_impl* sampler2D; - -typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8; -typedef struct sampler2DRG8_impl : sampler2D_impl{} * sampler2DRG8; -typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8; -typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F; - -struct isampler2D_impl : samplerCommon {}; -typedef isampler2D_impl* isampler2D; - -struct isampler2DRGBA32I_impl : isampler2D_impl {}; -typedef isampler2DRGBA32I_impl* isampler2DRGBA32I; - -struct sampler2DRect_impl : samplerCommon, samplerFilter {}; -typedef sampler2DRect_impl* sampler2DRect; - #if USE_SSE2 SI bool test_all(Bool cond) { return _mm_movemask_ps(cond) == 0xF; } SI bool test_any(Bool cond) { return _mm_movemask_ps(cond) != 0; } @@ -49,14 +18,9 @@ SI bool test_none(Bool cond) { return _mm_movemask_ps(cond) == 0; } SI bool test_all(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0xFFFFFFFFU; } -SI bool test_any(Bool cond) { - return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0; -} -SI bool test_none(Bool cond) { - return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0; -} +SI bool test_any(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) != 0; } +SI bool test_none(Bool cond) { return bit_cast<uint32_t>(CONVERT(cond, U8)) == 0; } #endif -SI bool test_equal(Bool cond) { return test_none(cond != cond.x); } float make_float(float n) { return n; } @@ -110,23 +74,17 @@ struct vec4; struct ivec2; SI int32_t if_then_else(int32_t c, int32_t t, int32_t e) { return c ? t : e; } -SI int32_t if_then_else(bool c, int32_t t, int32_t e) { return c ? t : e; } SI float if_then_else(int32_t c, float t, float e) { return c ? t : e; } SI Float if_then_else(I32 c, float t, float e) { - return bit_cast<Float>((c & bit_cast<I32>(Float(t))) | - (~c & bit_cast<I32>(Float(e)))); + return bit_cast<Float>((c & bit_cast<I32>(Float(t))) | (~c & bit_cast<I32>(Float(e)))); } SI I32 if_then_else(I32 c, int32_t t, int32_t e) { return (c & I32(t)) | (~c & I32(e)); } -SI U32 if_then_else(I32 c, U32 t, U32 e) { - return bit_cast<U32>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e))); -} - SI Float if_then_else(I32 c, Float t, Float e) { return bit_cast<Float>((c & bit_cast<I32>(t)) | (~c & bit_cast<I32>(e))); } @@ -137,10 +95,7 @@ SI Bool if_then_else(I32 c, Bool t, Bool e) { return (c & t) | (~c & e); } SI Bool if_then_else(int32_t c, Bool t, Bool e) { return c ? t : e; } -SI I16 if_then_else(I16 c, I16 t, I16 e) { return (c & t) | (~c & e); } - -template <typename T> -SI void swap(T& a, T& b) { +template <typename T> SI void swap(T& a, T& b) { T t(a); a = b; b = t; @@ -201,37 +156,7 @@ SI Float sqrt(Float v) { #endif } -SI float recip(float x) { -#if USE_SSE2 - return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(x))); -#else - return 1.0f / x; -#endif -} - -// Use a fast vector reciprocal approximation when available. 
This should only -// be used in cases where it is okay that the approximation is imprecise - -// essentially visually correct but numerically wrong. Otherwise just rely on -// however the compiler would implement slower division if the platform doesn't -// provide a convenient intrinsic. -SI Float recip(Float v) { -#if USE_SSE2 - return _mm_rcp_ps(v); -#elif USE_NEON - Float e = vrecpeq_f32(v); - return vrecpsq_f32(v, e) * e; -#else - return 1.0f / v; -#endif -} - -SI float inversesqrt(float x) { -#if USE_SSE2 - return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x))); -#else - return 1.0f / sqrtf(x); -#endif -} +SI float inversesqrt(float x) { return 1.0f / sqrtf(x); } SI Float inversesqrt(Float v) { #if USE_SSE2 @@ -269,45 +194,18 @@ enum XYZW { A = 3, }; -struct bvec4_scalar; - struct bvec2_scalar { bool x; bool y; bvec2_scalar() : bvec2_scalar(false) {} - IMPLICIT constexpr bvec2_scalar(bool a) : x(a), y(a) {} + constexpr bvec2_scalar(bool a) : x(a), y(a) {} constexpr bvec2_scalar(bool x, bool y) : x(x), y(y) {} - - bool& select(XYZW c) { - switch (c) { - case X: - return x; - case Y: - return y; - default: - UNREACHABLE; - } - } - bool sel(XYZW c1) { return select(c1); } - - bvec2_scalar sel(XYZW c1, XYZW c2) { - return bvec2_scalar(select(c1), select(c2)); - } - bvec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); -}; - -struct bvec2_scalar1 { - bool x; - - IMPLICIT constexpr bvec2_scalar1(bool a) : x(a) {} - - operator bvec2_scalar() const { return bvec2_scalar(x); } }; struct bvec2 { bvec2() : bvec2(0) {} - IMPLICIT bvec2(Bool a) : x(a), y(a) {} + bvec2(Bool a) : x(a), y(a) {} bvec2(Bool x, Bool y) : x(x), y(y) {} Bool& select(XYZW c) { switch (c) { @@ -321,15 +219,13 @@ struct bvec2 { } Bool sel(XYZW c1) { return select(c1); } - bvec2 sel(XYZW c1, XYZW c2) { return bvec2(select(c1), select(c2)); } - bvec2 operator~() { return bvec2(~x, ~y); } Bool x; Bool y; }; -bvec2_scalar1 make_bvec2(bool n) { return bvec2_scalar1(n); } +bvec2_scalar make_bvec2(bool n) { return bvec2_scalar{n, n}; } bvec2_scalar make_bvec2(bool x, bool y) { return bvec2_scalar{x, y}; } @@ -353,8 +249,8 @@ struct vec2_scalar { float y; constexpr vec2_scalar() : vec2_scalar(0.0f) {} - IMPLICIT constexpr vec2_scalar(float a) : x(a), y(a) {} - IMPLICIT constexpr vec2_scalar(int a) : x(a), y(a) {} + constexpr vec2_scalar(float a) : x(a), y(a) {} + constexpr vec2_scalar(int a) : x(a), y(a) {} constexpr vec2_scalar(float x, float y) : x(x), y(y) {} float& select(XYZW c) { @@ -390,9 +286,6 @@ struct vec2_scalar { friend vec2_scalar operator*(vec2_scalar a, vec2_scalar b) { return vec2_scalar(a.x * b.x, a.y * b.y); } - friend vec2_scalar operator/(vec2_scalar a, float b) { - return vec2_scalar(a.x / b, a.y / b); - } friend vec2_scalar operator/(vec2_scalar a, vec2_scalar b) { return vec2_scalar(a.x / b.x, a.y / b.y); } @@ -415,12 +308,6 @@ struct vec2_scalar { return *this; } - vec2_scalar operator/=(vec2_scalar a) { - x /= a.x; - y /= a.y; - return *this; - } - vec2_scalar operator+=(vec2_scalar a) { x += a.x; y += a.y; @@ -469,12 +356,12 @@ struct vec2 { typedef float element_type; constexpr vec2() : vec2(Float(0.0f)) {} - IMPLICIT constexpr vec2(Float a) : x(a), y(a) {} + constexpr vec2(Float a) : x(a), y(a) {} vec2(Float x, Float y) : x(x), y(y) {} - IMPLICIT constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {} + constexpr vec2(vec2_scalar s) : x(s.x), y(s.y) {} constexpr vec2(vec2_scalar s0, vec2_scalar s1, vec2_scalar s2, vec2_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}) {} - explicit 
vec2(ivec2 a); + vec2(ivec2 a); Float x; Float y; @@ -583,7 +470,6 @@ vec2 operator*(vec2_scalar a, Float b) { return vec2(a.x * b, a.y * b); } vec2 operator*(Float a, vec2_scalar b) { return vec2(a * b.x, a * b.y); } SI vec2 min(vec2 a, vec2 b) { return vec2(min(a.x, b.x), min(a.y, b.y)); } -SI vec2 min(vec2 a, Float b) { return vec2(min(a.x, b), min(a.y, b)); } SI vec2_scalar min(vec2_scalar a, vec2_scalar b) { return vec2_scalar{min(a.x, b.x), min(a.y, b.y)}; @@ -599,12 +485,8 @@ vec2 step(vec2 edge, vec2 x) { return vec2(step(edge.x, x.x), step(edge.y, x.y)); } -vec2_scalar step(vec2_scalar edge, vec2_scalar x) { - return vec2_scalar(step(edge.x, x.x), step(edge.y, x.y)); -} - -SI vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); } -SI vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); } +vec2 max(vec2 a, vec2 b) { return vec2(max(a.x, b.x), max(a.y, b.y)); } +vec2 max(vec2 a, Float b) { return vec2(max(a.x, b), max(a.y, b)); } SI vec2_scalar max(vec2_scalar a, vec2_scalar b) { return vec2_scalar{max(a.x, b.x), max(a.y, b.y)}; @@ -617,31 +499,9 @@ Float length(vec2 a) { return sqrt(a.x * a.x + a.y * a.y); } float length(vec2_scalar a) { return hypotf(a.x, a.y); } -template <typename A, typename B> -SI auto distance(A a, B b) { - return length(a - b); -} +SI Float distance(vec2 a, vec2 b) { return length(a - b); } -template <typename T> -SI T normalize(T a) { - return a / length(a); -} - -SI vec2 sqrt(vec2 a) { return vec2(sqrt(a.x), sqrt(a.y)); } - -SI vec2_scalar sqrt(vec2_scalar a) { return vec2_scalar(sqrt(a.x), sqrt(a.y)); } - -SI vec2 recip(vec2 a) { return vec2(recip(a.x), recip(a.y)); } - -SI vec2_scalar recip(vec2_scalar a) { - return vec2_scalar(recip(a.x), recip(a.y)); -} - -SI vec2 inversesqrt(vec2 a) { return vec2(inversesqrt(a.x), inversesqrt(a.y)); } - -SI vec2_scalar inversesqrt(vec2_scalar a) { - return vec2_scalar(inversesqrt(a.x), inversesqrt(a.y)); -} +SI vec2 normalize(vec2 a) { return a / length(a); } #define abs __glsl_abs @@ -657,13 +517,6 @@ Float abs(Float v) { #endif } -float sign(float a) { return copysignf(1.0f, a); } - -Float sign(Float v) { - return bit_cast<Float>((bit_cast<I32>(v) & 0x80000000) | - bit_cast<I32>(Float(1.0f))); -} - Float cast(U32 v) { return CONVERT((I32)v, Float); } Float cast(I32 v) { return CONVERT((I32)v, Float); } I32 cast(Float v) { return CONVERT(v, I32); } @@ -725,22 +578,17 @@ SI I32 roundfast(Float v, Float scale) { #endif } -template <typename T> -SI auto round_pixel(T v, float scale = 255.0f) { - return roundfast(v, scale); -} +template <typename T> SI auto round_pixel(T v) { return roundfast(v, 255.0f); } #define round __glsl_round float round(float a) { return roundf(a); } -Float round(Float v) { return floor(v + 0.5f); } - float fract(float a) { return a - floor(a); } -Float fract(Float v) { return v - floor(v); } +Float round(Float v) { return floor(v + 0.5f); } -vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); } +Float fract(Float v) { return v - floor(v); } // X derivatives can be approximated by dFdx(x) = x[1] - x[0]. // Y derivatives are not easily available since we operate in terms of X spans @@ -748,15 +596,11 @@ vec2 fract(vec2 v) { return vec2(fract(v.x), fract(v.y)); } // uniform scaling, and thus abs(dFdx(p.x)) + abs(dFdy(p.x)) = abs(dFdx(p.x)) + // abs(dFdx(p.y)) which mirrors abs(dFdx(p.y)) + abs(dFdy(p.y)) = abs(dFdx(p.y)) // + abs(dFdx(p.x)). 
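To make the approximation described above concrete: with fragments shaded as a 4-wide horizontal span, the X derivative of an interpolant is simply the difference between adjacent lanes, and under the stated axis-aligned, uniformly scaled assumption fwidth(p) = |dFdx(p.x)| + |dFdy(p.x)| collapses to |dFdx(p.x)| + |dFdx(p.y)|. A scalar sketch over a hypothetical float[4] lane array, outside the SIMD Float type used in this file:

#include <cassert>
#include <cmath>

// One interpolated value sampled at the 4 fragments of a horizontal span.
struct Lanes {
  float v[4];
};

// dFdx approximated by the difference between the first two lanes, as in the
// comment above; every lane of a span shares the same derivative estimate.
static float dFdx_approx(const Lanes& x) { return x.v[1] - x.v[0]; }

// fwidth for a 2D interpolant p = (px, py), using the identity
// |dFdx(px)| + |dFdy(px)| == |dFdx(px)| + |dFdx(py)| assumed above.
static float fwidth_approx(const Lanes& px, const Lanes& py) {
  return std::fabs(dFdx_approx(px)) + std::fabs(dFdx_approx(py));
}

int main() {
  // p advances by (0.5, 0.25) per fragment across the span.
  Lanes px = {{1.0f, 1.5f, 2.0f, 2.5f}};
  Lanes py = {{3.0f, 3.25f, 3.5f, 3.75f}};
  assert(std::fabs(fwidth_approx(px, py) - 0.75f) < 1e-6f);
  return 0;
}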
-vec2_scalar fwidth(vec2 p) { +vec2 fwidth(vec2 p) { Float d = abs(SHUFFLE(p.x, p.y, 1, 1, 5, 5) - SHUFFLE(p.x, p.y, 0, 0, 4, 4)); - return vec2_scalar(d.x + d.z); + return vec2(d.xyxy + d.zwzw); } -float dFdx(Float x) { return x.y - x.x; } - -vec2_scalar dFdx(vec2 p) { return vec2_scalar(dFdx(p.x), dFdx(p.y)); } - // See // http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. Float approx_log2(Float x) { @@ -768,7 +612,6 @@ Float approx_log2(Float x) { return e - 124.225514990f - 1.498030302f * m - 1.725879990f / (0.3520887068f + m); } - Float approx_pow2(Float x) { Float f = fract(x); return bit_cast<Float>( @@ -776,41 +619,16 @@ Float approx_pow2(Float x) { 27.728023300f / (4.84252568f - f))); } -#define pow __glsl_pow - -SI float pow(float x, float y) { return powf(x, y); } - +// From skia Float pow(Float x, Float y) { return if_then_else((x == 0) | (x == 1), x, approx_pow2(approx_log2(x) * y)); } -#define exp __glsl_exp - -SI float exp(float x) { return expf(x); } - Float exp(Float y) { - float l2e = 1.4426950408889634074f; - return approx_pow2(l2e * y); + float x = 2.718281828459045235360287471352; + return approx_pow2(log2f(x) * y); } -#define exp2 __glsl_exp2 - -SI float exp2(float x) { return exp2f(x); } - -Float exp2(Float x) { return approx_pow2(x); } - -#define log __glsl_log - -SI float log(float x) { return logf(x); } - -Float log(Float x) { return approx_log2(x) * 0.69314718f; } - -#define log2 __glsl_log2 - -SI float log2(float x) { return log2f(x); } - -Float log2(Float x) { return approx_log2(x); } - struct ivec4; struct ivec2_scalar { @@ -820,7 +638,7 @@ struct ivec2_scalar { int32_t y; ivec2_scalar() : ivec2_scalar(0) {} - IMPLICIT constexpr ivec2_scalar(int32_t a) : x(a), y(a) {} + constexpr ivec2_scalar(int32_t a) : x(a), y(a) {} constexpr ivec2_scalar(int32_t x, int32_t y) : x(x), y(y) {} int32_t& select(XYZW c) { @@ -838,8 +656,6 @@ struct ivec2_scalar { return ivec2_scalar{select(c1), select(c2)}; } - ivec2_scalar operator-() const { return ivec2_scalar{-x, -y}; } - ivec2_scalar& operator+=(ivec2_scalar a) { x += a.x; y += a.y; @@ -864,25 +680,17 @@ struct ivec2_scalar { friend ivec2_scalar operator+(ivec2_scalar a, ivec2_scalar b) { return ivec2_scalar{a.x + b.x, a.y + b.y}; } - - friend ivec2_scalar operator-(ivec2_scalar a, ivec2_scalar b) { - return ivec2_scalar{a.x - b.x, a.y - b.y}; - } - - friend bool operator==(const ivec2_scalar& l, const ivec2_scalar& r) { - return l.x == r.x && l.y == r.y; - } }; struct ivec2 { typedef int32_t element_type; ivec2() : ivec2(I32(0)) {} - IMPLICIT ivec2(I32 a) : x(a), y(a) {} + ivec2(I32 a) : x(a), y(a) {} ivec2(I32 x, I32 y) : x(x), y(y) {} - IMPLICIT ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {} + ivec2(vec2 a) : x(cast(a.x)), y(cast(a.y)) {} ivec2(U32 x, U32 y) : x(CONVERT(x, I32)), y(CONVERT(y, I32)) {} - IMPLICIT constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {} + constexpr ivec2(ivec2_scalar s) : x(s.x), y(s.y) {} constexpr ivec2(ivec2_scalar s0, ivec2_scalar s1, ivec2_scalar s2, ivec2_scalar s3) : x(I32{s0.x, s1.x, s2.x, s3.x}), y(I32{s0.y, s1.y, s2.y, s3.y}) {} @@ -973,7 +781,7 @@ struct ivec3_scalar { int32_t z; ivec3_scalar() : ivec3_scalar(0) {} - IMPLICIT constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {} + constexpr ivec3_scalar(int32_t a) : x(a), y(a), z(a) {} constexpr ivec3_scalar(int32_t x, int32_t y, int32_t z) : x(x), y(y), z(z) {} int32_t& select(XYZW c) { @@ -996,7 +804,7 @@ struct ivec3_scalar { struct ivec3 { ivec3() : ivec3(0) {} - IMPLICIT ivec3(I32 a) : x(a), 
y(a), z(a) {} + ivec3(I32 a) : x(a), y(a), z(a) {} ivec3(I32 x, I32 y, I32 z) : x(x), y(y), z(z) {} ivec3(ivec2 a, I32 b) : x(a.x), y(a.y), z(b) {} ivec3(vec2 a, Float b) : x(cast(a.x)), y(cast(a.y)), z(cast(b)) {} @@ -1047,7 +855,7 @@ struct ivec4_scalar { int32_t w; ivec4_scalar() : ivec4_scalar(0) {} - IMPLICIT constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {} + constexpr ivec4_scalar(int32_t a) : x(a), y(a), z(a), w(a) {} constexpr ivec4_scalar(int32_t x, int32_t y, int32_t z, int32_t w) : x(x), y(y), z(z), w(w) {} @@ -1073,31 +881,16 @@ struct ivec4_scalar { friend ivec4_scalar operator&(int32_t a, ivec4_scalar b) { return ivec4_scalar{a & b.x, a & b.y, a & b.z, a & b.w}; } - - int32_t& operator[](int index) { - switch (index) { - case 0: - return x; - case 1: - return y; - case 2: - return z; - case 3: - return w; - default: - UNREACHABLE; - } - } }; struct ivec4 { typedef int32_t element_type; ivec4() : ivec4(I32(0)) {} - IMPLICIT ivec4(I32 a) : x(a), y(a), z(a), w(a) {} + ivec4(I32 a) : x(a), y(a), z(a), w(a) {} ivec4(I32 x, I32 y, I32 z, I32 w) : x(x), y(y), z(z), w(w) {} ivec4(ivec2 a, I32 b, I32 c) : x(a.x), y(a.y), z(b), w(c) {} - IMPLICIT constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} + constexpr ivec4(ivec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} constexpr ivec4(ivec4_scalar s0, ivec4_scalar s1, ivec4_scalar s2, ivec4_scalar s3) : x(I32{s0.x, s1.x, s2.x, s3.x}), @@ -1190,21 +983,13 @@ struct bvec3_scalar { bool z; bvec3_scalar() : bvec3_scalar(false) {} - IMPLICIT constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {} + constexpr bvec3_scalar(bool a) : x(a), y(a), z(a) {} constexpr bvec3_scalar(bool x, bool y, bool z) : x(x), y(y), z(z) {} }; -struct bvec3_scalar1 { - bool x; - - IMPLICIT constexpr bvec3_scalar1(bool a) : x(a) {} - - operator bvec3_scalar() const { return bvec3_scalar(x); } -}; - struct bvec3 { bvec3() : bvec3(0) {} - IMPLICIT bvec3(Bool a) : x(a), y(a), z(a) {} + bvec3(Bool a) : x(a), y(a), z(a) {} bvec3(Bool x, Bool y, Bool z) : x(x), y(y), z(z) {} Bool& select(XYZW c) { switch (c) { @@ -1225,8 +1010,6 @@ struct bvec3 { Bool z; }; -bvec3_scalar1 make_bvec3(bool n) { return bvec3_scalar1(n); } - struct bvec4_scalar { bool x; bool y; @@ -1234,45 +1017,14 @@ struct bvec4_scalar { bool w; bvec4_scalar() : bvec4_scalar(false) {} - IMPLICIT constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {} + constexpr bvec4_scalar(bool a) : x(a), y(a), z(a), w(a) {} constexpr bvec4_scalar(bool x, bool y, bool z, bool w) : x(x), y(y), z(z), w(w) {} - - bool& select(XYZW c) { - switch (c) { - case X: - return x; - case Y: - return y; - case Z: - return z; - case W: - return w; - default: - UNREACHABLE; - } - } - bool sel(XYZW c1) { return select(c1); } - bvec2_scalar sel(XYZW c1, XYZW c2) { - return bvec2_scalar(select(c1), select(c2)); - } -}; - -bvec4_scalar bvec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return bvec4_scalar{select(c1), select(c2), select(c3), select(c4)}; -} - -struct bvec4_scalar1 { - bool x; - - IMPLICIT constexpr bvec4_scalar1(bool a) : x(a) {} - - operator bvec4_scalar() const { return bvec4_scalar(x); } }; struct bvec4 { bvec4() : bvec4(0) {} - IMPLICIT bvec4(Bool a) : x(a), y(a), z(a), w(a) {} + bvec4(Bool a) : x(a), y(a), z(a), w(a) {} bvec4(Bool x, Bool y, Bool z, Bool w) : x(x), y(y), z(z), w(w) {} bvec4(bvec2 x, bvec2 y) : x(x.x), y(x.y), z(y.x), w(y.y) {} Bool& select(XYZW c) { @@ -1285,8 +1037,6 @@ struct bvec4 { return z; case W: return w; - default: - UNREACHABLE; } } Bool sel(XYZW c1) { 
return select(c1); } @@ -1297,16 +1047,12 @@ struct bvec4 { Bool w; }; -bvec4_scalar1 make_bvec4(bool n) { return bvec4_scalar1(n); } +bvec4_scalar make_bvec4(bool n) { return bvec4_scalar{n, n, n, n}; } bvec4_scalar make_bvec4(bool x, bool y, bool z, bool w) { return bvec4_scalar{x, y, z, w}; } -bvec4_scalar make_bvec4(bvec2_scalar a, bvec2_scalar b) { - return bvec4_scalar{a.x, a.y, b.x, b.y}; -} - template <typename N> bvec4 make_bvec4(const N& n) { return bvec4(n); @@ -1383,7 +1129,7 @@ struct vec3_scalar { float z; constexpr vec3_scalar() : vec3_scalar(0.0f) {} - IMPLICIT constexpr vec3_scalar(float a) : x(a), y(a), z(a) {} + constexpr vec3_scalar(float a) : x(a), y(a), z(a) {} constexpr vec3_scalar(float x, float y, float z) : x(x), y(y), z(z) {} float& select(XYZW c) { @@ -1474,11 +1220,10 @@ struct vec3 { typedef float element_type; constexpr vec3() : vec3(Float(0.0f)) {} - IMPLICIT constexpr vec3(Float a) : x(a), y(a), z(a) {} + constexpr vec3(Float a) : x(a), y(a), z(a) {} constexpr vec3(Float x, Float y, Float z) : x(x), y(y), z(z) {} vec3(vec2 a, Float z) : x(a.x), y(a.y), z(z) {} - explicit vec3(vec4); - IMPLICIT constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {} + constexpr vec3(vec3_scalar s) : x(s.x), y(s.y), z(s.z) {} constexpr vec3(vec3_scalar s0, vec3_scalar s1, vec3_scalar s2, vec3_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}), @@ -1507,8 +1252,6 @@ struct vec3 { return vec3(select(c1), select(c2), select(c3)); } - vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4); - vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); } friend vec3 operator*(vec3 a, Float b) { @@ -1605,26 +1348,13 @@ vec3 step(vec3 edge, vec3 x) { return vec3(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z)); } -vec3_scalar step(vec3_scalar edge, vec3_scalar x) { - return vec3_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z)); -} - SI vec3 min(vec3 a, vec3 b) { return vec3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); } -SI vec3 min(vec3 a, Float b) { - return vec3(min(a.x, b), min(a.y, b), min(a.z, b)); -} -SI vec3_scalar min(vec3_scalar a, vec3_scalar b) { - return vec3_scalar{min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)}; -} - SI vec3 max(vec3 a, vec3 b) { return vec3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); } -SI vec3 max(vec3 a, Float b) { - return vec3(max(a.x, b), max(a.y, b), max(a.z, b)); -} + SI vec3_scalar max(vec3_scalar a, vec3_scalar b) { return vec3_scalar{max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)}; } @@ -1670,15 +1400,11 @@ struct vec4_scalar { float w; constexpr vec4_scalar() : vec4_scalar(0.0f) {} - IMPLICIT constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {} + constexpr vec4_scalar(float a) : x(a), y(a), z(a), w(a) {} constexpr vec4_scalar(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} vec4_scalar(vec3_scalar xyz, float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} - static vec4_scalar load_from_ptr(const float* f) { - return vec4_scalar(f[0], f[1], f[2], f[3]); - } - ALWAYS_INLINE float& select(XYZW c) { switch (c) { case X: @@ -1700,9 +1426,6 @@ struct vec4_scalar { vec3_scalar sel(XYZW c1, XYZW c2, XYZW c3) { return vec3_scalar{select(c1), select(c2), select(c3)}; } - vec4_scalar sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4_scalar{select(c1), select(c2), select(c3), select(c4)}; - } vec2_scalar_ref lsel(XYZW c1, XYZW c2) { return vec2_scalar_ref(select(c1), select(c2)); } @@ -1750,56 +1473,30 @@ struct vec4_scalar { w /= a.w; return *this; } - - vec4_scalar& 
operator*=(vec4_scalar a) { - x *= a.x; - y *= a.y; - z *= a.z; - w *= a.w; - return *this; - } - - friend bool operator==(const vec4_scalar& l, const vec4_scalar& r) { - return l.x == r.x && l.y == r.y && l.z == r.z && l.w == r.w; - } - - friend bool operator!=(const vec4_scalar& l, const vec4_scalar& r) { - return l.x != r.x || l.y != r.y || l.z != r.z || l.w != r.w; - } }; vec4_scalar vec2_scalar::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { return vec4_scalar{select(c1), select(c2), select(c3), select(c4)}; } -struct vec4_ref { - vec4_ref(Float& x, Float& y, Float& z, Float& w) : x(x), y(y), z(z), w(w) {} - Float& x; - Float& y; - Float& z; - Float& w; - - vec4_ref& operator=(const vec4& a); -}; - struct vec4 { typedef struct vec4 vector_type; typedef float element_type; constexpr vec4() : vec4(Float(0.0f)) {} - IMPLICIT constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {} + constexpr vec4(Float a) : x(a), y(a), z(a), w(a) {} vec4(Float x, Float y, Float z, Float w) : x(x), y(y), z(z), w(w) {} vec4(vec3 xyz, Float w) : x(xyz.x), y(xyz.y), z(xyz.z), w(w) {} vec4(vec2 xy, vec2 zw) : x(xy.x), y(xy.y), z(zw.x), w(zw.y) {} vec4(vec2 xy, Float z, Float w) : x(xy.x), y(xy.y), z(z), w(w) {} vec4(Float x, Float y, vec2 zw) : x(x), y(y), z(zw.x), w(zw.y) {} - IMPLICIT constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} + constexpr vec4(vec4_scalar s) : x(s.x), y(s.y), z(s.z), w(s.w) {} constexpr vec4(vec4_scalar s0, vec4_scalar s1, vec4_scalar s2, vec4_scalar s3) : x(Float{s0.x, s1.x, s2.x, s3.x}), y(Float{s0.y, s1.y, s2.y, s3.y}), z(Float{s0.z, s1.z, s2.z, s3.z}), w(Float{s0.w, s1.w, s2.w, s3.w}) {} - ALWAYS_INLINE Float& select(XYZW c) { + Float& select(XYZW c) { switch (c) { case X: return x; @@ -1813,29 +1510,18 @@ struct vec4 { UNREACHABLE; } } - ALWAYS_INLINE Float& sel(XYZW c1) { return select(c1); } + Float& sel(XYZW c1) { return select(c1); } - ALWAYS_INLINE vec2 sel(XYZW c1, XYZW c2) { - return vec2(select(c1), select(c2)); - } + vec2 sel(XYZW c1, XYZW c2) { return vec2(select(c1), select(c2)); } - ALWAYS_INLINE vec3 sel(XYZW c1, XYZW c2, XYZW c3) { + vec3 sel(XYZW c1, XYZW c2, XYZW c3) { return vec3(select(c1), select(c2), select(c3)); } - ALWAYS_INLINE vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) { + vec3_ref lsel(XYZW c1, XYZW c2, XYZW c3) { return vec3_ref(select(c1), select(c2), select(c3)); } - ALWAYS_INLINE vec2_ref lsel(XYZW c1, XYZW c2) { - return vec2_ref(select(c1), select(c2)); - } - - ALWAYS_INLINE vec4 sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4(select(c1), select(c2), select(c3), select(c4)); - } - ALWAYS_INLINE vec4_ref lsel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4_ref(select(c1), select(c2), select(c3), select(c4)); - } + vec2_ref lsel(XYZW c1, XYZW c2) { return vec2_ref(select(c1), select(c2)); } Float& operator[](int index) { switch (index) { @@ -1957,13 +1643,6 @@ struct vec4 { w /= a.w; return *this; } - vec4& operator*=(vec4 a) { - x *= a.x; - y *= a.y; - z *= a.z; - w *= a.w; - return *this; - } vec4& operator*=(Float a) { x *= a; y *= a; @@ -1978,18 +1657,6 @@ struct vec4 { Float w; }; -inline vec4_ref& vec4_ref::operator=(const vec4& a) { - x = a.x; - y = a.y; - z = a.z; - w = a.w; - return *this; -} - -inline vec4 vec3::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { - return vec4(select(c1), select(c2), select(c3), select(c4)); -} - vec4_scalar force_scalar(const vec4& v) { return vec4_scalar{force_scalar(v.x), force_scalar(v.y), force_scalar(v.z), force_scalar(v.w)}; @@ -2017,10 +1684,6 @@ vec4_scalar make_vec4(float x, float y, const 
vec2_scalar& v) { return vec4_scalar{x, y, v.x, v.y}; } -ivec4_scalar make_ivec4(const vec4_scalar& v) { - return ivec4_scalar{int32_t(v.x), int32_t(v.y), int32_t(v.z), int32_t(v.w)}; -} - template <typename N> vec4 make_vec4(const N& n) { return vec4(n); @@ -2041,8 +1704,6 @@ vec4 make_vec4(const X& x, const Y& y, const Z& z, const W& w) { return vec4(x, y, z, w); } -ALWAYS_INLINE vec3::vec3(vec4 v) : x(v.x), y(v.y), z(v.z) {} - SI ivec4 roundfast(vec4 v, Float scale) { return ivec4(roundfast(v.x, scale), roundfast(v.y, scale), roundfast(v.z, scale), roundfast(v.w, scale)); @@ -2059,14 +1720,6 @@ SI vec4 if_then_else(I32 c, vec4 t, vec4 e) { SI vec4 if_then_else(int32_t c, vec4 t, vec4 e) { return c ? t : e; } -SI vec4_scalar if_then_else(int32_t c, vec4_scalar t, vec4_scalar e) { - return c ? t : e; -} - -SI vec2 clamp(vec2 a, Float minVal, Float maxVal) { - return vec2(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)); -} - SI vec2 clamp(vec2 a, vec2 minVal, vec2 maxVal) { return vec2(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y)); } @@ -2076,56 +1729,20 @@ SI vec2_scalar clamp(vec2_scalar a, vec2_scalar minVal, vec2_scalar maxVal) { clamp(a.y, minVal.y, maxVal.y)}; } -SI vec2_scalar clamp(vec2_scalar a, float minVal, float maxVal) { - return vec2_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal)}; -} - SI I32 clamp(I32 a, I32 minVal, I32 maxVal) { a = if_then_else(a < minVal, minVal, a); return if_then_else(a > maxVal, maxVal, a); } -SI vec3 clamp(vec3 a, Float minVal, Float maxVal) { - return vec3(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), - clamp(a.z, minVal, maxVal)); -} - SI vec3 clamp(vec3 a, vec3 minVal, vec3 maxVal) { return vec3(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), clamp(a.z, minVal.z, maxVal.z)); } -SI vec4 clamp(vec4 a, Float minVal, Float maxVal) { - return vec4(clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), - clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)); -} - SI vec4 clamp(vec4 a, vec4 minVal, vec4 maxVal) { return vec4(clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)); } - -SI vec4_scalar clamp(vec4_scalar a, vec4_scalar minVal, vec4_scalar maxVal) { - return vec4_scalar{ - clamp(a.x, minVal.x, maxVal.x), clamp(a.y, minVal.y, maxVal.y), - clamp(a.z, minVal.z, maxVal.z), clamp(a.w, minVal.w, maxVal.w)}; -} - -SI vec4_scalar clamp(vec4_scalar a, float minVal, float maxVal) { - return vec4_scalar{clamp(a.x, minVal, maxVal), clamp(a.y, minVal, maxVal), - clamp(a.z, minVal, maxVal), clamp(a.w, minVal, maxVal)}; -} - -vec4 step(vec4 edge, vec4 x) { - return vec4(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z), - step(edge.w, x.w)); -} - -vec4_scalar step(vec4_scalar edge, vec4_scalar x) { - return vec4_scalar(step(edge.x, x.x), step(edge.y, x.y), step(edge.z, x.z), - step(edge.w, x.w)); -} - template <typename T> auto lessThanEqual(T x, T y) -> decltype(x <= y) { return x <= y; @@ -2163,20 +1780,6 @@ SI bvec2 lessThan(vec2 x, vec2 y) { return bvec2(lessThan(x.x, y.x), lessThan(x.y, y.y)); } -SI bvec2_scalar lessThan(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(lessThan(x.x, y.x), lessThan(x.y, y.y)); -} - -SI bvec4 lessThan(vec4 x, vec4 y) { - return bvec4(lessThan(x.x, y.x), lessThan(x.y, y.y), lessThan(x.z, y.z), - lessThan(x.w, y.w)); -} - -SI bvec4_scalar lessThan(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar{lessThan(x.x, y.x), lessThan(x.y, y.y), - lessThan(x.z, y.z), 
lessThan(x.w, y.w)}; -} - template <typename T> auto greaterThan(T x, T y) -> decltype(x > y) { return x > y; @@ -2186,20 +1789,6 @@ bvec2 greaterThan(vec2 x, vec2 y) { return bvec2(greaterThan(x.x, y.x), greaterThan(x.y, y.y)); } -bvec2_scalar greaterThan(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(greaterThan(x.x, y.x), greaterThan(x.y, y.y)); -} - -SI bvec4 greaterThan(vec4 x, vec4 y) { - return bvec4(greaterThan(x.x, y.x), greaterThan(x.y, y.y), - greaterThan(x.z, y.z), greaterThan(x.w, y.w)); -} - -SI bvec4_scalar greaterThan(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar{greaterThan(x.x, y.x), greaterThan(x.y, y.y), - greaterThan(x.z, y.z), greaterThan(x.w, y.w)}; -} - template <typename T> auto greaterThanEqual(T x, T y) -> decltype(x >= y) { return x >= y; @@ -2210,29 +1799,51 @@ bvec4 greaterThanEqual(vec4 x, vec4 y) { greaterThanEqual(x.z, y.z), greaterThanEqual(x.w, y.w)); } -template <typename T> -auto equal(T x, T y) -> decltype(x > y) { - return x == y; -} +enum TextureFormat { RGBA32F, RGBA32I, RGBA8, R8 }; -bvec2 equal(vec2 x, vec2 y) { return bvec2(equal(x.x, y.x), equal(x.y, y.y)); } +enum TextureFilter { NEAREST, LINEAR }; -bvec2_scalar equal(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(equal(x.x, y.x), equal(x.y, y.y)); -} +struct samplerCommon { + uint32_t* buf = nullptr; + uint32_t stride = 0; // in dwords + uint32_t height = 0; + uint32_t width = 0; + TextureFormat format = TextureFormat::RGBA8; +}; -template <typename T> -auto notEqual(T x, T y) -> decltype(x > y) { - return x != y; -} +struct samplerDepth { + int depth = 0; + uint32_t height_stride = 0; // in dwords +}; -bvec2 notEqual(vec2 x, vec2 y) { - return bvec2(notEqual(x.x, y.x), notEqual(x.y, y.y)); -} +struct samplerFilter { + TextureFilter filter = TextureFilter::NEAREST; +}; -bvec2_scalar notEqual(vec2_scalar x, vec2_scalar y) { - return bvec2_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y)); -} +struct sampler2DArray_impl : samplerCommon, samplerDepth, samplerFilter {}; +typedef sampler2DArray_impl* sampler2DArray; + +typedef struct sampler2DArrayR8_impl : sampler2DArray_impl{} * sampler2DArrayR8; +typedef struct sampler2DArrayRGBA8_impl : sampler2DArray_impl{} * + sampler2DArrayRGBA8; +typedef struct sampler2DArrayRGBA32F_impl : sampler2DArray_impl{} * + sampler2DArrayRGBA32F; + +struct sampler2D_impl : samplerCommon, samplerFilter {}; +typedef sampler2D_impl* sampler2D; + +typedef struct sampler2DR8_impl : sampler2D_impl{} * sampler2DR8; +typedef struct sampler2DRGBA8_impl : sampler2D_impl{} * sampler2DRGBA8; +typedef struct sampler2DRGBA32F_impl : sampler2D_impl{} * sampler2DRGBA32F; + +struct isampler2D_impl : samplerCommon {}; +typedef isampler2D_impl* isampler2D; + +struct isampler2DRGBA32I_impl : isampler2D_impl {}; +typedef isampler2DRGBA32I_impl* isampler2DRGBA32I; + +struct sampler2DRect_impl : samplerCommon, samplerFilter {}; +typedef sampler2DRect_impl* sampler2DRect; struct mat4_scalar; @@ -2240,7 +1851,7 @@ struct mat2_scalar { vec2_scalar data[2]; mat2_scalar() = default; - IMPLICIT constexpr mat2_scalar(float a) { + constexpr mat2_scalar(float a) { data[0] = vec2_scalar(a); data[1] = vec2_scalar(a); } @@ -2248,7 +1859,7 @@ struct mat2_scalar { data[0] = a; data[1] = b; } - IMPLICIT mat2_scalar(const mat4_scalar& mat); + mat2_scalar(const mat4_scalar& mat); vec2_scalar& operator[](int index) { return data[index]; } const vec2_scalar& operator[](int index) const { return data[index]; } @@ -2286,7 +1897,7 @@ struct mat2 { const vec2& operator[](int index) const { 
return data[index]; } mat2() = default; - IMPLICIT mat2(Float a) { + mat2(Float a) { data[0] = vec2(a); data[1] = vec2(a); } @@ -2295,8 +1906,8 @@ struct mat2 { data[0] = a; data[1] = b; } - IMPLICIT mat2(const mat4& mat); - IMPLICIT constexpr mat2(mat2_scalar s) { + mat2(const mat4& mat); + constexpr mat2(mat2_scalar s) { data[0] = vec2(s.data[0]); data[1] = vec2(s.data[1]); } @@ -2350,7 +1961,7 @@ struct mat3_scalar { data[1] = b; data[2] = c; } - IMPLICIT mat3_scalar(const mat4_scalar& mat); + mat3_scalar(const mat4_scalar& mat); vec3_scalar& operator[](int index) { return data[index]; } const vec3_scalar& operator[](int index) const { return data[index]; } @@ -2384,7 +1995,7 @@ struct mat3 { data[2] = c; } - IMPLICIT constexpr mat3(mat3_scalar s) { + constexpr mat3(mat3_scalar s) { data[0] = vec3(s.data[0]); data[1] = vec3(s.data[1]); data[2] = vec3(s.data[2]); @@ -2403,7 +2014,7 @@ struct mat3 { data[2] = vec3(d7, d8, d9); } - IMPLICIT mat3(const mat4& mat); + mat3(const mat4& mat); friend vec3 operator*(mat3 m, vec3 v) { vec3 u; @@ -2490,7 +2101,7 @@ struct mat4 { vec4 data[4]; mat4() = default; - IMPLICIT constexpr mat4(mat4_scalar s) { + constexpr mat4(mat4_scalar s) { data[0] = vec4(s.data[0]); data[1] = vec4(s.data[1]); data[2] = vec4(s.data[2]); @@ -2522,15 +2133,15 @@ mat3::mat3(const mat4& mat) vec3(mat[1].x, mat[1].y, mat[1].z), vec3(mat[2].x, mat[2].y, mat[2].z)) {} -IMPLICIT mat3_scalar::mat3_scalar(const mat4_scalar& mat) +mat3_scalar::mat3_scalar(const mat4_scalar& mat) : mat3_scalar(vec3_scalar(mat[0].x, mat[0].y, mat[0].z), vec3_scalar(mat[1].x, mat[1].y, mat[1].z), vec3_scalar(mat[2].x, mat[2].y, mat[2].z)) {} -IMPLICIT mat2::mat2(const mat4& mat) +mat2::mat2(const mat4& mat) : mat2(vec2(mat[0].x, mat[0].y), vec2(mat[1].x, mat[1].y)) {} -IMPLICIT mat2_scalar::mat2_scalar(const mat4_scalar& mat) +mat2_scalar::mat2_scalar(const mat4_scalar& mat) : mat2_scalar(vec2_scalar(mat[0].x, mat[0].y), vec2_scalar(mat[1].x, mat[1].y)) {} @@ -2584,6 +2195,256 @@ SI mat4 if_then_else(I32 c, mat4 t, mat4 e) { SI mat4 if_then_else(int32_t c, mat4 t, mat4 e) { return c ? 
t : e; } +SI I32 clampCoord(I32 coord, int limit) { +#if USE_SSE2 + return _mm_min_epi16(_mm_max_epi16(coord, _mm_setzero_si128()), + _mm_set1_epi32(limit - 1)); +#else + return clamp(coord, 0, limit - 1); +#endif +} +SI int clampCoord(int coord, int limit) { + return min(max(coord, 0), limit - 1); +} +template <typename T, typename S> +SI T clamp2D(T P, S sampler) { + return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)}; +} +template <typename T> +SI T clamp2DArray(T P, sampler2DArray sampler) { + return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height), + clampCoord(P.z, sampler->depth)}; +} + +float to_float(uint32_t x) { return x * (1.f / 255.f); } + +vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + U32 pixels = {a, b, c, d}; + return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF), + cast(pixels & 0xFF), cast(pixels >> 24)) * + (1.0f / 255.0f); +} + +vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) { + return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y}, + Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w}); +} + +ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) { + return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y}, + I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w}); +} + +vec4_scalar pixel_to_vec4(uint32_t p) { + U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24}; + Float f = cast(i) * (1.0f / 255.0f); + return vec4_scalar(f.x, f.y, f.z, f.w); +} + +template <typename S> +SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) { + return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y], + sampler->buf[offset.z], sampler->buf[offset.w]); +} + +vec4 texelFetchRGBA8(sampler2D sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +vec4 texelFetchRGBA8(sampler2DArray sampler, ivec3 P) { + assert(test_all(P.z == P.z.x)); + I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +template <typename S> +SI Float fetchOffsetsR8(S sampler, I32 offset) { + U32 i = { + ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y], + ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]}; + return cast(i) * (1.0f / 255.0f); +} + +vec4 texelFetchR8(sampler2D sampler, ivec2 P) { + I32 offset = P.x + P.y * sampler->stride; + return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); +} + +vec4 texelFetchR8(sampler2DArray sampler, ivec3 P) { + assert(test_all(P.z == P.z.x)); + I32 offset = P.x + P.y * sampler->stride + P.z.x * sampler->height_stride; + return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); +} + +template <typename S> +SI vec4 fetchOffsetsFloat(S sampler, I32 offset) { + return pixel_float_to_vec4( + *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y], + *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]); +} + +vec4 texelFetchFloat(sampler2D sampler, ivec2 P) { + I32 offset = P.x * 4 + P.y * sampler->stride; + return fetchOffsetsFloat(sampler, offset); +} + +SI vec4 texelFetchFloat(sampler2DArray sampler, ivec3 P) { + assert(test_all(P.z == P.z.x)); + I32 offset = P.x * 4 + P.y * sampler->stride + P.z.x * sampler->height_stride; + return fetchOffsetsFloat(sampler, offset); +} + +vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return 
texelFetchFloat(sampler, P); + } else if (sampler->format == TextureFormat::RGBA8) { + return texelFetchRGBA8(sampler, P); + } else { + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); + } +} + +vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return texelFetchFloat(sampler, P); +} + +vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); +} + +vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); +} + +vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; + } else { + assert(sampler->format == TextureFormat::RGBA8); + return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); + } +} + +vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); +} + +vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::R8); + return vec4_scalar{ + to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f, + 0.0f, 0.0f}; +} + +vec4 texelFetch(sampler2DRect sampler, ivec2 P) { + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + I32 offset = P.x + P.y * sampler->stride; + return fetchOffsetsRGBA8(sampler, offset); +} + +SI vec4 texelFetch(sampler2DArray sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + if (sampler->format == TextureFormat::RGBA32F) { + return texelFetchFloat(sampler, P); + } else if (sampler->format == TextureFormat::R8) { + return texelFetchR8(sampler, P); + } else { + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); + } +} + +vec4 texelFetch(sampler2DArrayRGBA32F sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + assert(sampler->format == TextureFormat::RGBA32F); + return texelFetchFloat(sampler, P); +} + +vec4 texelFetch(sampler2DArrayRGBA8 sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + assert(sampler->format == TextureFormat::RGBA8); + return texelFetchRGBA8(sampler, P); +} + +vec4 texelFetch(sampler2DArrayR8 sampler, ivec3 P, int lod) { + assert(lod == 0); + P = clamp2DArray(P, sampler); + assert(sampler->format == TextureFormat::R8); + return texelFetchR8(sampler, P); +} + +template <typename S> +SI ivec4 fetchOffsetsInt(S sampler, I32 offset) { + return pixel_int_to_ivec4( + *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y], + *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]); +} + +ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) { + assert(lod 
== 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32I); + I32 offset = P.x * 4 + P.y * sampler->stride; + return fetchOffsetsInt(sampler, offset); +} + +ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) { + assert(lod == 0); + P = clamp2D(P, sampler); + assert(sampler->format == TextureFormat::RGBA32I); + return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x, + int max_x, int min_y, int max_y) { + P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); + P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); + assert(sampler->format == TextureFormat::RGBA32F); + return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x, + int max_x, int min_y, int max_y) { + P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); + P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); + assert(sampler->format == TextureFormat::RGBA32I); + return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; +} + +#define texelFetchOffset(sampler, P, lod, offset) \ + texelFetch(sampler, (P) + (offset), lod) + template <typename T, typename U, typename A, typename R = typename T::vector_type> SI R mix(T x, U y, A a) { @@ -2598,19 +2459,416 @@ SI T mix(T x, T y, float a) { } template <typename T> -SI T mix(T x, T y, vec2_scalar a) { - return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y)}; +SI T mix(T x, T y, vec4_scalar a) { + return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z), + mix(x.w, y.w, a.w)}; } +// Scale texture coords for quantization, subtract offset for filtering +// (assuming coords already offset to texel centers), and round to nearest +// 1/scale increment template <typename T> -SI T mix(T x, T y, vec3_scalar a) { - return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z)}; +SI T linearQuantize(T P, float scale) { + return P * scale + (0.5f - 0.5f * scale); } -template <typename T> -SI T mix(T x, T y, vec4_scalar a) { - return T{mix(x.x, y.x, a.x), mix(x.y, y.y, a.y), mix(x.z, y.z, a.z), - mix(x.w, y.w, a.w)}; +// Helper version that also scales normalized texture coords for sampler +template <typename T, typename S> +SI T linearQuantize(T P, float scale, S sampler) { + P.x *= sampler->width; + P.y *= sampler->height; + return linearQuantize(P, scale); +} + +template <typename S> +vec4 textureLinearRGBA8(S sampler, vec2 P, int32_t zoffset = 0) { + assert(sampler->format == TextureFormat::RGBA8); + +#if USE_SSE2 + ivec2 i(linearQuantize(P, 256, sampler)); + ivec2 frac = i & (I32)0xFF; + i >>= 8; + + // Pack coords so they get clamped into range, and also for later bounding + // of fractional coords. Store Y as low-bits for easier access, X as high. + __m128i yx = _mm_packs_epi32(i.y, i.x); + __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1), + _mm_set1_epi32(sampler->width - 1)); + // Clamp coords to valid range to prevent sampling outside texture. + __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw); + // Multiply clamped Y by stride and add X offset. 
+ __m128i row0 = _mm_madd_epi16( + _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()), + _mm_set1_epi16(sampler->stride)); + row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128())); + // Add in layer offset if available + row0 = _mm_add_epi32(row0, _mm_set1_epi32(zoffset)); + + // Check if fractional coords are all zero, in which case skip filtering. + __m128i fracyx = _mm_packs_epi32(frac.y, frac.x); + if (!_mm_movemask_epi8(_mm_cmpgt_epi16(fracyx, _mm_setzero_si128()))) { + return fetchOffsetsRGBA8(sampler, row0); + } + + // Check if coords were clamped at all above. If so, need to adjust fractions + // to avoid sampling outside the texture on the edges. + __m128i yxinside = _mm_andnot_si128( + _mm_cmplt_epi16(yx, _mm_setzero_si128()), + _mm_cmplt_epi16(yx, hw)); + // Set fraction to zero when outside. + fracyx = _mm_and_si128(fracyx, yxinside); + // Store two side-by-side copies of X fraction, as below each pixel value + // will be interleaved to be next to the pixel value for the next row. + __m128i fracx = _mm_unpackhi_epi16(fracyx, fracyx); + // For Y fraction, we need to store 1-fraction before each fraction, as a + // madd will be used to weight and collapse all results as last step. + __m128i fracy = _mm_unpacklo_epi16( + _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx); + + // Ensure we don't sample row off end of texture from added stride. + __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride)); + + // Load two adjacent pixels on each row and interleave them. + // r0,g0,b0,a0,r1,g1,b1,a1 \/ R0,G0,B0,A0,R1,G1,B1,A1 + // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1 +# define LOAD_LANE(out, idx) \ + { \ + uint32_t* buf = &sampler->buf[_mm_cvtsi128_si32( \ + _mm_shuffle_epi32(row0, _MM_SHUFFLE(idx, idx, idx, idx)))]; \ + out = _mm_unpacklo_epi8( \ + _mm_loadl_epi64((__m128i*)buf), \ + _mm_loadl_epi64((__m128i*)(buf + _mm_extract_epi16(row1, idx)))); \ + } + __m128i x, y, z, w; + LOAD_LANE(x, 0) + LOAD_LANE(y, 1) + LOAD_LANE(z, 2) + LOAD_LANE(w, 3) +# undef LOAD_LANE + + // Need to transpose the data from AoS to SoA format. Best to do this here + // while the data is still packed into 8-bit components, requiring fewer + // insns. + // r0,R0,g0,G0,b0,B0,a0,A0,r1,R1,g1,G1,b1,B1,a1,A1 \/ + // r2,R2,g2,G2,b2,B2,a2,A2,r3,R3,g3,G3,b3,B3,a3,A3 + // ... r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2 + // ... r1,R1,r3,R3,g1,G1,g3,G3,b1,B1,b3,B3,a1,A1,a3,A3 + __m128i xy0 = _mm_unpacklo_epi16(x, y); + __m128i xy1 = _mm_unpackhi_epi16(x, y); + __m128i zw0 = _mm_unpacklo_epi16(z, w); + __m128i zw1 = _mm_unpackhi_epi16(z, w); + // r0,R0,r2,R2,g0,G0,g2,G2,b0,B0,b2,B2,a0,A0,a2,A2 \/ + // r4,R4,r6,R6,g4,G4,g6,G6,b4,B4,b6,B6,a4,A4,a6,A6 + // ... r0,R0,r2,R2,r4,R4,r6,R6,g0,G0,g2,G2,g4,G4,g6,G6 + // ... b0,B0,b2,B2,b4,B4,b6,B6,a0,A0,a2,A2,a4,A4,a6,A6 + __m128i rg0 = _mm_unpacklo_epi32(xy0, zw0); + __m128i ba0 = _mm_unpackhi_epi32(xy0, zw0); + __m128i rg1 = _mm_unpacklo_epi32(xy1, zw1); + __m128i ba1 = _mm_unpackhi_epi32(xy1, zw1); + + // Expand packed SoA pixels for each column. Multiply then add columns with + // 8-bit precision so we don't carry to high byte of word accidentally. Use + // final madd insn to blend interleaved rows and expand result to 32 bits. 
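The packed filtering above is easier to follow against a plain scalar equivalent. The sketch below is illustrative only and not part of the patch: sample_r8_linear and clampi are invented names, it handles a single 8-bit channel rather than packed RGBA8, and it omits the edge-fraction zeroing the real code performs. It mirrors the SSE2 scheme of quantizing coordinates to a texel index plus a 1/256 fraction, lerping across X at 8-bit precision, then weighting the two rows by the Y fraction so the result spans 0..0xFF00 and is normalized by 1/0xFF00.

static int clampi(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); }

static float sample_r8_linear(const unsigned char* buf, int stride, int width,
                              int height, float u, float v) {
  // Quantize: scale by 256 and offset by (0.5 - 0.5 * 256) so a coordinate at
  // a texel center lands exactly on that texel with a zero fraction.
  int iu = (int)(u * width * 256.0f + (0.5f - 0.5f * 256.0f));
  int iv = (int)(v * height * 256.0f + (0.5f - 0.5f * 256.0f));
  int fx = iu & 0xFF, fy = iv & 0xFF;
  int x0 = clampi(iu >> 8, 0, width - 1), x1 = clampi(x0 + 1, 0, width - 1);
  int y0 = clampi(iv >> 8, 0, height - 1), y1 = clampi(y0 + 1, 0, height - 1);
  // Lerp across X on both rows, keeping 8-bit precision.
  int r0 = buf[y0 * stride + x0];
  r0 += ((buf[y0 * stride + x1] - r0) * fx) >> 8;
  int r1 = buf[y1 * stride + x0];
  r1 += ((buf[y1 * stride + x1] - r1) * fx) >> 8;
  // Weight the rows by the Y fraction; the result spans 0..0xFF00.
  int c = r0 * (256 - fy) + r1 * fy;
  return c * (1.0f / 0xFF00);
}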
+# define FILTER_COMPONENT(out, unpack, src0, src1) \ + { \ + __m128i cc0 = unpack(src0, _mm_setzero_si128()); \ + __m128i cc1 = unpack(src1, _mm_setzero_si128()); \ + cc0 = _mm_add_epi8( \ + cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracx), \ + 8)); \ + out = _mm_cvtepi32_ps(_mm_madd_epi16(cc0, fracy)); \ + } + __m128 fr, fg, fb, fa; + FILTER_COMPONENT(fr, _mm_unpacklo_epi8, rg0, rg1); + FILTER_COMPONENT(fg, _mm_unpackhi_epi8, rg0, rg1); + FILTER_COMPONENT(fb, _mm_unpacklo_epi8, ba0, ba1); + FILTER_COMPONENT(fa, _mm_unpackhi_epi8, ba0, ba1); +# undef FILTER_COMPONENT + + return vec4(fb, fg, fr, fa) * (1.0f / 0xFF00); +#else + ivec2 i(linearQuantize(P, 128, sampler)); + ivec2 frac = i & (I32)0x7F; + i >>= 7; + + I32 row0 = clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset; + I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); + I16 fracx = + CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16); + I16 fracy = CONVERT(frac.y, I16); + + auto a0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.x]), V8<int16_t>); + auto a1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.x]), V8<int16_t>); + a0 += ((a1 - a0) * fracy.x) >> 7; + + auto b0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.y]), V8<int16_t>); + auto b1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.y]), V8<int16_t>); + b0 += ((b1 - b0) * fracy.y) >> 7; + + auto abl = zipLow(a0, b0); + auto abh = zipHigh(a0, b0); + abl += ((abh - abl) * fracx.xyxyxyxy) >> 7; + + auto c0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.z]), V8<int16_t>); + auto c1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.z]), V8<int16_t>); + c0 += ((c1 - c0) * fracy.z) >> 7; + + auto d0 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row0.w]), V8<int16_t>); + auto d1 = + CONVERT(unaligned_load<V8<uint8_t> >(&sampler->buf[row1.w]), V8<int16_t>); + d0 += ((d1 - d0) * fracy.w) >> 7; + + auto cdl = zipLow(c0, d0); + auto cdh = zipHigh(c0, d0); + cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7; + + auto rg = CONVERT(V8<uint16_t>(zip2Low(abl, cdl)), V8<float>); + auto ba = CONVERT(V8<uint16_t>(zip2High(abl, cdl)), V8<float>); + + auto r = lowHalf(rg); + auto g = highHalf(rg); + auto b = lowHalf(ba); + auto a = highHalf(ba); + return vec4(b, g, r, a) * (1.0f / 255.0f); +#endif +} + +template <typename S> +static U16 textureLinearPackedR8(S sampler, ivec2 i, int32_t zoffset) { + assert(sampler->format == TextureFormat::R8); + ivec2 frac = i & (I32)0x7F; + i >>= 7; + + I32 row0 = clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset; + I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); + I16 fracx = + CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16); + I16 fracy = CONVERT(frac.y, I16); + + uint8_t* buf = (uint8_t*)sampler->buf; + auto a0 = unaligned_load<V2<uint8_t> >(&buf[row0.x]); + auto b0 = unaligned_load<V2<uint8_t> >(&buf[row0.y]); + auto c0 = unaligned_load<V2<uint8_t> >(&buf[row0.z]); + auto d0 = unaligned_load<V2<uint8_t> >(&buf[row0.w]); + auto abcd0 = CONVERT(combine(combine(a0, b0), combine(c0, d0)), V8<int16_t>); + + auto a1 = unaligned_load<V2<uint8_t> >(&buf[row1.x]); + auto b1 = unaligned_load<V2<uint8_t> >(&buf[row1.y]); + auto c1 = unaligned_load<V2<uint8_t> >(&buf[row1.z]); + auto d1 = unaligned_load<V2<uint8_t> >(&buf[row1.w]); + auto abcd1 = 
CONVERT(combine(combine(a1, b1), combine(c1, d1)), V8<int16_t>); + + abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7; + + abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); + auto abcdl = lowHalf(abcd0); + auto abcdh = highHalf(abcd0); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + return U16(abcdl); +} + +template <typename S> +vec4 textureLinearR8(S sampler, vec2 P, int32_t zoffset = 0) { + assert(sampler->format == TextureFormat::R8); + +#if USE_SSE2 + ivec2 i(linearQuantize(P, 256, sampler)); + ivec2 frac = i & (I32)0xFF; + i >>= 8; + + // Pack coords so they get clamped into range, and also for later bounding + // of fractional coords. Store Y as low-bits for easier access, X as high. + __m128i yx = _mm_packs_epi32(i.y, i.x); + __m128i hw = _mm_packs_epi32(_mm_set1_epi32(sampler->height - 1), + _mm_set1_epi32(sampler->width - 1)); + // Clamp coords to valid range to prevent sampling outside texture. + __m128i clampyx = _mm_min_epi16(_mm_max_epi16(yx, _mm_setzero_si128()), hw); + // Multiply clamped Y by stride and add X offset. + __m128i row0 = _mm_madd_epi16( + _mm_unpacklo_epi16(clampyx, _mm_setzero_si128()), + _mm_set1_epi16(sampler->stride)); + row0 = _mm_add_epi32(row0, _mm_unpackhi_epi16(clampyx, _mm_setzero_si128())); + // Add in layer offset if available + row0 = _mm_add_epi32(row0, _mm_set1_epi32(zoffset)); + + __m128i fracyx = _mm_packs_epi32(frac.y, frac.x); + + // Check if coords were clamped at all above. If so, need to adjust fractions + // to avoid sampling outside the texture on the edges. + __m128i yxinside = _mm_andnot_si128( + _mm_cmplt_epi16(yx, _mm_setzero_si128()), + _mm_cmplt_epi16(yx, hw)); + // Set fraction to zero when outside. + fracyx = _mm_and_si128(fracyx, yxinside); + // For X fraction, we need to store 1-fraction before each fraction, as a + // madd will be used to weight and collapse all results as last step. + __m128i fracx = _mm_unpackhi_epi16( + _mm_sub_epi16(_mm_set1_epi16(256), fracyx), fracyx); + // Store two side-by-side copies of Y fraction, as below each pixel value + // will be interleaved to be next to the pixel value for the next column. + __m128i fracy = _mm_unpacklo_epi16(fracyx, fracyx); + + // Ensure we don't sample row off end of texture from added stride. + __m128i row1 = _mm_and_si128(yxinside, _mm_set1_epi16(sampler->stride)); + + // Calculate pointers for first row in each lane + uint8_t* buf = (uint8_t*)sampler->buf; + uint8_t* buf0 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(0, 0, 0, 0))); + uint8_t* buf1 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(1, 1, 1, 1))); + uint8_t* buf2 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(2, 2, 2, 2))); + uint8_t* buf3 = + buf + _mm_cvtsi128_si32(_mm_shuffle_epi32(row0, _MM_SHUFFLE(3, 3, 3, 3))); + // Load adjacent columns from first row, pack into register, then expand. + __m128i cc0 = _mm_unpacklo_epi8( + _mm_setr_epi16(*(uint16_t*)buf0, *(uint16_t*)buf1, *(uint16_t*)buf2, + *(uint16_t*)buf3, 0, 0, 0, 0), + _mm_setzero_si128()); + // Load adjacent columns from next row, pack into register, then expand. + __m128i cc1 = _mm_unpacklo_epi8( + _mm_setr_epi16(*(uint16_t*)(buf0 + _mm_extract_epi16(row1, 0)), + *(uint16_t*)(buf1 + _mm_extract_epi16(row1, 1)), + *(uint16_t*)(buf2 + _mm_extract_epi16(row1, 2)), + *(uint16_t*)(buf3 + _mm_extract_epi16(row1, 3)), + 0, 0, 0, 0), + _mm_setzero_si128()); + // Multiply then add rows with 8-bit precision so we don't carry to high byte + // of word accidentally. 
Use final madd insn to blend interleaved columns and + // expand result to 32 bits. + __m128i cc = _mm_add_epi8( + cc0, _mm_srli_epi16(_mm_mullo_epi16(_mm_sub_epi16(cc1, cc0), fracy), 8)); + __m128 r = _mm_cvtepi32_ps(_mm_madd_epi16(cc, fracx)); + return vec4((Float)r * (1.0f / 0xFF00), 0.0f, 0.0f, 1.0f); +#else + ivec2 i(linearQuantize(P, 128, sampler)); + Float r = CONVERT(textureLinearPackedR8(sampler, i, zoffset), Float); + return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f); +#endif +} + +template <typename S> +vec4 textureLinearRGBA32F(S sampler, vec2 P, int32_t zoffset = 0) { + assert(sampler->format == TextureFormat::RGBA32F); + P.x *= sampler->width; + P.y *= sampler->height; + P -= 0.5f; + vec2 f = floor(P); + vec2 r = P - f; + ivec2 i(f); + ivec2 c = clamp2D(i, sampler); + r.x = if_then_else(i.x >= 0 && i.x < sampler->width - 1, r.x, 0.0f); + I32 offset0 = c.x * 4 + c.y * sampler->stride + zoffset; + I32 offset1 = offset0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); + + Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x], + *(Float*)&sampler->buf[offset0.x + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.x], + *(Float*)&sampler->buf[offset1.x + 4], r.x), + r.y); + Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y], + *(Float*)&sampler->buf[offset0.y + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.y], + *(Float*)&sampler->buf[offset1.y + 4], r.x), + r.y); + Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z], + *(Float*)&sampler->buf[offset0.z + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.z], + *(Float*)&sampler->buf[offset1.z + 4], r.x), + r.y); + Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w], + *(Float*)&sampler->buf[offset0.w + 4], r.x), + mix(*(Float*)&sampler->buf[offset1.w], + *(Float*)&sampler->buf[offset1.w + 4], r.x), + r.y); + return pixel_float_to_vec4(c0, c1, c2, c3); +} + +SI vec4 texture(sampler2D sampler, vec2 P) { + if (sampler->filter == TextureFilter::LINEAR) { + if (sampler->format == TextureFormat::RGBA8) { + return textureLinearRGBA8(sampler, P); + } else if (sampler->format == TextureFormat::R8) { + return textureLinearR8(sampler, P); + } else { + assert(sampler->format == TextureFormat::RGBA32F); + return textureLinearRGBA32F(sampler, P); + } + } else { + ivec2 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height)); + return texelFetch(sampler, coord, 0); + } +} + +vec4 texture(sampler2DRect sampler, vec2 P) { + assert(sampler->format == TextureFormat::RGBA8); + if (sampler->filter == TextureFilter::LINEAR) { + return textureLinearRGBA8(sampler, + P * vec2_scalar{1.0f / sampler->width, 1.0f / sampler->height}); + } else { + ivec2 coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f)); + return texelFetch(sampler, coord); + } +} + +SI vec4 texture(sampler2DArray sampler, vec3 P) { + if (sampler->filter == TextureFilter::LINEAR) { + // SSE2 can generate slow code for 32-bit multiply, and we never actually sample + // from different layers in one chunk, so do cheaper scalar multiplication instead. 
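For comparison with the packed fixed-point paths, textureLinearRGBA32F above is the textbook bilinear blend written with nested mixes. A minimal scalar sketch for one channel (not part of the patch; bilerp is an invented name), where t00/t10/t01/t11 are the four neighboring texels and fx/fy the fractional position between them:

static float bilerp(float t00, float t10, float t01, float t11,
                    float fx, float fy) {
  float top = t00 + (t10 - t00) * fx;     // mix along X on the first row
  float bottom = t01 + (t11 - t01) * fx;  // mix along X on the second row
  return top + (bottom - top) * fy;       // mix the two rows along Y
}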
+ assert(test_all(P.z == P.z.x)); + int32_t zoffset = + clampCoord(roundeven(P.z.x, 1.0f), sampler->depth) * sampler->height_stride; + if (sampler->format == TextureFormat::RGBA8) { + return textureLinearRGBA8(sampler, vec2(P.x, P.y), zoffset); + } else if (sampler->format == TextureFormat::R8) { + return textureLinearR8(sampler, vec2(P.x, P.y), zoffset); + } else { + assert(sampler->format == TextureFormat::RGBA32F); + return textureLinearRGBA32F(sampler, vec2(P.x, P.y), zoffset); + } + } else { + // just do nearest for now + ivec3 coord(roundzero(P.x, sampler->width), roundzero(P.y, sampler->height), + roundeven(P.z, 1.0f)); + return texelFetch(sampler, coord, 0); + } +} + +vec4 texture(sampler2DArray sampler, vec3 P, float bias) { + assert(bias == 0.0f); + return texture(sampler, P); +} + +vec4 textureLod(sampler2DArray sampler, vec3 P, float lod) { + assert(lod == 0.0f); + return texture(sampler, P); +} + +ivec3_scalar textureSize(sampler2DArray sampler, int) { + return ivec3_scalar{int32_t(sampler->width), int32_t(sampler->height), + int32_t(sampler->depth)}; +} + +ivec2_scalar textureSize(sampler2D sampler, int) { + return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; +} + +ivec2_scalar textureSize(sampler2DRect sampler) { + return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; } ivec4 ivec2::sel(XYZW c1, XYZW c2, XYZW c3, XYZW c4) { @@ -2675,30 +2933,15 @@ SI T mix(T x, T y, bvec4_scalar a) { } template <typename T> -SI T mix(T x, T y, bvec4_scalar1 a) { - return a.x ? y : x; -} - -template <typename T> SI T mix(T x, T y, bvec3_scalar a) { return T{a.x ? y.x : x.x, a.y ? y.y : x.y, a.z ? y.z : x.z}; } template <typename T> -SI T mix(T x, T y, bvec3_scalar1 a) { - return a.x ? y : x; -} - -template <typename T> SI T mix(T x, T y, bvec2_scalar a) { return T{a.x ? y.x : x.x, a.y ? y.y : x.y}; } -template <typename T> -SI T mix(T x, T y, bvec2_scalar1 a) { - return a.x ? 
y : x; -} - float dot(vec3_scalar a, vec3_scalar b) { return a.x * b.x + a.y * b.y + a.z * b.z; } @@ -2736,28 +2979,7 @@ Float atan(Float v) { return {atanf(v.x), atanf(v.y), atanf(v.z), atanf(v.w)}; } float atan(float a, float b) { return atan2f(a, b); } Float atan(Float a, Float b) { - return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z), - atan2f(a.w, b.w)}; -} - -bvec4 equal(vec4 x, vec4 y) { - return bvec4(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z), - equal(x.w, y.w)); -} - -bvec4_scalar equal(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar(equal(x.x, y.x), equal(x.y, y.y), equal(x.z, y.z), - equal(x.w, y.w)); -} - -bvec4 notEqual(vec4 x, vec4 y) { - return bvec4(notEqual(x.x, y.x), notEqual(x.y, y.y), notEqual(x.z, y.z), - notEqual(x.w, y.w)); -} - -bvec4_scalar notEqual(vec4_scalar x, vec4_scalar y) { - return bvec4_scalar(notEqual(x.x, y.x), notEqual(x.y, y.y), - notEqual(x.z, y.z), notEqual(x.w, y.w)); + return {atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z), atan2f(a.w, b.w)}; } bvec4 notEqual(ivec4 a, ivec4 b) { @@ -2783,18 +3005,12 @@ vec2 abs(vec2 v) { return vec2(abs(v.x), abs(v.y)); } vec2_scalar abs(vec2_scalar v) { return vec2_scalar{fabsf(v.x), fabsf(v.y)}; } -vec2 sign(vec2 v) { return vec2(sign(v.x), sign(v.y)); } - -vec2_scalar sign(vec2_scalar v) { return vec2_scalar{sign(v.x), sign(v.y)}; } - Float mod(Float a, Float b) { return a - b * floor(a / b); } vec2 mod(vec2 a, vec2 b) { return vec2(mod(a.x, b.x), mod(a.y, b.y)); } vec3 abs(vec3 v) { return vec3(abs(v.x), abs(v.y), abs(v.z)); } -vec3 sign(vec3 v) { return vec3(sign(v.x), sign(v.y), sign(v.z)); } - mat2 inverse(mat2 v) { Float det = v[0].x * v[1].y - v[0].y * v[1].x; return mat2(vec2(v[1].y, -v[0].y), vec2(-v[1].x, v[0].x)) * (1. / det); diff --git a/third_party/webrender/swgl/src/lib.rs b/third_party/webrender/swgl/src/lib.rs index e8fc030e0c9..e19e85fd512 100644 --- a/third_party/webrender/swgl/src/lib.rs +++ b/third_party/webrender/swgl/src/lib.rs @@ -5,7 +5,7 @@ #![crate_name = "swgl"] #![crate_type = "lib"] -extern crate gleam; +use gleam; mod swgl_fns; diff --git a/third_party/webrender/swgl/src/program.h b/third_party/webrender/swgl/src/program.h index 9ea7c6dd6eb..80e5a5b68f7 100644 --- a/third_party/webrender/swgl/src/program.h +++ b/third_party/webrender/swgl/src/program.h @@ -12,12 +12,6 @@ namespace glsl { // to operate in Float-sized chunks. typedef vec3 Interpolants; -// Clip distances, if enabled, are always stored in the first SIMD chunk of the -// interpolants. -static ALWAYS_INLINE Float get_clip_distances(const Interpolants& interp) { - return interp.x; -} - struct VertexShaderImpl; struct FragmentShaderImpl; @@ -29,14 +23,10 @@ struct ProgramImpl { virtual size_t interpolants_size() const = 0; virtual VertexShaderImpl* get_vertex_shader() = 0; virtual FragmentShaderImpl* get_fragment_shader() = 0; - virtual const char* get_name() const = 0; }; typedef ProgramImpl* (*ProgramLoader)(); -// The maximum size of the gl_ClipDistance array. 
-constexpr int32_t gl_MaxClipDistances = 4; - struct VertexShaderImpl { typedef void (*SetUniform1iFunc)(VertexShaderImpl*, int index, int value); typedef void (*SetUniform4fvFunc)(VertexShaderImpl*, int index, @@ -56,17 +46,7 @@ struct VertexShaderImpl { LoadAttribsFunc load_attribs_func = nullptr; RunPrimitiveFunc run_primitive_func = nullptr; - enum FLAGS { - CLIP_DISTANCE = 1 << 0, - }; - int flags = 0; - void enable_clip_distance() { flags |= CLIP_DISTANCE; } - ALWAYS_INLINE bool use_clip_distance() const { - return (flags & CLIP_DISTANCE) != 0; - } - vec4 gl_Position; - Float gl_ClipDistance[gl_MaxClipDistances]; void set_uniform_1i(int index, int value) { (*set_uniform_1i_func)(this, index, value); @@ -92,20 +72,18 @@ struct VertexShaderImpl { } }; -// The number of pixels in a step. -constexpr int32_t swgl_StepSize = 4; - struct FragmentShaderImpl { typedef void (*InitSpanFunc)(FragmentShaderImpl*, const void* interps, - const void* step); + const void* step, float step_width); typedef void (*RunFunc)(FragmentShaderImpl*); - typedef void (*SkipFunc)(FragmentShaderImpl*, int steps); + typedef void (*SkipFunc)(FragmentShaderImpl*, int chunks); typedef void (*InitSpanWFunc)(FragmentShaderImpl*, const void* interps, - const void* step); + const void* step, float step_width); typedef void (*RunWFunc)(FragmentShaderImpl*); - typedef void (*SkipWFunc)(FragmentShaderImpl*, int steps); - typedef int (*DrawSpanRGBA8Func)(FragmentShaderImpl*); - typedef int (*DrawSpanR8Func)(FragmentShaderImpl*); + typedef void (*SkipWFunc)(FragmentShaderImpl*, int chunks); + typedef void (*DrawSpanRGBA8Func)(FragmentShaderImpl*, uint32_t* buf, + int len); + typedef void (*DrawSpanR8Func)(FragmentShaderImpl*, uint8_t* buf, int len); InitSpanFunc init_span_func = nullptr; RunFunc run_func = nullptr; @@ -129,27 +107,31 @@ struct FragmentShaderImpl { } vec4 gl_FragCoord; + vec2_scalar stepZW; + Bool isPixelDiscarded = false; vec4 gl_FragColor; vec4 gl_SecondaryFragColor; - vec2_scalar swgl_StepZW; - Bool swgl_IsPixelDiscarded = false; - // The current buffer position for committing span output. - uint32_t* swgl_OutRGBA8 = nullptr; - uint8_t* swgl_OutR8 = nullptr; - // The remaining number of pixels in the span. - int32_t swgl_SpanLength = 0; + ALWAYS_INLINE void step_fragcoord() { gl_FragCoord.x += 4; } - ALWAYS_INLINE void step_fragcoord(int steps = 4) { gl_FragCoord.x += steps; } + ALWAYS_INLINE void step_fragcoord(int chunks) { + gl_FragCoord.x += 4 * chunks; + } + + ALWAYS_INLINE void step_perspective() { + gl_FragCoord.z += stepZW.x; + gl_FragCoord.w += stepZW.y; + } - ALWAYS_INLINE void step_perspective(int steps = 4) { - gl_FragCoord.z += swgl_StepZW.x * steps; - gl_FragCoord.w += swgl_StepZW.y * steps; + ALWAYS_INLINE void step_perspective(int chunks) { + gl_FragCoord.z += stepZW.x * chunks; + gl_FragCoord.w += stepZW.y * chunks; } template <bool W = false> - ALWAYS_INLINE void init_span(const void* interps, const void* step) { - (*(W ? init_span_w_func : init_span_func))(this, interps, step); + ALWAYS_INLINE void init_span(const void* interps, const void* step, + float step_width) { + (*(W ? init_span_w_func : init_span_func))(this, interps, step, step_width); } template <bool W = false> @@ -158,24 +140,20 @@ struct FragmentShaderImpl { } template <bool W = false> - ALWAYS_INLINE void skip(int steps = 4) { - (*(W ? skip_w_func : skip_func))(this, steps); + ALWAYS_INLINE void skip(int chunks = 1) { + (*(W ? 
skip_w_func : skip_func))(this, chunks); } - ALWAYS_INLINE int draw_span(uint32_t* buf, int len) { - swgl_OutRGBA8 = buf; - swgl_SpanLength = len; - return (*draw_span_RGBA8_func)(this); + ALWAYS_INLINE void draw_span(uint32_t* buf, int len) { + (*draw_span_RGBA8_func)(this, buf, len); } ALWAYS_INLINE bool has_draw_span(uint32_t*) { return draw_span_RGBA8_func != nullptr; } - ALWAYS_INLINE int draw_span(uint8_t* buf, int len) { - swgl_OutR8 = buf; - swgl_SpanLength = len; - return (*draw_span_R8_func)(this); + ALWAYS_INLINE void draw_span(uint8_t* buf, int len) { + (*draw_span_R8_func)(this, buf, len); } ALWAYS_INLINE bool has_draw_span(uint8_t*) { diff --git a/third_party/webrender/swgl/src/rasterize.h b/third_party/webrender/swgl/src/rasterize.h deleted file mode 100644 index 48f3b9e5898..00000000000 --- a/third_party/webrender/swgl/src/rasterize.h +++ /dev/null @@ -1,1670 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// The SWGL depth buffer is roughly organized as a span buffer where each row -// of the depth buffer is a list of spans, and each span has a constant depth -// and a run length (represented by DepthRun). The span from start..start+count -// is placed directly at that start index in the row's array of runs, so that -// there is no need to explicitly record the start index at all. This also -// avoids the need to move items around in the run array to manage insertions -// since space is implicitly always available for a run between any two -// pre-existing runs. Linkage from one run to the next is implicitly defined by -// the count, so if a run exists from start..start+count, the next run will -// implicitly pick up right at index start+count where that preceding run left -// off. All of the DepthRun items that are after the head of the run can remain -// uninitialized until the run needs to be split and a new run needs to start -// somewhere in between. -// For uses like perspective-correct rasterization or with a discard mask, a -// run is not an efficient representation, and it is more beneficial to have -// a flattened array of individual depth samples that can be masked off easily. -// To support this case, the first run in a given row's run array may have a -// zero count, signaling that this entire row is flattened. Critically, the -// depth and count fields in DepthRun are ordered (endian-dependently) so that -// the DepthRun struct can be interpreted as a sign-extended int32_t depth. It -// is then possible to just treat the entire row as an array of int32_t depth -// samples that can be processed with SIMD comparisons, since the count field -// behaves as just the sign-extension of the depth field. The count field is -// limited to 8 bits so that we can support depth values up to 24 bits. -// When a depth buffer is cleared, each row is initialized to a maximal runs -// spanning the entire row. In the normal case, the depth buffer will continue -// to manage itself as a list of runs. If perspective or discard is used for -// a given row, the row will be converted to the flattened representation to -// support it, after which it will only ever revert back to runs if the depth -// buffer is cleared. - -// The largest 24-bit depth value supported. -constexpr uint32_t MAX_DEPTH_VALUE = 0xFFFFFF; -// The longest 8-bit depth run that is supported, aligned to SIMD chunk size. 
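The run encoding described above can be summarized with a small standalone sketch (not part of the patch; Run and walk_runs are invented names, and the field order shown matches the little-endian case). Each run is stored at its own start index, the next run begins count entries later, and a zero count on the first entry marks a row that has been flattened into plain depth samples:

struct Run {
  unsigned depth : 24;  // depth value for every sample covered by the run
  unsigned count : 8;   // run length; 0 on the first entry means "flattened"
};

static void walk_runs(const Run* runs, int width) {
  if (runs->count == 0) {
    return;  // flattened row: the entries are really just int32 depth samples
  }
  for (int x = 0; x < width;) {
    const Run& r = runs[x];
    // r covers samples x .. x + r.count - 1 at constant depth r.depth.
    x += r.count;  // implicit linkage: the next run starts where this one ends
  }
}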
-constexpr uint32_t MAX_DEPTH_RUN = 255 & ~3; - -struct DepthRun { - // Ensure that depth always occupies the LSB and count the MSB so that we - // can sign-extend depth just by setting count to zero, marking it flat. - // When count is non-zero, then this is interpreted as an actual run and - // depth is read in isolation. -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - uint32_t depth : 24; - uint32_t count : 8; -#else - uint32_t count : 8; - uint32_t depth : 24; -#endif - - DepthRun() = default; - DepthRun(uint32_t depth, uint8_t count) : depth(depth), count(count) {} - - // If count is zero, this is actually a flat depth sample rather than a run. - bool is_flat() const { return !count; } - - // Compare a source depth from rasterization with a stored depth value. - template <int FUNC> - ALWAYS_INLINE bool compare(uint32_t src) const { - switch (FUNC) { - case GL_LEQUAL: - return src <= depth; - case GL_LESS: - return src < depth; - case GL_ALWAYS: - return true; - default: - assert(false); - return false; - } - } -}; - -// Fills runs at the given position with the given depth up to the span width. -static ALWAYS_INLINE void set_depth_runs(DepthRun* runs, uint32_t depth, - uint32_t width) { - // If the width exceeds the maximum run size, then we need to output clamped - // runs first. - for (; width >= MAX_DEPTH_RUN; - runs += MAX_DEPTH_RUN, width -= MAX_DEPTH_RUN) { - *runs = DepthRun(depth, MAX_DEPTH_RUN); - } - // If there are still any left over samples to fill under the maximum run - // size, then output one last run for them. - if (width > 0) { - *runs = DepthRun(depth, width); - } -} - -// A cursor for reading and modifying a row's depth run array. It locates -// and iterates through a desired span within all the runs, testing if -// the depth of this span passes or fails the depth test against existing -// runs. If desired, new runs may be inserted to represent depth occlusion -// from this span in the run array. -struct DepthCursor { - // Current position of run the cursor has advanced to. - DepthRun* cur = nullptr; - // The start of the remaining potential samples in the desired span. - DepthRun* start = nullptr; - // The end of the potential samples in the desired span. - DepthRun* end = nullptr; - - DepthCursor() = default; - - // Construct a cursor with runs for a given row's run array and the bounds - // of the span we wish to iterate within it. - DepthCursor(DepthRun* runs, int num_runs, int span_offset, int span_count) - : cur(runs), start(&runs[span_offset]), end(start + span_count) { - // This cursor should never iterate over flat runs - assert(!runs->is_flat()); - DepthRun* end_runs = &runs[num_runs]; - // Clamp end of span to end of row - if (end > end_runs) { - end = end_runs; - } - // If the span starts past the end of the row, just advance immediately - // to it to signal that we're done. - if (start >= end_runs) { - cur = end_runs; - start = end_runs; - return; - } - // Otherwise, find the first depth run that contains the start of the span. - // If the span starts after the given run, then we need to keep searching - // through the row to find an appropriate run. The check above already - // guaranteed that the span starts within the row's runs, and the search - // won't fall off the end. - for (;;) { - assert(cur < end); - DepthRun* next = cur + cur->count; - if (start < next) { - break; - } - cur = next; - } - } - - // The cursor is valid if the current position is at the end or if the run - // contains the start position. 
- bool valid() const { - return cur >= end || (cur <= start && start < cur + cur->count); - } - - // Skip past any initial runs that fail the depth test. If we find a run that - // would pass, then return the accumulated length between where we started - // and that position. Otherwise, if we fall off the end, return -1 to signal - // that there are no more passed runs at the end of this failed region and - // so it is safe for the caller to stop processing any more regions in this - // row. - template <int FUNC> - int skip_failed(uint32_t val) { - assert(valid()); - DepthRun* prev = start; - while (cur < end) { - if (cur->compare<FUNC>(val)) { - return start - prev; - } - cur += cur->count; - start = cur; - } - return -1; - } - - // Helper to convert function parameters into template parameters to hoist - // some checks out of inner loops. - ALWAYS_INLINE int skip_failed(uint32_t val, GLenum func) { - switch (func) { - case GL_LEQUAL: - return skip_failed<GL_LEQUAL>(val); - case GL_LESS: - return skip_failed<GL_LESS>(val); - default: - assert(false); - return -1; - } - } - - // Find a region of runs that passes the depth test. It is assumed the caller - // has called skip_failed first to skip past any runs that failed the depth - // test. This stops when it finds a run that fails the depth test or we fall - // off the end of the row. If the write mask is enabled, this will insert runs - // to represent this new region that passed the depth test. The length of the - // region is returned. - template <int FUNC, bool MASK> - int check_passed(uint32_t val) { - assert(valid()); - DepthRun* prev = cur; - while (cur < end) { - if (!cur->compare<FUNC>(val)) { - break; - } - DepthRun* next = cur + cur->count; - if (next > end) { - if (MASK) { - // Chop the current run where the end of the span falls, making a new - // run from the end of the span till the next run. The beginning of - // the current run will be folded into the run from the start of the - // passed region before returning below. - *end = DepthRun(cur->depth, next - end); - } - // If the next run starts past the end, then just advance the current - // run to the end to signal that we're now at the end of the row. - next = end; - } - cur = next; - } - // If we haven't advanced past the start of the span region, then we found - // nothing that passed. - if (cur <= start) { - return 0; - } - // If 'end' fell within the middle of a passing run, then 'cur' will end up - // pointing at the new partial run created at 'end' where the passing run - // was split to accommodate starting in the middle. The preceding runs will - // be fixed below to properly join with this new split. - int passed = cur - start; - if (MASK) { - // If the search started from a run before the start of the span, then - // edit that run to meet up with the start. - if (prev < start) { - prev->count = start - prev; - } - // Create a new run for the entirety of the passed samples. - set_depth_runs(start, val, passed); - } - start = cur; - return passed; - } - - // Helper to convert function parameters into template parameters to hoist - // some checks out of inner loops. - template <bool MASK> - ALWAYS_INLINE int check_passed(uint32_t val, GLenum func) { - switch (func) { - case GL_LEQUAL: - return check_passed<GL_LEQUAL, MASK>(val); - case GL_LESS: - return check_passed<GL_LESS, MASK>(val); - default: - assert(false); - return 0; - } - } - - ALWAYS_INLINE int check_passed(uint32_t val, GLenum func, bool mask) { - return mask ? 
check_passed<true>(val, func) - : check_passed<false>(val, func); - } - - // Fill a region of runs with a given depth value, bypassing any depth test. - ALWAYS_INLINE void fill(uint32_t depth) { - check_passed<GL_ALWAYS, true>(depth); - } -}; - -// Initialize a depth texture by setting the first run in each row to encompass -// the entire row. -void Texture::init_depth_runs(uint32_t depth) { - if (!buf) return; - DepthRun* runs = (DepthRun*)buf; - for (int y = 0; y < height; y++) { - set_depth_runs(runs, depth, width); - runs += stride() / sizeof(DepthRun); - } - set_cleared(true); -} - -// Fill a portion of the run array with flattened depth samples. -static ALWAYS_INLINE void fill_flat_depth(DepthRun* dst, size_t n, - uint32_t depth) { - fill_n((uint32_t*)dst, n, depth); -} - -// Fills a scissored region of a depth texture with a given depth. -void Texture::fill_depth_runs(uint32_t depth, const IntRect& scissor) { - if (!buf) return; - assert(cleared()); - IntRect bb = bounds().intersection(scissor - offset); - DepthRun* runs = (DepthRun*)sample_ptr(0, bb.y0); - for (int rows = bb.height(); rows > 0; rows--) { - if (bb.width() >= width) { - // If the scissor region encompasses the entire row, reset the row to a - // single run encompassing the entire row. - set_depth_runs(runs, depth, width); - } else if (runs->is_flat()) { - // If the row is flattened, just directly fill the portion of the row. - fill_flat_depth(&runs[bb.x0], bb.width(), depth); - } else { - // Otherwise, if we are still using runs, then set up a cursor to fill - // it with depth runs. - DepthCursor(runs, width, bb.x0, bb.width()).fill(depth); - } - runs += stride() / sizeof(DepthRun); - } -} - -using ZMask = I32; - -#if USE_SSE2 -# define ZMASK_NONE_PASSED 0xFFFF -# define ZMASK_ALL_PASSED 0 -static inline uint32_t zmask_code(ZMask mask) { - return _mm_movemask_epi8(mask); -} -#else -# define ZMASK_NONE_PASSED 0xFFFFFFFFU -# define ZMASK_ALL_PASSED 0 -static inline uint32_t zmask_code(ZMask mask) { - return bit_cast<uint32_t>(CONVERT(mask, U8)); -} -#endif - -// Interprets items in the depth buffer as sign-extended 32-bit depth values -// instead of as runs. Returns a mask that signals which samples in the given -// chunk passed or failed the depth test with given Z value. -template <bool DISCARD> -static ALWAYS_INLINE bool check_depth(I32 src, DepthRun* zbuf, ZMask& outmask, - int span = 4) { - // SSE2 does not support unsigned comparison. So ensure Z value is - // sign-extended to int32_t. - I32 dest = unaligned_load<I32>(zbuf); - // Invert the depth test to check which pixels failed and should be discarded. - ZMask mask = ctx->depthfunc == GL_LEQUAL - ? - // GL_LEQUAL: Not(LessEqual) = Greater - ZMask(src > dest) - : - // GL_LESS: Not(Less) = GreaterEqual - ZMask(src >= dest); - // Mask off any unused lanes in the span. 
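The lane masking that follows is compact enough to be easy to misread; in scalar terms it amounts to the sketch below (illustrative only, not part of the patch; mask_unused_lanes is an invented name). For a partial chunk of span pixels (span < 4), every SIMD lane at or past the span is forced to "failed" so the masked store later leaves its destination sample untouched:

static void mask_unused_lanes(int lane_mask[4], int span) {
  for (int i = 0; i < 4; i++) {
    if (span < i + 1) {   // same comparison as ZMask(span) < ZMask{1, 2, 3, 4}
      lane_mask[i] = -1;  // all bits set: treat the lane as failing the test
    }
  }
}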
- mask |= ZMask(span) < ZMask{1, 2, 3, 4}; - if (zmask_code(mask) == ZMASK_NONE_PASSED) { - return false; - } - if (!DISCARD && ctx->depthmask) { - unaligned_store(zbuf, (mask & dest) | (~mask & src)); - } - outmask = mask; - return true; -} - -static ALWAYS_INLINE I32 packDepth() { - return cast(fragment_shader->gl_FragCoord.z * MAX_DEPTH_VALUE); -} - -static ALWAYS_INLINE void discard_depth(I32 src, DepthRun* zbuf, I32 mask) { - if (ctx->depthmask) { - I32 dest = unaligned_load<I32>(zbuf); - mask |= fragment_shader->swgl_IsPixelDiscarded; - unaligned_store(zbuf, (mask & dest) | (~mask & src)); - } -} - -static ALWAYS_INLINE void mask_output(uint32_t* buf, ZMask zmask, - int span = 4) { - WideRGBA8 r = pack_pixels_RGBA8(); - PackedRGBA8 dst = load_span<PackedRGBA8>(buf, span); - if (blend_key) r = blend_pixels(buf, dst, r, span); - PackedRGBA8 mask = bit_cast<PackedRGBA8>(zmask); - store_span(buf, (mask & dst) | (~mask & pack(r)), span); -} - -template <bool DISCARD> -static ALWAYS_INLINE void discard_output(uint32_t* buf, int span = 4) { - mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span); -} - -template <> -ALWAYS_INLINE void discard_output<false>(uint32_t* buf, int span) { - WideRGBA8 r = pack_pixels_RGBA8(); - if (blend_key) - r = blend_pixels(buf, load_span<PackedRGBA8>(buf, span), r, span); - store_span(buf, pack(r), span); -} - -static ALWAYS_INLINE void mask_output(uint8_t* buf, ZMask zmask, int span = 4) { - WideR8 r = pack_pixels_R8(); - WideR8 dst = unpack(load_span<PackedR8>(buf, span)); - if (blend_key) r = blend_pixels(buf, dst, r, span); - WideR8 mask = packR8(zmask); - store_span(buf, pack((mask & dst) | (~mask & r)), span); -} - -template <bool DISCARD> -static ALWAYS_INLINE void discard_output(uint8_t* buf, int span = 4) { - mask_output(buf, fragment_shader->swgl_IsPixelDiscarded, span); -} - -template <> -ALWAYS_INLINE void discard_output<false>(uint8_t* buf, int span) { - WideR8 r = pack_pixels_R8(); - if (blend_key) - r = blend_pixels(buf, unpack(load_span<PackedR8>(buf, span)), r, span); - store_span(buf, pack(r), span); -} - -struct ClipRect { - float x0; - float y0; - float x1; - float y1; - - explicit ClipRect(const IntRect& i) - : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {} - explicit ClipRect(const Texture& t) : ClipRect(ctx->apply_scissor(t)) { - // If blending is enabled, set blend_key to reflect the resolved blend - // state for the currently drawn primitive. - if (ctx->blend) { - blend_key = ctx->blend_key; - if (swgl_ClipFlags) { - // If there is a blend override set, replace the blend key with it. - if (swgl_ClipFlags & SWGL_CLIP_FLAG_BLEND_OVERRIDE) { - blend_key = swgl_BlendOverride; - } - // If a clip mask is available, set up blending state to use the clip - // mask. - if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { - assert(swgl_ClipMask->format == TextureFormat::R8); - // Constrain the clip mask bounds to always fall within the clip mask. - swgl_ClipMaskBounds.intersect(IntRect{0, 0, int(swgl_ClipMask->width), - int(swgl_ClipMask->height)}); - // The clip mask offset is relative to the viewport. - swgl_ClipMaskOffset += ctx->viewport.origin() - t.offset; - // The clip mask bounds are relative to the clip mask offset. - swgl_ClipMaskBounds.offset(swgl_ClipMaskOffset); - // Finally, constrain the clip rectangle by the clip mask bounds. - intersect(swgl_ClipMaskBounds); - // Modify the blend key so that it will use the clip mask while - // blending. 
- restore_clip_mask(); - } - if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { - // Modify the blend key so that it will use AA while blending. - restore_aa(); - } - } - } else { - blend_key = BLEND_KEY_NONE; - swgl_ClipFlags = 0; - } - } - - FloatRange x_range() const { return {x0, x1}; } - - void intersect(const IntRect& c) { - x0 = max(x0, float(c.x0)); - y0 = max(y0, float(c.y0)); - x1 = min(x1, float(c.x1)); - y1 = min(y1, float(c.y1)); - } - - template <typename P> - void set_clip_mask(int x, int y, P* buf) const { - if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { - swgl_SpanBuf = buf; - swgl_ClipMaskBuf = (uint8_t*)swgl_ClipMask->buf + - (y - swgl_ClipMaskOffset.y) * swgl_ClipMask->stride + - (x - swgl_ClipMaskOffset.x); - } - } - - template <typename P> - bool overlaps(int nump, const P* p) const { - // Generate a mask of which side of the clip rect all of a polygon's points - // fall inside of. This is a cheap conservative estimate of whether the - // bounding box of the polygon might overlap the clip rect, rather than an - // exact test that would require multiple slower line intersections. - int sides = 0; - for (int i = 0; i < nump; i++) { - sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2; - sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8; - } - return sides == 0xF; - } -}; - -// Given a current X position at the center Y position of a row, return the X -// position of the left and right intercepts of the row top and bottom. -template <typename E> -static ALWAYS_INLINE FloatRange x_intercepts(const E& e) { - float rad = 0.5f * abs(e.x_slope()); - return {e.cur_x() - rad, e.cur_x() + rad}; -} - -// Return the AA sub-span corresponding to a given edge. If AA is requested, -// then this finds the X intercepts with the row clipped into range of the -// edge and finally conservatively rounds them out. If there is no AA, then -// it just returns the current rounded X position clipped within bounds. -template <typename E> -static ALWAYS_INLINE IntRange aa_edge(const E& e, const FloatRange& bounds) { - return e.edgeMask ? bounds.clip(x_intercepts(e)).round_out() - : bounds.clip({e.cur_x(), e.cur_x()}).round(); -} - -// Calculate the initial AA coverage as an approximation of the distance from -// the center of the pixel in the direction of the edge slope. Given an edge -// (x,y)..(x+dx,y+dy), then the normalized tangent vector along the edge is -// (dx,dy)/sqrt(dx^2+dy^2). We know that for dy=1 then dx=e.x_slope. We rotate -// the tangent vector either -90 or +90 degrees to get the edge normal vector, -// where 'dx=-dy and 'dy=dx. Once normalized by 1/sqrt(dx^2+dy^2), scale into -// the range of 0..256 so that we can cheaply convert to a fixed-point scale -// factor. It is assumed that at exactly the pixel center the opacity is half -// (128) and linearly decreases along the normal vector at 1:1 scale with the -// slope. While not entirely accurate, this gives a reasonably agreeable looking -// approximation of AA. For edges on which there is no AA, just force the -// opacity to maximum (256) with no slope, relying on the span clipping to trim -// pixels outside the span. 
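The coverage approximation described above can be written out in scalar form (an illustrative sketch, not part of the patch; aa_coverage is an invented name, and the final clamp is only for illustration since the real code folds saturation into later blending). Here dir is -1 for the left edge and +1 for the right edge, edge_x is the edge's X intercept at the row center, and x is a pixel position along the row:

#include <cmath>

static float aa_coverage(float x, float edge_x, float x_slope, float dir) {
  // Edge normal scaled into the 0..256 coverage range.
  float dx = dir * 256.0f / sqrtf(1.0f + x_slope * x_slope);
  // 128 (half coverage) at the pixel center on the edge, falling off at 1:1
  // scale along the normal; written as a start value plus a per-pixel slope,
  // matching the start/slope pair that the span setup steps incrementally.
  float c = (128.0f + dx * (edge_x - 0.5f)) - dx * x;
  return c < 0.0f ? 0.0f : (c > 256.0f ? 256.0f : c);
}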
-template <typename E> -static ALWAYS_INLINE FloatRange aa_dist(const E& e, float dir) { - if (e.edgeMask) { - float dx = (dir * 256.0f) * inversesqrt(1.0f + e.x_slope() * e.x_slope()); - return {128.0f + dx * (e.cur_x() - 0.5f), -dx}; - } else { - return {256.0f, 0.0f}; - } -} - -template <typename P, typename E> -static ALWAYS_INLINE IntRange aa_span(P* buf, const E& left, const E& right, - const FloatRange& bounds) { - // If there is no AA, just return the span from the rounded left edge X - // position to the rounded right edge X position. Clip the span to be within - // the valid bounds. - if (!(swgl_ClipFlags & SWGL_CLIP_FLAG_AA)) { - return bounds.clip({left.cur_x(), right.cur_x()}).round(); - } - - // Calculate the left and right AA spans along with the coverage distances - // and slopes necessary to do blending. - IntRange leftAA = aa_edge(left, bounds); - FloatRange leftDist = aa_dist(left, -1.0f); - IntRange rightAA = aa_edge(right, bounds); - FloatRange rightDist = aa_dist(right, 1.0f); - - // Use the pointer into the destination buffer as a status indicator of the - // coverage offset. The pointer is calculated so that subtracting it with - // the current destination pointer will yield a negative value if the span - // is outside the opaque area and otherwise will yield a positive value - // above the opaque size. This pointer is stored as a uint8 pointer so that - // there are no hidden multiplication instructions and will just return a - // 1:1 linear memory address. Thus the size of the opaque region must also - // be scaled by the pixel size in bytes. - swgl_OpaqueStart = (const uint8_t*)(buf + leftAA.end); - swgl_OpaqueSize = max(rightAA.start - leftAA.end - 3, 0) * sizeof(P); - - // Offset the coverage distances by the end of the left AA span, which - // corresponds to the opaque start pointer, so that pixels become opaque - // immediately after. The distances are also offset for each lane in the - // chunk. - Float offset = cast(leftAA.end + (I32){0, 1, 2, 3}); - swgl_LeftAADist = leftDist.start + offset * leftDist.end; - swgl_RightAADist = rightDist.start + offset * rightDist.end; - swgl_AASlope = - (Float){leftDist.end, rightDist.end, 0.0f, 0.0f} / float(sizeof(P)); - - // Return the full span width from the start of the left span to the end of - // the right span. - return {leftAA.start, rightAA.end}; -} - -// Calculate the span the user clip distances occupy from the left and right -// edges at the current row. -template <typename E> -static ALWAYS_INLINE IntRange clip_distance_range(const E& left, - const E& right) { - Float leftClip = get_clip_distances(left.interp); - Float rightClip = get_clip_distances(right.interp); - // Get the change in clip dist per X step. - Float clipStep = (rightClip - leftClip) / (right.cur_x() - left.cur_x()); - // Find the zero intercepts starting from the left edge. - Float clipDist = left.cur_x() - leftClip * recip(clipStep); - // Find the distance to the start of the span for any clip distances that - // are increasing in value. If the clip distance is constant or decreasing - // in value, then check if it starts outside the clip volume. - Float start = if_then_else(clipStep > 0.0f, clipDist, - if_then_else(leftClip < 0.0f, 1.0e6f, 0.0f)); - // Find the distance to the end of the span for any clip distances that are - // decreasing in value. If the clip distance is constant or increasing in - // value, then check if it ends inside the clip volume. 
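The zero-intercept computation in this function is ordinary linear interpolation and may be easier to read in scalar form (an illustrative sketch, not part of the patch; clip_zero_crossing is an invented name, and the constant-distance case handled by the surrounding if_then_else selections is ignored here):

static float clip_zero_crossing(float left_x, float left_clip,
                                float right_x, float right_clip) {
  // The clip distance changes linearly across the row between the two edges.
  float step = (right_clip - left_clip) / (right_x - left_x);
  // Solve left_clip + step * (x - left_x) == 0 for x.
  return left_x - left_clip / step;
}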
- Float end = if_then_else(clipStep < 0.0f, clipDist, - if_then_else(rightClip >= 0.0f, 1.0e6f, 0.0f)); - // Find the furthest start offset. - start = max(start, start.zwxy); - // Find the closest end offset. - end = min(end, end.zwxy); - // Finally, round the offsets to an integer span that can be used to bound - // the current span. - return FloatRange{max(start.x, start.y), min(end.x, end.y)}.round(); -} - -// Converts a run array into a flattened array of depth samples. This just -// walks through every run and fills the samples with the depth value from -// the run. -static void flatten_depth_runs(DepthRun* runs, size_t width) { - if (runs->is_flat()) { - return; - } - while (width > 0) { - size_t n = runs->count; - fill_flat_depth(runs, n, runs->depth); - runs += n; - width -= n; - } -} - -// Helper function for drawing passed depth runs within the depth buffer. -// Flattened depth (perspective or discard) is not supported. -template <typename P> -static ALWAYS_INLINE void draw_depth_span(uint32_t z, P* buf, - DepthCursor& cursor) { - for (;;) { - // Get the span that passes the depth test. Assume on entry that - // any failed runs have already been skipped. - int span = cursor.check_passed(z, ctx->depthfunc, ctx->depthmask); - // If nothing passed, since we already skipped passed failed runs - // previously, we must have hit the end of the row. Bail out. - if (span <= 0) { - break; - } - if (span >= 4) { - // If we have a draw specialization, try to process as many 4-pixel - // chunks as possible using it. - if (fragment_shader->has_draw_span(buf)) { - int drawn = fragment_shader->draw_span(buf, span & ~3); - buf += drawn; - span -= drawn; - } - // Otherwise, just process each chunk individually. - while (span >= 4) { - fragment_shader->run(); - discard_output<false>(buf); - buf += 4; - span -= 4; - } - } - // If we have a partial chunk left over, we still have to process it as if - // it were a full chunk. Mask off only the part of the chunk we want to - // use. - if (span > 0) { - fragment_shader->run(); - discard_output<false>(buf, span); - buf += span; - } - // Skip past any runs that fail the depth test. - int skip = cursor.skip_failed(z, ctx->depthfunc); - // If there aren't any, that means we won't encounter any more passing runs - // and so it's safe to bail out. - if (skip <= 0) { - break; - } - // Advance interpolants for the fragment shader past the skipped region. - // If we processed a partial chunk above, we actually advanced the - // interpolants a full chunk in the fragment shader's run function. Thus, - // we need to first subtract off that 4-pixel chunk and only partially - // advance them to that partial chunk before we can add on the rest of the - // skips. This is combined with the skip here for efficiency's sake. - fragment_shader->skip(skip - (span > 0 ? 4 - span : 0)); - buf += skip; - } -} - -// Draw a simple span in 4-pixel wide chunks, optionally using depth. -template <bool DISCARD, bool W, typename P, typename Z> -static ALWAYS_INLINE void draw_span(P* buf, DepthRun* depth, int span, Z z) { - if (depth) { - // Depth testing is enabled. If perspective is used, Z values will vary - // across the span, we use packDepth to generate packed Z values suitable - // for depth testing based on current values from gl_FragCoord.z. - // Otherwise, for the no-perspective case, we just use the provided Z. - // Process 4-pixel chunks first. 
- for (; span >= 4; span -= 4, buf += 4, depth += 4) { - I32 zsrc = z(); - ZMask zmask; - if (check_depth<DISCARD>(zsrc, depth, zmask)) { - fragment_shader->run<W>(); - mask_output(buf, zmask); - if (DISCARD) discard_depth(zsrc, depth, zmask); - } else { - fragment_shader->skip<W>(); - } - } - // If there are any remaining pixels, do a partial chunk. - if (span > 0) { - I32 zsrc = z(); - ZMask zmask; - if (check_depth<DISCARD>(zsrc, depth, zmask, span)) { - fragment_shader->run<W>(); - mask_output(buf, zmask, span); - if (DISCARD) discard_depth(zsrc, depth, zmask); - } - } - } else { - // Process 4-pixel chunks first. - for (; span >= 4; span -= 4, buf += 4) { - fragment_shader->run<W>(); - discard_output<DISCARD>(buf); - } - // If there are any remaining pixels, do a partial chunk. - if (span > 0) { - fragment_shader->run<W>(); - discard_output<DISCARD>(buf, span); - } - } -} - -// Called during rasterization to forcefully clear a row on which delayed clear -// has been enabled. If we know that we are going to completely overwrite a part -// of the row, then we only need to clear the row outside of that part. However, -// if blending or discard is enabled, the values of that underlying part of the -// row may be used regardless to produce the final rasterization result, so we -// have to then clear the entire underlying row to prepare it. -template <typename P> -static inline void prepare_row(Texture& colortex, int y, int startx, int endx, - bool use_discard, DepthRun* depth, - uint32_t z = 0, DepthCursor* cursor = nullptr) { - assert(colortex.delay_clear > 0); - // Delayed clear is enabled for the color buffer. Check if needs clear. - uint32_t& mask = colortex.cleared_rows[y / 32]; - if ((mask & (1 << (y & 31))) == 0) { - mask |= 1 << (y & 31); - colortex.delay_clear--; - if (blend_key || use_discard) { - // If depth test, blending, or discard is used, old color values - // might be sampled, so we need to clear the entire row to fill it. - force_clear_row<P>(colortex, y); - } else if (depth) { - if (depth->is_flat() || !cursor) { - // If flat depth is used, we can't cheaply predict if which samples will - // pass. - force_clear_row<P>(colortex, y); - } else { - // Otherwise if depth runs are used, see how many samples initially pass - // the depth test and only fill the row outside those. The fragment - // shader will fill the row within the passed samples. - int passed = - DepthCursor(*cursor).check_passed<false>(z, ctx->depthfunc); - if (startx > 0 || startx + passed < colortex.width) { - force_clear_row<P>(colortex, y, startx, startx + passed); - } - } - } else if (startx > 0 || endx < colortex.width) { - // Otherwise, we only need to clear the row outside of the span. - // The fragment shader will fill the row within the span itself. - force_clear_row<P>(colortex, y, startx, endx); - } - } -} - -// Perpendicular dot-product is the dot-product of a vector with the -// perpendicular vector of the other, i.e. dot(a, {-b.y, b.x}) -template <typename T> -static ALWAYS_INLINE auto perpDot(T a, T b) { - return a.x * b.y - a.y * b.x; -} - -// Check if the winding of the initial edges is flipped, requiring us to swap -// the edges to avoid spans having negative lengths. Assume that l0.y == r0.y -// due to the initial edge scan in draw_quad/perspective_spans. 
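For a concrete feel of the check that follows, here is a standalone scalar version with example points (Pt, perp_dot and edges_flipped are names made up for this sketch):

struct Pt { float x, y; };

static float perp_dot(Pt a, Pt b) { return a.x * b.y - a.y * b.x; }

static bool edges_flipped(Pt l0, Pt l1, Pt r0, Pt r1) {
  return l0.x > r0.x ||
         (l0.x == r0.x && perp_dot({l1.x - l0.x, l1.y - l0.y},
                                   {r1.x - r0.x, r1.y - r0.y}) > 0.0f);
}

// Example: both edges start at (0,0); the "left" edge heads toward (2,4)
// while the "right" edge heads toward (-2,4), so the edges cross and spans
// would have negative length unless swapped:
//   perp_dot({2,4}, {-2,4}) = 2*4 - 4*(-2) = 16 > 0
//   edges_flipped({0,0}, {2,4}, {0,0}, {-2,4}) == true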
-template <typename T> -static ALWAYS_INLINE bool checkIfEdgesFlipped(T l0, T l1, T r0, T r1) { - // If the starting point of the left edge is to the right of the starting - // point of the right edge, then just assume the edges are flipped. If the - // left and right starting points are the same, then check the sign of the - // cross-product of the edges to see if the edges are flipped. Otherwise, - // if the left starting point is actually just to the left of the right - // starting point, then assume no edge flip. - return l0.x > r0.x || (l0.x == r0.x && perpDot(l1 - l0, r1 - r0) > 0.0f); -} - -// Draw spans for each row of a given quad (or triangle) with a constant Z -// value. The quad is assumed convex. It is clipped to fall within the given -// clip rect. In short, this function rasterizes a quad by first finding a -// top most starting point and then from there tracing down the left and right -// sides of this quad until it hits the bottom, outputting a span between the -// current left and right positions at each row along the way. Points are -// assumed to be ordered in either CW or CCW to support this, but currently -// both orders (CW and CCW) are supported and equivalent. -template <typename P> -static inline void draw_quad_spans(int nump, Point2D p[4], uint32_t z, - Interpolants interp_outs[4], - Texture& colortex, Texture& depthtex, - const ClipRect& clipRect) { - // Only triangles and convex quads supported. - assert(nump == 3 || nump == 4); - - Point2D l0, r0, l1, r1; - int l0i, r0i, l1i, r1i; - { - // Find the index of the top-most (smallest Y) point from which - // rasterization can start. - int top = nump > 3 && p[3].y < p[2].y - ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) - : (p[1].y < p[3].y ? 1 : 3)) - : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) - : (p[1].y < p[2].y ? 1 : 2)); - // Helper to find next index in the points array, walking forward. -#define NEXT_POINT(idx) \ - ({ \ - int cur = (idx) + 1; \ - cur < nump ? cur : 0; \ - }) - // Helper to find the previous index in the points array, walking backward. -#define PREV_POINT(idx) \ - ({ \ - int cur = (idx)-1; \ - cur >= 0 ? cur : nump - 1; \ - }) - // Start looking for "left"-side and "right"-side descending edges starting - // from the determined top point. - int next = NEXT_POINT(top); - int prev = PREV_POINT(top); - if (p[top].y == p[next].y) { - // If the next point is on the same row as the top, then advance one more - // time to the next point and use that as the "left" descending edge. - l0i = next; - l1i = NEXT_POINT(next); - // Assume top and prev form a descending "right" edge, as otherwise this - // will be a collapsed polygon and harmlessly bail out down below. - r0i = top; - r1i = prev; - } else if (p[top].y == p[prev].y) { - // If the prev point is on the same row as the top, then advance to the - // prev again and use that as the "right" descending edge. - // Assume top and next form a non-empty descending "left" edge. - l0i = top; - l1i = next; - r0i = prev; - r1i = PREV_POINT(prev); - } else { - // Both next and prev are on distinct rows from top, so both "left" and - // "right" edges are non-empty/descending. - l0i = r0i = top; - l1i = next; - r1i = prev; - } - // Load the points from the indices. 
- l0 = p[l0i]; // Start of left edge - r0 = p[r0i]; // End of left edge - l1 = p[l1i]; // Start of right edge - r1 = p[r1i]; // End of right edge - // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: - // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, - // r1.x, r1.y); - } - - struct Edge { - float yScale; - float xSlope; - float x; - Interpolants interpSlope; - Interpolants interp; - bool edgeMask; - - Edge(float y, const Point2D& p0, const Point2D& p1, const Interpolants& i0, - const Interpolants& i1, int edgeIndex) - : // Inverse Y scale for slope calculations. Avoid divide on 0-length - // edge. Later checks below ensure that Y <= p1.y, or otherwise we - // don't use this edge. We just need to guard against Y == p1.y == - // p0.y. In that case, Y - p0.y == 0 and will cancel out the slopes - // below, except if yScale is Inf for some reason (or worse, NaN), - // which 1/(p1.y-p0.y) might produce if we don't bound it. - yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), - // Calculate dX/dY slope - xSlope((p1.x - p0.x) * yScale), - // Initialize current X based on Y and slope - x(p0.x + (y - p0.y) * xSlope), - // Calculate change in interpolants per change in Y - interpSlope((i1 - i0) * yScale), - // Initialize current interpolants based on Y and slope - interp(i0 + (y - p0.y) * interpSlope), - // Extract the edge mask status for this edge - edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {} - - void nextRow() { - // step current X and interpolants to next row from slope - x += xSlope; - interp += interpSlope; - } - - float cur_x() const { return x; } - float x_slope() const { return xSlope; } - }; - - // Vertex selection above should result in equal left and right start rows - assert(l0.y == r0.y); - // Find the start y, clip to within the clip rect, and round to row center. - // If AA is enabled, round out conservatively rather than round to nearest. - float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f; - float y = floor(max(l0.y, clipRect.y0) + aaRound) + 0.5f; - // Initialize left and right edges from end points and start Y - Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - // WR does not use backface culling, so check if edges are flipped. - bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1); - if (flipped) swap(left, right); - // Get pointer to color buffer and depth buffer at current Y - P* fbuf = (P*)colortex.sample_ptr(0, int(y)); - DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y)); - // Loop along advancing Ys, rasterizing spans at each row - float checkY = min(min(l1.y, r1.y), clipRect.y1); - // Ensure we don't rasterize out edge bounds - FloatRange clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - for (;;) { - // Check if we maybe passed edge ends or outside clip rect... - if (y > checkY) { - // If we're outside the clip rect, we're done. - if (y > clipRect.y1) break; - // Helper to find the next non-duplicate vertex that doesn't loop back. -#define STEP_EDGE(y, e0i, e0, e1i, e1, STEP_POINT, end) \ - do { \ - /* Set new start of edge to be end of old edge */ \ - e0i = e1i; \ - e0 = e1; \ - /* Set new end of edge to next point */ \ - e1i = STEP_POINT(e1i); \ - e1 = p[e1i]; \ - /* If the edge crossed the end, we're done. */ \ - if (e0i == end) return; \ - /* Otherwise, it doesn't advance, so keep searching. 
*/ \ - } while (y > e1.y) - // Check if Y advanced past the end of the left edge - if (y > l1.y) { - // Step to next left edge past Y and reset edge interpolants. - STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i); - (flipped ? right : left) = - Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - } - // Check if Y advanced past the end of the right edge - if (y > r1.y) { - // Step to next right edge past Y and reset edge interpolants. - STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i); - (flipped ? left : right) = - Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - } - // Reset the clip bounds for the new edges - clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - // Reset check condition for next time around. - checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1); - } - - // Calculate a potentially AA'd span and check if it is non-empty. - IntRange span = aa_span(fbuf, left, right, clipSpan); - if (span.len() > 0) { - // If user clip planes are enabled, use them to bound the current span. - if (vertex_shader->use_clip_distance()) { - span = span.intersect(clip_distance_range(left, right)); - if (span.len() <= 0) goto next_span; - } - ctx->shaded_rows++; - ctx->shaded_pixels += span.len(); - // Advance color/depth buffer pointers to the start of the span. - P* buf = fbuf + span.start; - // Check if we will need to use depth-buffer or discard on this span. - DepthRun* depth = - depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr; - DepthCursor cursor; - bool use_discard = fragment_shader->use_discard(); - if (use_discard) { - if (depth) { - // If we're using discard, we may have to unpredictably drop out some - // samples. Flatten the depth run array here to allow this. - if (!depth->is_flat()) { - flatten_depth_runs(depth, depthtex.width); - } - // Advance to the depth sample at the start of the span. - depth += span.start; - } - } else if (depth) { - if (!depth->is_flat()) { - // We're not using discard and the depth row is still organized into - // runs. Skip past any runs that would fail the depth test so we - // don't have to do any extra work to process them with the rest of - // the span. - cursor = DepthCursor(depth, depthtex.width, span.start, span.len()); - int skipped = cursor.skip_failed(z, ctx->depthfunc); - // If we fell off the row, that means we couldn't find any passing - // runs. We can just skip the entire span. - if (skipped < 0) { - goto next_span; - } - buf += skipped; - span.start += skipped; - } else { - // The row is already flattened, so just advance to the span start. - depth += span.start; - } - } - - if (colortex.delay_clear) { - // Delayed clear is enabled for the color buffer. Check if needs clear. - prepare_row<P>(colortex, int(y), span.start, span.end, use_discard, - depth, z, &cursor); - } - - // Initialize fragment shader interpolants to current span position. - fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1); - fragment_shader->gl_FragCoord.y = y; - { - // Change in interpolants is difference between current right and left - // edges per the change in right and left X. - Interpolants step = - (right.interp - left.interp) * (1.0f / (right.x - left.x)); - // Advance current interpolants to X at start of span. - Interpolants o = left.interp + step * (span.start + 0.5f - left.x); - fragment_shader->init_span(&o, &step); - } - clipRect.set_clip_mask(span.start, y, buf); - if (!use_discard) { - // Fast paths for the case where fragment discard is not used. 
- if (depth) { - // If depth is used, we want to process entire depth runs if depth is - // not flattened. - if (!depth->is_flat()) { - draw_depth_span(z, buf, cursor); - goto next_span; - } - // Otherwise, flattened depth must fall back to the slightly slower - // per-chunk depth test path in draw_span below. - } else { - // Check if the fragment shader has an optimized draw specialization. - if (span.len() >= 4 && fragment_shader->has_draw_span(buf)) { - // Draw specialization expects 4-pixel chunks. - int drawn = fragment_shader->draw_span(buf, span.len() & ~3); - buf += drawn; - span.start += drawn; - } - } - draw_span<false, false>(buf, depth, span.len(), [=] { return z; }); - } else { - // If discard is used, then use slower fallbacks. This should be rare. - // Just needs to work, doesn't need to be too fast yet... - draw_span<true, false>(buf, depth, span.len(), [=] { return z; }); - } - } - next_span: - // Advance Y and edge interpolants to next row. - y++; - left.nextRow(); - right.nextRow(); - // Advance buffers to next row. - fbuf += colortex.stride() / sizeof(P); - fdepth += depthtex.stride() / sizeof(DepthRun); - } -} - -// Draw perspective-correct spans for a convex quad that has been clipped to -// the near and far Z planes, possibly producing a clipped convex polygon with -// more than 4 sides. This assumes the Z value will vary across the spans and -// requires interpolants to factor in W values. This tends to be slower than -// the simpler 2D draw_quad_spans above, especially since we can't optimize the -// depth test easily when Z values, and should be used only rarely if possible. -template <typename P> -static inline void draw_perspective_spans(int nump, Point3D* p, - Interpolants* interp_outs, - Texture& colortex, Texture& depthtex, - const ClipRect& clipRect) { - Point3D l0, r0, l1, r1; - int l0i, r0i, l1i, r1i; - { - // Find the index of the top-most point (smallest Y) from which - // rasterization can start. - int top = 0; - for (int i = 1; i < nump; i++) { - if (p[i].y < p[top].y) { - top = i; - } - } - // Find left-most top point, the start of the left descending edge. - // Advance forward in the points array, searching at most nump points - // in case the polygon is flat. - l0i = top; - for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) { - l0i = i; - } - if (l0i == nump - 1) { - for (int i = 0; i <= top && p[i].y == p[top].y; i++) { - l0i = i; - } - } - // Find right-most top point, the start of the right descending edge. - // Advance backward in the points array, searching at most nump points. - r0i = top; - for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) { - r0i = i; - } - if (r0i == 0) { - for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) { - r0i = i; - } - } - // End of left edge is next point after left edge start. - l1i = NEXT_POINT(l0i); - // End of right edge is prev point after right edge start. - r1i = PREV_POINT(r0i); - l0 = p[l0i]; // Start of left edge - r0 = p[r0i]; // End of left edge - l1 = p[l1i]; // Start of right edge - r1 = p[r1i]; // End of right edge - } - - struct Edge { - float yScale; - // Current coordinates for edge. Where in the 2D case of draw_quad_spans, - // it is enough to just track the X coordinate as we advance along the rows, - // for the perspective case we also need to keep track of Z and W. For - // simplicity, we just use the full 3D point to track all these coordinates. 
- Point3D pSlope; - Point3D p; - Interpolants interpSlope; - Interpolants interp; - bool edgeMask; - - Edge(float y, const Point3D& p0, const Point3D& p1, const Interpolants& i0, - const Interpolants& i1, int edgeIndex) - : // Inverse Y scale for slope calculations. Avoid divide on 0-length - // edge. - yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)), - // Calculate dX/dY slope - pSlope((p1 - p0) * yScale), - // Initialize current coords based on Y and slope - p(p0 + (y - p0.y) * pSlope), - // Crucially, these interpolants must be scaled by the point's 1/w - // value, which allows linear interpolation in a perspective-correct - // manner. This will be canceled out inside the fragment shader later. - // Calculate change in interpolants per change in Y - interpSlope((i1 * p1.w - i0 * p0.w) * yScale), - // Initialize current interpolants based on Y and slope - interp(i0 * p0.w + (y - p0.y) * interpSlope), - // Extract the edge mask status for this edge - edgeMask((swgl_AAEdgeMask >> edgeIndex) & 1) {} - - float x() const { return p.x; } - vec2_scalar zw() const { return {p.z, p.w}; } - - void nextRow() { - // step current coords and interpolants to next row from slope - p += pSlope; - interp += interpSlope; - } - - float cur_x() const { return p.x; } - float x_slope() const { return pSlope.x; } - }; - - // Vertex selection above should result in equal left and right start rows - assert(l0.y == r0.y); - // Find the start y, clip to within the clip rect, and round to row center. - // If AA is enabled, round out conservatively rather than round to nearest. - float aaRound = swgl_ClipFlags & SWGL_CLIP_FLAG_AA ? 0.0f : 0.5f; - float y = floor(max(l0.y, clipRect.y0) + aaRound) + 0.5f; - // Initialize left and right edges from end points and start Y - Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - // WR does not use backface culling, so check if edges are flipped. - bool flipped = checkIfEdgesFlipped(l0, l1, r0, r1); - if (flipped) swap(left, right); - // Get pointer to color buffer and depth buffer at current Y - P* fbuf = (P*)colortex.sample_ptr(0, int(y)); - DepthRun* fdepth = (DepthRun*)depthtex.sample_ptr(0, int(y)); - // Loop along advancing Ys, rasterizing spans at each row - float checkY = min(min(l1.y, r1.y), clipRect.y1); - // Ensure we don't rasterize out edge bounds - FloatRange clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - for (;;) { - // Check if we maybe passed edge ends or outside clip rect... - if (y > checkY) { - // If we're outside the clip rect, we're done. - if (y > clipRect.y1) break; - // Check if Y advanced past the end of the left edge - if (y > l1.y) { - // Step to next left edge past Y and reset edge interpolants. - STEP_EDGE(y, l0i, l0, l1i, l1, NEXT_POINT, r1i); - (flipped ? right : left) = - Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i], l1i); - } - // Check if Y advanced past the end of the right edge - if (y > r1.y) { - // Step to next right edge past Y and reset edge interpolants. - STEP_EDGE(y, r0i, r0, r1i, r1, PREV_POINT, l1i); - (flipped ? left : right) = - Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i], r0i); - } - // Reset the clip bounds for the new edges - clipSpan = - clipRect.x_range().clip(x_range(l0, l1).merge(x_range(r0, r1))); - // Reset check condition for next time around. - checkY = min(ceil(min(l1.y, r1.y) - aaRound), clipRect.y1); - } - - // Calculate a potentially AA'd span and check if it is non-empty. 
- IntRange span = aa_span(fbuf, left, right, clipSpan); - if (span.len() > 0) { - // If user clip planes are enabled, use them to bound the current span. - if (vertex_shader->use_clip_distance()) { - span = span.intersect(clip_distance_range(left, right)); - if (span.len() <= 0) goto next_span; - } - ctx->shaded_rows++; - ctx->shaded_pixels += span.len(); - // Advance color/depth buffer pointers to the start of the span. - P* buf = fbuf + span.start; - // Check if the we will need to use depth-buffer or discard on this span. - DepthRun* depth = - depthtex.buf != nullptr && depthtex.cleared() ? fdepth : nullptr; - bool use_discard = fragment_shader->use_discard(); - if (depth) { - // Perspective may cause the depth value to vary on a per sample basis. - // Ensure the depth row is flattened to allow testing of individual - // samples - if (!depth->is_flat()) { - flatten_depth_runs(depth, depthtex.width); - } - // Advance to the depth sample at the start of the span. - depth += span.start; - } - if (colortex.delay_clear) { - // Delayed clear is enabled for the color buffer. Check if needs clear. - prepare_row<P>(colortex, int(y), span.start, span.end, use_discard, - depth); - } - // Initialize fragment shader interpolants to current span position. - fragment_shader->gl_FragCoord.x = init_interp(span.start + 0.5f, 1); - fragment_shader->gl_FragCoord.y = y; - { - // Calculate the fragment Z and W change per change in fragment X step. - vec2_scalar stepZW = - (right.zw() - left.zw()) * (1.0f / (right.x() - left.x())); - // Calculate initial Z and W values for span start. - vec2_scalar zw = left.zw() + stepZW * (span.start + 0.5f - left.x()); - // Set fragment shader's Z and W values so that it can use them to - // cancel out the 1/w baked into the interpolants. - fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x); - fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y); - fragment_shader->swgl_StepZW = stepZW; - // Change in interpolants is difference between current right and left - // edges per the change in right and left X. The left and right - // interpolant values were previously multipled by 1/w, so the step and - // initial span values take this into account. - Interpolants step = - (right.interp - left.interp) * (1.0f / (right.x() - left.x())); - // Advance current interpolants to X at start of span. - Interpolants o = left.interp + step * (span.start + 0.5f - left.x()); - fragment_shader->init_span<true>(&o, &step); - } - clipRect.set_clip_mask(span.start, y, buf); - if (!use_discard) { - // No discard is used. Common case. - draw_span<false, true>(buf, depth, span.len(), packDepth); - } else { - // Discard is used. Rare. - draw_span<true, true>(buf, depth, span.len(), packDepth); - } - } - next_span: - // Advance Y and edge interpolants to next row. - y++; - left.nextRow(); - right.nextRow(); - // Advance buffers to next row. - fbuf += colortex.stride() / sizeof(P); - fdepth += depthtex.stride() / sizeof(DepthRun); - } -} - -// Clip a primitive against both sides of a view-frustum axis, producing -// intermediate vertexes with interpolated attributes that will no longer -// intersect the selected axis planes. This assumes the primitive is convex -// and should produce at most N+2 vertexes for each invocation (only in the -// worst case where one point falls outside on each of the opposite sides -// with the rest of the points inside). The supplied AA edge mask will be -// modified such that it corresponds to the clipped polygon edges. 
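As a simplified reference for the structure of this clipping pass, here is a minimal Sutherland-Hodgman clip against just the single plane x <= w; the types and names are made up for the sketch, and the real clip_side below additionally handles the -w side, negative w, attribute interpolation, and the AA edge mask:

struct Vtx { float x, y, z, w; };

// Clips a convex polygon against x <= w, emitting an interpolated vertex
// wherever an edge crosses the plane. For a single plane the caller must
// provide room for up to nump + 1 output vertices.
static int clip_x_le_w(int nump, const Vtx* in, Vtx* out) {
  int numClip = 0;
  Vtx prev = in[nump - 1];
  bool prevInside = prev.x <= prev.w;
  for (int i = 0; i < nump; i++) {
    Vtx cur = in[i];
    bool curInside = cur.x <= cur.w;
    if (curInside != prevInside) {
      // Edge crosses the plane: solve (prev + k*(cur - prev)).x == (...).w.
      float prevDist = prev.x - prev.w;
      float curDist = cur.x - cur.w;
      float k = prevDist / (prevDist - curDist);
      out[numClip++] = {prev.x + k * (cur.x - prev.x),
                        prev.y + k * (cur.y - prev.y),
                        prev.z + k * (cur.z - prev.z),
                        prev.w + k * (cur.w - prev.w)};
    }
    if (curInside) {
      out[numClip++] = cur;
    }
    prev = cur;
    prevInside = curInside;
  }
  return numClip;
}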
-template <XYZW AXIS> -static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP, - Interpolants* outInterp, int& outEdgeMask) { - // Potential mask bits of which side of a plane a coordinate falls on. - enum SIDE { POSITIVE = 1, NEGATIVE = 2 }; - int numClip = 0; - int edgeMask = outEdgeMask; - Point3D prev = p[nump - 1]; - Interpolants prevInterp = interp[nump - 1]; - float prevCoord = prev.select(AXIS); - // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and - // if so, remember which side it is outside of. In the special case that W is - // negative and |C| < |W|, both -W <= C and C <= W will be false, such that - // we must consider the coordinate as falling outside of both plane sides - // simultaneously. We test each condition separately and combine them to form - // a mask of which plane sides we exceeded. If we neglect to consider both - // sides simultaneously, points can erroneously oscillate from one plane side - // to the other and exceed the supported maximum number of clip outputs. - int prevMask = (prevCoord < -prev.w ? NEGATIVE : 0) | - (prevCoord > prev.w ? POSITIVE : 0); - // Loop through points, finding edges that cross the planes by evaluating - // the side at each point. - outEdgeMask = 0; - for (int i = 0; i < nump; i++, edgeMask >>= 1) { - Point3D cur = p[i]; - Interpolants curInterp = interp[i]; - float curCoord = cur.select(AXIS); - int curMask = - (curCoord < -cur.w ? NEGATIVE : 0) | (curCoord > cur.w ? POSITIVE : 0); - // Check if the previous and current end points are on different sides. If - // the masks of sides intersect, then we consider them to be on the same - // side. So in the case the masks do not intersect, we then consider them - // to fall on different sides. - if (!(curMask & prevMask)) { - // One of the edge's end points is outside the plane with the other - // inside the plane. Find the offset where it crosses the plane and - // adjust the point and interpolants to there. - if (prevMask) { - // Edge that was previously outside crosses inside. - // Evaluate plane equation for previous and current end-point - // based on previous side and calculate relative offset. - if (numClip >= nump + 2) { - // If for some reason we produced more vertexes than we support, just - // bail out. - assert(false); - return 0; - } - // The positive plane is assigned the sign 1, and the negative plane is - // assigned -1. If the point falls outside both planes, that means W is - // negative. To compensate for this, we must interpolate the coordinate - // till W=0, at which point we can choose a single plane side for the - // coordinate to fall on since W will no longer be negative. To compute - // the coordinate where W=0, we compute K = prev.w / (prev.w-cur.w) and - // interpolate C = prev.C + K*(cur.C - prev.C). The sign of C will be - // the side of the plane we need to consider. Substituting K into the - // comparison C < 0, we can then avoid the division in K with a - // cross-multiplication. - float prevSide = - (prevMask & NEGATIVE) && (!(prevMask & POSITIVE) || - prevCoord * (cur.w - prev.w) < - prev.w * (curCoord - prevCoord)) - ? -1 - : 1; - float prevDist = prevCoord - prevSide * prev.w; - float curDist = curCoord - prevSide * cur.w; - // It may happen that after we interpolate by the weight k that due to - // floating point rounding we've underestimated the value necessary to - // push it over the clipping boundary. 
Just in case, nudge the mantissa - // by a single increment so that we essentially round it up and move it - // further inside the clipping boundary. We use nextafter to do this in - // a portable fashion. - float k = prevDist / (prevDist - curDist); - Point3D clipped = prev + (cur - prev) * k; - if (prevSide * clipped.select(AXIS) > clipped.w) { - k = nextafterf(k, 1.0f); - clipped = prev + (cur - prev) * k; - } - outP[numClip] = clipped; - outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; - // Don't output the current edge mask since start point was outside. - numClip++; - } - if (curMask) { - // Edge that was previously inside crosses outside. - // Evaluate plane equation for previous and current end-point - // based on current side and calculate relative offset. - if (numClip >= nump + 2) { - assert(false); - return 0; - } - // In the case the coordinate falls on both plane sides, the computation - // here is much the same as for prevSide, but since we are going from a - // previous W that is positive to current W that is negative, then the - // sign of cur.w - prev.w will flip in the equation. The resulting sign - // is negated to compensate for this. - float curSide = - (curMask & POSITIVE) && (!(curMask & NEGATIVE) || - prevCoord * (cur.w - prev.w) < - prev.w * (curCoord - prevCoord)) - ? 1 - : -1; - float prevDist = prevCoord - curSide * prev.w; - float curDist = curCoord - curSide * cur.w; - // Calculate interpolation weight k and the nudge it inside clipping - // boundary with nextafter. Note that since we were previously inside - // and now crossing outside, we have to flip the nudge direction for - // the weight towards 0 instead of 1. - float k = prevDist / (prevDist - curDist); - Point3D clipped = prev + (cur - prev) * k; - if (curSide * clipped.select(AXIS) > clipped.w) { - k = nextafterf(k, 0.0f); - clipped = prev + (cur - prev) * k; - } - outP[numClip] = clipped; - outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k; - // Output the current edge mask since the end point is inside. - outEdgeMask |= (edgeMask & 1) << numClip; - numClip++; - } - } - if (!curMask) { - // The current end point is inside the plane, so output point unmodified. - if (numClip >= nump + 2) { - assert(false); - return 0; - } - outP[numClip] = cur; - outInterp[numClip] = curInterp; - // Output the current edge mask since the end point is inside. - outEdgeMask |= (edgeMask & 1) << numClip; - numClip++; - } - prev = cur; - prevInterp = curInterp; - prevCoord = curCoord; - prevMask = curMask; - } - return numClip; -} - -// Helper function to dispatch to perspective span drawing with points that -// have already been transformed and clipped. -static inline void draw_perspective_clipped(int nump, Point3D* p_clip, - Interpolants* interp_clip, - Texture& colortex, - Texture& depthtex) { - // If polygon is ouside clip rect, nothing to draw. - ClipRect clipRect(colortex); - if (!clipRect.overlaps(nump, p_clip)) { - return; - } - - // Finally draw perspective-correct spans for the polygon. - if (colortex.internal_format == GL_RGBA8) { - draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex, - depthtex, clipRect); - } else if (colortex.internal_format == GL_R8) { - draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex, - depthtex, clipRect); - } else { - assert(false); - } -} - -// Draws a perspective-correct 3D primitive with varying Z value, as opposed -// to a simple 2D planar primitive with a constant Z value that could be -// trivially Z rejected. 
This requires clipping the primitive against the near -// and far planes to ensure it stays within the valid Z-buffer range. The Z -// and W of each fragment of the primitives are interpolated across the -// generated spans and then depth-tested as appropriate. -// Additionally, vertex attributes must be interpolated with perspective- -// correction by dividing by W before interpolation, and then later multiplied -// by W again to produce the final correct attribute value for each fragment. -// This process is expensive and should be avoided if possible for primitive -// batches that are known ahead of time to not need perspective-correction. -static void draw_perspective(int nump, Interpolants interp_outs[4], - Texture& colortex, Texture& depthtex) { - // Lines are not supported with perspective. - assert(nump >= 3); - // Convert output of vertex shader to screen space. - vec4 pos = vertex_shader->gl_Position; - vec3_scalar scale = - vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; - vec3_scalar offset = - make_vec3(make_vec2(ctx->viewport.origin() - colortex.offset), 0.0f) + - scale; - // Verify if point is between near and far planes, rejecting NaN. - if (test_all(pos.z > -pos.w && pos.z < pos.w)) { - // No points cross the near or far planes, so no clipping required. - // Just divide coords by W and convert to viewport. We assume the W - // coordinate is non-zero and the reciprocal is finite since it would - // otherwise fail the test_none condition. - Float w = 1.0f / pos.w; - vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; - Point3D p[4] = {{screen.x.x, screen.y.x, screen.z.x, w.x}, - {screen.x.y, screen.y.y, screen.z.y, w.y}, - {screen.x.z, screen.y.z, screen.z.z, w.z}, - {screen.x.w, screen.y.w, screen.z.w, w.w}}; - draw_perspective_clipped(nump, p, interp_outs, colortex, depthtex); - } else { - // Points cross the near or far planes, so we need to clip. - // Start with the original 3 or 4 points... - Point3D p[4] = {{pos.x.x, pos.y.x, pos.z.x, pos.w.x}, - {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, - {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, - {pos.x.w, pos.y.w, pos.z.w, pos.w.w}}; - // Clipping can expand the points by 1 for each of 6 view frustum planes. - Point3D p_clip[4 + 6]; - Interpolants interp_clip[4 + 6]; - // Clip against near and far Z planes. - nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip, - swgl_AAEdgeMask); - // If no points are left inside the view frustum, there's nothing to draw. - if (nump < 3) { - return; - } - // After clipping against only the near and far planes, we might still - // produce points where W = 0, exactly at the camera plane. OpenGL specifies - // that for clip coordinates, points must satisfy: - // -W <= X <= W - // -W <= Y <= W - // -W <= Z <= W - // When Z = W = 0, this is trivially satisfied, but when we transform and - // divide by W below it will produce a divide by 0. Usually we want to only - // clip Z to avoid the extra work of clipping X and Y. We can still project - // points that fall outside the view frustum X and Y so long as Z is valid. - // The span drawing code will then ensure X and Y are clamped to viewport - // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, - // will push W further inside the view frustum so that it is no longer 0, - // allowing us to finally proceed to projecting the points to the screen. - for (int i = 0; i < nump; i++) { - // Found an invalid W, so need to clip against X and Y... - if (p_clip[i].w <= 0.0f) { - // Ping-pong p_clip -> p_tmp -> p_clip. 
- Point3D p_tmp[4 + 6]; - Interpolants interp_tmp[4 + 6]; - nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp, - swgl_AAEdgeMask); - if (nump < 3) return; - nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip, - swgl_AAEdgeMask); - if (nump < 3) return; - // After clipping against X and Y planes, there's still points left - // to draw, so proceed to trying projection now... - break; - } - } - // Divide coords by W and convert to viewport. - for (int i = 0; i < nump; i++) { - float w = 1.0f / p_clip[i].w; - // If the W coord is essentially zero, small enough that division would - // result in Inf/NaN, then just set the reciprocal itself to zero so that - // the coordinates becomes zeroed out, as the only valid point that - // satisfies -W <= X/Y/Z <= W is all zeroes. - if (!isfinite(w)) w = 0.0f; - p_clip[i] = Point3D(p_clip[i].sel(X, Y, Z) * w * scale + offset, w); - } - draw_perspective_clipped(nump, p_clip, interp_clip, colortex, depthtex); - } -} - -static void draw_quad(int nump, Texture& colortex, Texture& depthtex) { - // Run vertex shader once for the primitive's vertices. - // Reserve space for 6 sets of interpolants, in case we need to clip against - // near and far planes in the perspective case. - Interpolants interp_outs[4]; - swgl_ClipFlags = 0; - vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants)); - vec4 pos = vertex_shader->gl_Position; - // Check if any vertex W is different from another. If so, use perspective. - if (test_any(pos.w != pos.w.x)) { - draw_perspective(nump, interp_outs, colortex, depthtex); - return; - } - - // Convert output of vertex shader to screen space. - // Divide coords by W and convert to viewport. - float w = 1.0f / pos.w.x; - // If the W coord is essentially zero, small enough that division would - // result in Inf/NaN, then just set the reciprocal itself to zero so that - // the coordinates becomes zeroed out, as the only valid point that - // satisfies -W <= X/Y/Z <= W is all zeroes. - if (!isfinite(w)) w = 0.0f; - vec2 screen = (pos.sel(X, Y) * w + 1) * 0.5f * - vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) + - make_vec2(ctx->viewport.origin() - colortex.offset); - Point2D p[4] = {{screen.x.x, screen.y.x}, - {screen.x.y, screen.y.y}, - {screen.x.z, screen.y.z}, - {screen.x.w, screen.y.w}}; - - // If quad is ouside clip rect, nothing to draw. - ClipRect clipRect(colortex); - if (!clipRect.overlaps(nump, p)) { - return; - } - - // Since the quad is assumed 2D, Z is constant across the quad. - float screenZ = (pos.z.x * w + 1) * 0.5f; - if (screenZ < 0 || screenZ > 1) { - // Z values would cross the near or far plane, so just bail. - return; - } - // Since Z doesn't need to be interpolated, just set the fragment shader's - // Z and W values here, once and for all fragment shader invocations. - uint32_t z = uint32_t(MAX_DEPTH_VALUE * screenZ); - fragment_shader->gl_FragCoord.z = screenZ; - fragment_shader->gl_FragCoord.w = w; - - // If supplied a line, adjust it so that it is a quad at least 1 pixel thick. - // Assume that for a line that all 4 SIMD lanes were actually filled with - // vertexes 0, 1, 1, 0. - if (nump == 2) { - // Nudge Y height to span at least 1 pixel by advancing to next pixel - // boundary so that we step at least 1 row when drawing spans. - if (int(p[0].y + 0.5f) == int(p[1].y + 0.5f)) { - p[2].y = 1 + int(p[1].y + 0.5f); - p[3].y = p[2].y; - // Nudge X width to span at least 1 pixel so that rounded coords fall on - // separate pixels. 
- if (int(p[0].x + 0.5f) == int(p[1].x + 0.5f)) { - p[1].x += 1.0f; - p[2].x += 1.0f; - } - } else { - // If the line already spans at least 1 row, then assume line is vertical - // or diagonal and just needs to be dilated horizontally. - p[2].x += 1.0f; - p[3].x += 1.0f; - } - // Pretend that it's a quad now... - nump = 4; - } - - // Finally draw 2D spans for the quad. Currently only supports drawing to - // RGBA8 and R8 color buffers. - if (colortex.internal_format == GL_RGBA8) { - draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, depthtex, - clipRect); - } else if (colortex.internal_format == GL_R8) { - draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, depthtex, - clipRect); - } else { - assert(false); - } -} - -template <typename INDEX> -static inline void draw_elements(GLsizei count, GLsizei instancecount, - size_t offset, VertexArray& v, - Texture& colortex, Texture& depthtex) { - Buffer& indices_buf = ctx->buffers[v.element_array_buffer_binding]; - if (!indices_buf.buf || offset >= indices_buf.size) { - return; - } - assert((offset & (sizeof(INDEX) - 1)) == 0); - INDEX* indices = (INDEX*)(indices_buf.buf + offset); - count = min(count, (GLsizei)((indices_buf.size - offset) / sizeof(INDEX))); - // Triangles must be indexed at offsets 0, 1, 2. - // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. - if (count == 6 && indices[1] == indices[0] + 1 && - indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) { - assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1); - // Fast path - since there is only a single quad, we only load per-vertex - // attribs once for all instances, as they won't change across instances - // or within an instance. - vertex_shader->load_attribs(v.attribs, indices[0], 0, 4); - draw_quad(4, colortex, depthtex); - for (GLsizei instance = 1; instance < instancecount; instance++) { - vertex_shader->load_attribs(v.attribs, indices[0], instance, 0); - draw_quad(4, colortex, depthtex); - } - } else { - for (GLsizei instance = 0; instance < instancecount; instance++) { - for (GLsizei i = 0; i + 3 <= count; i += 3) { - if (indices[i + 1] != indices[i] + 1 || - indices[i + 2] != indices[i] + 2) { - continue; - } - if (i + 6 <= count && indices[i + 5] == indices[i] + 3) { - assert(indices[i + 3] == indices[i] + 2 && - indices[i + 4] == indices[i] + 1); - vertex_shader->load_attribs(v.attribs, indices[i], instance, 4); - draw_quad(4, colortex, depthtex); - i += 3; - } else { - vertex_shader->load_attribs(v.attribs, indices[i], instance, 3); - draw_quad(3, colortex, depthtex); - } - } - } - } -} diff --git a/third_party/webrender/swgl/src/swgl_ext.h b/third_party/webrender/swgl/src/swgl_ext.h deleted file mode 100644 index 52d240e0818..00000000000 --- a/third_party/webrender/swgl/src/swgl_ext.h +++ /dev/null @@ -1,1826 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -// When using a solid color with clip masking, the cost of loading the clip mask -// in the blend stage exceeds the cost of processing the color. Here we handle -// the entire span of clip mask texture before the blend stage to more -// efficiently process it and modulate it with color without incurring blend -// stage overheads. 
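A scalar sketch of the modulation step this describes, assuming an 8-bit clip mask weighting a single-channel solid color (names are illustrative; the real routine below works on 4-pixel chunks and still feeds the masked color through the normal blend_span/commit_span path):

#include <stdint.h>

// Weight a solid R8 color by per-pixel mask coverage (0..255 -> 0..1).
static void masked_solid_span_r8(uint8_t* buf, const uint8_t* mask,
                                 uint8_t color, int len) {
  for (int i = 0; i < len; i++) {
    buf[i] = (uint8_t)((color * mask[i] + 127) / 255);
  }
}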
-template <typename P, typename C> -static void commit_masked_solid_span(P* buf, C color, int len) { - override_clip_mask(); - uint8_t* mask = get_clip_mask(buf); - for (P* end = &buf[len]; buf < end; buf += 4, mask += 4) { - commit_span( - buf, - blend_span( - buf, - applyColor(expand_mask(buf, unpack(unaligned_load<PackedR8>(mask))), - color))); - } - restore_clip_mask(); -} - -// When using a solid color with anti-aliasing, most of the solid span will not -// benefit from anti-aliasing in the opaque region. We only want to apply the AA -// blend stage in the non-opaque start and end of the span where AA is needed. -template <typename P, typename R> -static ALWAYS_INLINE void commit_aa_solid_span(P* buf, R r, int len) { - if (int start = min((get_aa_opaque_start(buf) + 3) & ~3, len)) { - commit_solid_span<true>(buf, r, start); - buf += start; - len -= start; - } - if (int opaque = min((get_aa_opaque_size(buf) + 3) & ~3, len)) { - override_aa(); - commit_solid_span<true>(buf, r, opaque); - restore_aa(); - buf += opaque; - len -= opaque; - } - if (len > 0) { - commit_solid_span<true>(buf, r, len); - } -} - -// Forces a value with vector run-class to have scalar run-class. -template <typename T> -static ALWAYS_INLINE auto swgl_forceScalar(T v) -> decltype(force_scalar(v)) { - return force_scalar(v); -} - -// Advance all varying inperpolants by a single chunk -#define swgl_stepInterp() step_interp_inputs() - -// Pseudo-intrinsic that accesses the interpolation step for a given varying -#define swgl_interpStep(v) (interp_step.v) - -// Commit an entire span of a solid color. This dispatches to clip-masked and -// anti-aliased fast-paths as appropriate. -#define swgl_commitSolid(format, v, n) \ - do { \ - int len = (n); \ - if (blend_key) { \ - if (swgl_ClipFlags & SWGL_CLIP_FLAG_MASK) { \ - commit_masked_solid_span(swgl_Out##format, \ - packColor(swgl_Out##format, (v)), len); \ - } else if (swgl_ClipFlags & SWGL_CLIP_FLAG_AA) { \ - commit_aa_solid_span(swgl_Out##format, \ - pack_span(swgl_Out##format, (v)), len); \ - } else { \ - commit_solid_span<true>(swgl_Out##format, \ - pack_span(swgl_Out##format, (v)), len); \ - } \ - } else { \ - commit_solid_span<false>(swgl_Out##format, \ - pack_span(swgl_Out##format, (v)), len); \ - } \ - swgl_Out##format += len; \ - swgl_SpanLength -= len; \ - } while (0) -#define swgl_commitSolidRGBA8(v) swgl_commitSolid(RGBA8, v, swgl_SpanLength) -#define swgl_commitSolidR8(v) swgl_commitSolid(R8, v, swgl_SpanLength) -#define swgl_commitPartialSolidRGBA8(len, v) \ - swgl_commitSolid(RGBA8, v, min(int(len), swgl_SpanLength)) -#define swgl_commitPartialSolidR8(len, v) \ - swgl_commitSolid(R8, v, min(int(len), swgl_SpanLength)) - -#define swgl_commitChunk(format, chunk) \ - do { \ - auto r = chunk; \ - if (blend_key) r = blend_span(swgl_Out##format, r); \ - commit_span(swgl_Out##format, r); \ - swgl_Out##format += swgl_StepSize; \ - swgl_SpanLength -= swgl_StepSize; \ - } while (0) - -// Commit a single chunk of a color -#define swgl_commitColor(format, color) \ - swgl_commitChunk(format, pack_pixels_##format(color)) -#define swgl_commitColorRGBA8(color) swgl_commitColor(RGBA8, color) -#define swgl_commitColorR8(color) swgl_commitColor(R8, color) - -template <typename S> -static ALWAYS_INLINE bool swgl_isTextureLinear(S s) { - return s->filter == TextureFilter::LINEAR; -} - -template <typename S> -static ALWAYS_INLINE bool swgl_isTextureRGBA8(S s) { - return s->format == TextureFormat::RGBA8; -} - -template <typename S> -static ALWAYS_INLINE bool swgl_isTextureR8(S 
s) { - return s->format == TextureFormat::R8; -} - -// Use the default linear quantization scale of 128. This gives 7 bits of -// fractional precision, which when multiplied with a signed 9 bit value -// still fits in a 16 bit integer. -const int swgl_LinearQuantizeScale = 128; - -// Quantizes UVs for access into a linear texture. -template <typename S, typename T> -static ALWAYS_INLINE T swgl_linearQuantize(S s, T p) { - return linearQuantize(p, swgl_LinearQuantizeScale, s); -} - -// Quantizes an interpolation step for UVs for access into a linear texture. -template <typename S, typename T> -static ALWAYS_INLINE T swgl_linearQuantizeStep(S s, T p) { - return samplerScale(s, p) * swgl_LinearQuantizeScale; -} - -template <typename S> -static ALWAYS_INLINE WideRGBA8 textureLinearUnpacked(UNUSED uint32_t* buf, - S sampler, ivec2 i) { - return textureLinearUnpackedRGBA8(sampler, i); -} - -template <typename S> -static ALWAYS_INLINE WideR8 textureLinearUnpacked(UNUSED uint8_t* buf, - S sampler, ivec2 i) { - return textureLinearUnpackedR8(sampler, i); -} - -template <typename S> -static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint32_t* buf) { - return swgl_isTextureRGBA8(s); -} - -template <typename S> -static ALWAYS_INLINE bool matchTextureFormat(S s, UNUSED uint8_t* buf) { - return swgl_isTextureR8(s); -} - -// Quantizes the UVs to the 2^7 scale needed for calculating fractional offsets -// for linear sampling. -#define LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv) \ - uv = swgl_linearQuantize(sampler, uv); \ - vec2_scalar uv_step = \ - float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; \ - vec2_scalar min_uv = max( \ - swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); \ - vec2_scalar max_uv = \ - max(swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), \ - min_uv); - -// Implements the fallback linear filter that can deal with clamping and -// arbitrary scales. -template <bool BLEND, typename S, typename C, typename P> -static P* blendTextureLinearFallback(S sampler, vec2 uv, int span, - vec2_scalar uv_step, vec2_scalar min_uv, - vec2_scalar max_uv, C color, P* buf) { - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - commit_blend_span<BLEND>( - buf, applyColor(textureLinearUnpacked(buf, sampler, - ivec2(clamp(uv, min_uv, max_uv))), - color)); - } - return buf; -} - -static ALWAYS_INLINE U64 castForShuffle(V16<int16_t> r) { - return bit_cast<U64>(r); -} -static ALWAYS_INLINE U16 castForShuffle(V4<int16_t> r) { - return bit_cast<U16>(r); -} - -static ALWAYS_INLINE V16<int16_t> applyFracX(V16<int16_t> r, I16 fracx) { - return r * fracx.xxxxyyyyzzzzwwww; -} -static ALWAYS_INLINE V4<int16_t> applyFracX(V4<int16_t> r, I16 fracx) { - return r * fracx; -} - -// Implements a faster linear filter that works with axis-aligned constant Y but -// scales less than 1, i.e. upscaling. In this case we can optimize for the -// constant Y fraction as well as load all chunks from memory in a single tap -// for each row. 
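The fixed-point bilinear step shared by these row filters can be sketched in scalar form as follows, assuming the 7-bit fractions implied by swgl_LinearQuantizeScale = 128 (lerp7 and sample_bilinear_r8 are illustrative names, not SWGL functions):

#include <stdint.h>

static inline int16_t lerp7(int16_t a, int16_t b, int16_t frac) {
  // frac is in [0, 128); 0 returns a, 127 lands just shy of b.
  return a + (((b - a) * frac) >> 7);
}

// One destination sample at integer texel x with fractions (fracx, fracy):
// vertically blend the two rows first (fracy is constant across the row),
// then horizontally blend the two neighboring columns.
static inline int16_t sample_bilinear_r8(const uint8_t* row0,
                                         const uint8_t* row1, int x,
                                         int16_t fracx, int16_t fracy) {
  int16_t c0 = lerp7(row0[x], row1[x], fracy);
  int16_t c1 = lerp7(row0[x + 1], row1[x + 1], fracy);
  return lerp7(c0, c1, fracx);
}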
-template <bool BLEND, typename S, typename C, typename P> -static void blendTextureLinearUpscale(S sampler, vec2 uv, int span, - vec2_scalar uv_step, vec2_scalar min_uv, - vec2_scalar max_uv, C color, P* buf) { - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type; - - ivec2 i(clamp(uv, min_uv, max_uv)); - ivec2 frac = i; - i >>= 7; - P* row0 = (P*)sampler->buf + computeRow(sampler, ivec2_scalar(0, i.y.x)); - P* row1 = row0 + computeNextRowOffset(sampler, ivec2_scalar(0, i.y.x)); - I16 fracx = computeFracX(sampler, i, frac); - int16_t fracy = computeFracY(frac).x; - auto src0 = - CONVERT(unaligned_load<packed_type>(&row0[i.x.x]), signed_unpacked_type); - auto src1 = - CONVERT(unaligned_load<packed_type>(&row1[i.x.x]), signed_unpacked_type); - auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); - - // We attempt to sample ahead by one chunk and interpolate it with the current - // one. However, due to the complication of upscaling, we may not necessarily - // shift in all the next set of samples. - for (P* end = buf + span; buf < end; buf += 4) { - uv.x += uv_step.x; - I32 ixn = cast(uv.x); - I16 fracn = computeFracNoClamp(ixn); - ixn >>= 7; - auto src0n = CONVERT(unaligned_load<packed_type>(&row0[ixn.x]), - signed_unpacked_type); - auto src1n = CONVERT(unaligned_load<packed_type>(&row1[ixn.x]), - signed_unpacked_type); - auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); - - // Since we're upscaling, we know that a source pixel has a larger footprint - // than the destination pixel, and thus all the source pixels needed for - // this chunk will fall within a single chunk of texture data. However, - // since the source pixels don't map 1:1 with destination pixels, we need to - // shift the source pixels over based on their offset from the start of the - // chunk. This could conceivably be optimized better with usage of PSHUFB or - // VTBL instructions However, since PSHUFB requires SSSE3, instead we resort - // to masking in the correct pixels to avoid having to index into memory. - // For the last sample to interpolate with, we need to potentially shift in - // a sample from the next chunk over in the case the samples fill out an - // entire chunk. - auto shuf = src; - auto shufn = SHUFFLE(src, ixn.x == i.x.w ? srcn.yyyy : srcn, 1, 2, 3, 4); - if (i.x.y == i.x.x) { - shuf = shuf.xxyz; - shufn = shufn.xxyz; - } - if (i.x.z == i.x.y) { - shuf = shuf.xyyz; - shufn = shufn.xyyz; - } - if (i.x.w == i.x.z) { - shuf = shuf.xyzz; - shufn = shufn.xyzz; - } - - // Convert back to a signed unpacked type so that we can interpolate the - // final result. - auto interp = bit_cast<signed_unpacked_type>(shuf); - auto interpn = bit_cast<signed_unpacked_type>(shufn); - interp += applyFracX(interpn - interp, fracx) >> 7; - - commit_blend_span<BLEND>( - buf, applyColor(bit_cast<unpacked_type>(interp), color)); - - i.x = ixn; - fracx = fracn; - src = srcn; - } -} - -// This is the fastest variant of the linear filter that still provides -// filtering. In cases where there is no scaling required, but we have a -// subpixel offset that forces us to blend in neighboring pixels, we can -// optimize away most of the memory loads and shuffling that is required by the -// fallback filter. 
-template <bool BLEND, typename S, typename C, typename P> -static void blendTextureLinearFast(S sampler, vec2 uv, int span, - vec2_scalar min_uv, vec2_scalar max_uv, - C color, P* buf) { - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type; - - ivec2 i(clamp(uv, min_uv, max_uv)); - ivec2 frac = i; - i >>= 7; - P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); - P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); - int16_t fracx = computeFracX(sampler, i, frac).x; - int16_t fracy = computeFracY(frac).x; - auto src0 = CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1 = CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); - - // Since there is no scaling, we sample ahead by one chunk and interpolate it - // with the current one. We can then reuse this value on the next iteration. - for (P* end = buf + span; buf < end; buf += 4) { - row0 += 4; - row1 += 4; - auto src0n = - CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1n = - CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); - - // For the last sample to interpolate with, we need to potentially shift in - // a sample from the next chunk over since the samples fill out an entire - // chunk. - auto interp = bit_cast<signed_unpacked_type>(src); - auto interpn = - bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 2, 3, 4)); - interp += ((interpn - interp) * fracx) >> 7; - - commit_blend_span<BLEND>( - buf, applyColor(bit_cast<unpacked_type>(interp), color)); - - src = srcn; - } -} - -// Implements a faster linear filter that works with axis-aligned constant Y but -// downscaling the texture by half. In this case we can optimize for the -// constant X/Y fractions and reduction factor while minimizing shuffling. 
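In scalar terms, the 2x downscale path amounts to the following per-pixel work, again with 7-bit fixed-point fractions (illustrative names only; the real code below does the same across SIMD chunks with shuffles):

#include <stdint.h>

// Each destination pixel consumes two adjacent source texels (even/odd),
// vertically blended by fracy first, then horizontally blended by fracx.
static void linear_downscale2x_row_r8(uint8_t* dst, const uint8_t* row0,
                                      const uint8_t* row1, int len,
                                      int16_t fracx, int16_t fracy) {
  for (int i = 0; i < len; i++) {
    int16_t even = row0[2 * i] + (((row1[2 * i] - row0[2 * i]) * fracy) >> 7);
    int16_t odd = row0[2 * i + 1] +
                  (((row1[2 * i + 1] - row0[2 * i + 1]) * fracy) >> 7);
    dst[i] = (uint8_t)(even + (((odd - even) * fracx) >> 7));
  }
}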
-template <bool BLEND, typename S, typename C, typename P> -static NO_INLINE void blendTextureLinearDownscale(S sampler, vec2 uv, int span, - vec2_scalar min_uv, - vec2_scalar max_uv, C color, - P* buf) { - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - typedef VectorType<int16_t, 4 * sizeof(P)> signed_unpacked_type; - - ivec2 i(clamp(uv, min_uv, max_uv)); - ivec2 frac = i; - i >>= 7; - P* row0 = (P*)sampler->buf + computeRow(sampler, force_scalar(i)); - P* row1 = row0 + computeNextRowOffset(sampler, force_scalar(i)); - int16_t fracx = computeFracX(sampler, i, frac).x; - int16_t fracy = computeFracY(frac).x; - - for (P* end = buf + span; buf < end; buf += 4) { - auto src0 = - CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1 = - CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto src = castForShuffle(src0 + (((src1 - src0) * fracy) >> 7)); - row0 += 4; - row1 += 4; - auto src0n = - CONVERT(unaligned_load<packed_type>(row0), signed_unpacked_type); - auto src1n = - CONVERT(unaligned_load<packed_type>(row1), signed_unpacked_type); - auto srcn = castForShuffle(src0n + (((src1n - src0n) * fracy) >> 7)); - row0 += 4; - row1 += 4; - - auto interp = - bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 0, 2, 4, 6)); - auto interpn = - bit_cast<signed_unpacked_type>(SHUFFLE(src, srcn, 1, 3, 5, 7)); - interp += ((interpn - interp) * fracx) >> 7; - - commit_blend_span<BLEND>( - buf, applyColor(bit_cast<unpacked_type>(interp), color)); - } -} - -enum LinearFilter { - // No linear filter is needed. - LINEAR_FILTER_NEAREST = 0, - // The most general linear filter that handles clamping and varying scales. - LINEAR_FILTER_FALLBACK, - // A linear filter optimized for axis-aligned upscaling. - LINEAR_FILTER_UPSCALE, - // A linear filter with no scaling but with subpixel offset. - LINEAR_FILTER_FAST, - // A linear filter optimized for 2x axis-aligned downscaling. - LINEAR_FILTER_DOWNSCALE -}; - -// Dispatches to an appropriate linear filter depending on the selected filter. -template <bool BLEND, typename S, typename C, typename P> -static P* blendTextureLinearDispatch(S sampler, vec2 uv, int span, - vec2_scalar uv_step, vec2_scalar min_uv, - vec2_scalar max_uv, C color, P* buf, - LinearFilter filter) { - P* end = buf + span; - if (filter != LINEAR_FILTER_FALLBACK) { - // If we're not using the fallback, then Y is constant across the entire - // row. We just need to ensure that we handle any samples that might pull - // data from before the start of the row and require clamping. - float beforeDist = max(0.0f, min_uv.x) - uv.x.x; - if (beforeDist > 0) { - int before = clamp(int(ceil(beforeDist / uv_step.x)) * swgl_StepSize, 0, - int(end - buf)); - buf = blendTextureLinearFallback<BLEND>(sampler, uv, before, uv_step, - min_uv, max_uv, color, buf); - uv.x += (before / swgl_StepSize) * uv_step.x; - } - // We need to check how many samples we can take from inside the row without - // requiring clamping. In case the filter oversamples the row by a step, we - // subtract off a step from the width to leave some room. 
- float insideDist = - min(max_uv.x, float((int(sampler->width) - swgl_StepSize) * - swgl_LinearQuantizeScale)) - - uv.x.x; - if (uv_step.x > 0.0f && insideDist >= uv_step.x) { - int inside = int(end - buf); - if (filter == LINEAR_FILTER_DOWNSCALE) { - inside = clamp(int(insideDist * (0.5f / swgl_LinearQuantizeScale)) & - ~(swgl_StepSize - 1), - 0, inside); - blendTextureLinearDownscale<BLEND>(sampler, uv, inside, min_uv, max_uv, - color, buf); - } else if (filter == LINEAR_FILTER_UPSCALE) { - inside = clamp(int(insideDist / uv_step.x) * swgl_StepSize, 0, inside); - blendTextureLinearUpscale<BLEND>(sampler, uv, inside, uv_step, min_uv, - max_uv, color, buf); - } else { - inside = clamp(int(insideDist * (1.0f / swgl_LinearQuantizeScale)) & - ~(swgl_StepSize - 1), - 0, inside); - blendTextureLinearFast<BLEND>(sampler, uv, inside, min_uv, max_uv, - color, buf); - } - buf += inside; - uv.x += (inside / swgl_StepSize) * uv_step.x; - } - } - // If the fallback filter was requested, or if there are any samples left that - // may be outside the row and require clamping, then handle that with here. - if (buf < end) { - buf = blendTextureLinearFallback<BLEND>( - sampler, uv, int(end - buf), uv_step, min_uv, max_uv, color, buf); - } - return buf; -} - -// Helper function to quantize UVs for linear filtering before dispatch -template <bool BLEND, typename S, typename C, typename P> -static inline int blendTextureLinear(S sampler, vec2 uv, int span, - const vec4_scalar& uv_rect, C color, - P* buf, LinearFilter filter) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); - blendTextureLinearDispatch<BLEND>(sampler, uv, span, uv_step, min_uv, max_uv, - color, buf, filter); - return span; -} - -// Samples an axis-aligned span of on a single row of a texture using 1:1 -// nearest filtering. Sampling is constrained to only fall within the given UV -// bounds. This requires a pointer to the destination buffer. An optional color -// modulus can be supplied. -template <bool BLEND, typename S, typename C, typename P> -static int blendTextureNearestFast(S sampler, vec2 uv, int span, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - - ivec2_scalar i = make_ivec2(samplerScale(sampler, force_scalar(uv))); - ivec2_scalar minUV = - make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y})); - ivec2_scalar maxUV = - make_ivec2(samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w})); - - // Calculate the row pointer within the buffer, clamping to within valid row - // bounds. - P* row = - &((P*)sampler - ->buf)[clamp(clampCoord(i.y, sampler->height), minUV.y, maxUV.y) * - sampler->stride]; - // Find clamped X bounds within the row. - int minX = clamp(minUV.x, 0, sampler->width - 1); - int maxX = clamp(maxUV.x, minX, sampler->width - 1); - int curX = i.x; - int endX = i.x + span; - // If we need to start sampling below the valid sample bounds, then we need to - // fill this section with a constant clamped sample. - if (curX < minX) { - int n = min(minX, endX) - curX; - auto src = - applyColor(unpack(bit_cast<packed_type>(V4<P>(row[minX]))), color); - commit_solid_span<BLEND>(buf, src, n); - buf += n; - curX += n; - } - // Here we only deal with valid samples within the sample bounds. No clamping - // should occur here within these inner loops. 
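Both the linear dispatcher and the nearest fast path in this hunk partition a span the same way: a clamped prefix handled by the slower/clamping code, an unclamped interior handled by the fast loop, and a clamped suffix. The real code works in chunks of swgl_StepSize pixels and routes the clamped portions to the fallback filter; the sketch below only shows the per-pixel bookkeeping, with hypothetical names:

#include <algorithm>
#include <cmath>

struct SpanSplit {
  int before;  // pixels that would sample before min_x and need clamping
  int inside;  // pixels that can run the unclamped fast path
  int after;   // trailing pixels past max_x that need clamping again
};

static SpanSplit splitSpan(float start_x, float step_x, float min_x,
                           float max_x, int span) {
  int before = 0;
  if (step_x > 0.0f && start_x < min_x) {
    before = std::min(span, int(std::ceil((min_x - start_x) / step_x)));
  }
  float inside_start = start_x + before * step_x;
  int inside = 0;
  if (step_x > 0.0f && inside_start <= max_x) {
    inside = std::min(span - before, int((max_x - inside_start) / step_x) + 1);
  }
  return SpanSplit{before, inside, span - before - inside};
}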
- int n = max(min(maxX + 1, endX) - curX, 0); - // Try to process as many chunks as possible with full loads and stores. - for (int end = curX + (n & ~3); curX < end; curX += 4, buf += 4) { - auto src = applyColor(unaligned_load<packed_type>(&row[curX]), color); - commit_blend_span<BLEND>(buf, src); - } - n &= 3; - // If we have any leftover samples after processing chunks, use partial loads - // and stores. - if (n > 0) { - auto src = applyColor(partial_load_span<packed_type>(&row[curX], n), color); - commit_blend_span<BLEND>(buf, src, n); - buf += n; - curX += n; - } - // If we still have samples left above the valid sample bounds, then we again - // need to fill this section with a constant clamped sample. - if (curX < endX) { - auto src = - applyColor(unpack(bit_cast<packed_type>(V4<P>(row[maxX]))), color); - commit_solid_span<BLEND>(buf, src, endX - curX); - } - return span; -} - -// We need to verify that the pixel step reasonably approximates stepping by a -// single texel for every pixel we need to reproduce. Try to ensure that the -// margin of error is no more than approximately 2^-7. Also, we check here if -// the scaling can be quantized for acceleration. -template <typename T> -static ALWAYS_INLINE int spanNeedsScale(int span, T P) { - span &= ~(128 - 1); - span += 128; - int scaled = round((P.x.y - P.x.x) * span); - return scaled != span ? (scaled == span * 2 ? 2 : 1) : 0; -} - -// Helper function to decide whether we can safely apply 1:1 nearest filtering -// without diverging too much from the linear filter. -template <typename S, typename T> -static inline LinearFilter needsTextureLinear(S sampler, T P, int span) { - // First verify if the row Y doesn't change across samples - if (P.y.x != P.y.y) { - return LINEAR_FILTER_FALLBACK; - } - P = samplerScale(sampler, P); - if (int scale = spanNeedsScale(span, P)) { - // If the source region is not flipped and smaller than the destination, - // then we can use the upscaling filter since row Y is constant. - return P.x.x < P.x.y && P.x.y - P.x.x <= 1 - ? LINEAR_FILTER_UPSCALE - : (scale == 2 ? LINEAR_FILTER_DOWNSCALE - : LINEAR_FILTER_FALLBACK); - } - // Also verify that we're reasonably close to the center of a texel - // so that it doesn't look that much different than if a linear filter - // was used. - if ((int(P.x.x * 4.0f + 0.5f) & 3) != 2 || - (int(P.y.x * 4.0f + 0.5f) & 3) != 2) { - // The source and destination regions are the same, but there is a - // significant subpixel offset. We can use a faster linear filter to deal - // with the offset in this case. - return LINEAR_FILTER_FAST; - } - // Otherwise, we have a constant 1:1 step and we're stepping reasonably close - // to the center of each pixel, so it's safe to disable the linear filter and - // use nearest. 
- return LINEAR_FILTER_NEAREST; -} - -// Commit an entire span with linear filtering -#define swgl_commitTextureLinear(format, s, p, uv_rect, color, n) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int len = (n); \ - int drawn = 0; \ - if (LinearFilter filter = needsTextureLinear(s, p, len)) { \ - if (blend_key) { \ - drawn = blendTextureLinear<true>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format, filter); \ - } else { \ - drawn = blendTextureLinear<false>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format, filter); \ - } \ - } else if (blend_key) { \ - drawn = blendTextureNearestFast<true>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestFast<false>(s, p, len, uv_rect, packed_color, \ - swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureLinearRGBA8(s, p, uv_rect) \ - swgl_commitTextureLinear(RGBA8, s, p, uv_rect, NoColor(), swgl_SpanLength) -#define swgl_commitTextureLinearR8(s, p, uv_rect) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), swgl_SpanLength) - -// Commit a partial span with linear filtering, optionally inverting the color -#define swgl_commitPartialTextureLinearR8(len, s, p, uv_rect) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, NoColor(), \ - min(int(len), swgl_SpanLength)) -#define swgl_commitPartialTextureLinearInvertR8(len, s, p, uv_rect) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, InvertColor(), \ - min(int(len), swgl_SpanLength)) - -// Commit an entire span with linear filtering that is scaled by a color -#define swgl_commitTextureLinearColorRGBA8(s, p, uv_rect, color) \ - swgl_commitTextureLinear(RGBA8, s, p, uv_rect, color, swgl_SpanLength) -#define swgl_commitTextureLinearColorR8(s, p, uv_rect, color) \ - swgl_commitTextureLinear(R8, s, p, uv_rect, color, swgl_SpanLength) - -// Helper function that samples from an R8 texture while expanding it to support -// a differing framebuffer format. 
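All of the swgl_commitTexture* macros above share one bookkeeping pattern: pick the blending or non-blending specialization once per span, then advance the output pointer and shrink the remaining span by however many pixels the helper actually committed (possibly zero, e.g. on a format mismatch, in which case the ordinary fragment shader handles the rest). A hedged restatement as a plain function with stand-in callables:

template <typename P, typename DrawBlend, typename DrawNoBlend>
static void commitSpan(bool blendEnabled, P*& out, int& spanLength,
                       DrawBlend drawBlend, DrawNoBlend drawNoBlend) {
  // Each callable returns the number of pixels it committed.
  int drawn = blendEnabled ? drawBlend(out, spanLength)
                           : drawNoBlend(out, spanLength);
  out += drawn;         // advance past the committed pixels
  spanLength -= drawn;  // leave any uncommitted remainder to the caller
}

Keeping BLEND as a compile-time template parameter lets the non-blended specialization drop the blending code entirely, while the runtime branch on blend_key happens once per span rather than once per pixel.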
-template <bool BLEND, typename S, typename C, typename P> -static inline int blendTextureLinearR8(S sampler, vec2 uv, int span, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!swgl_isTextureR8(sampler)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler, uv, uv_step, uv_rect, min_uv, max_uv); - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - commit_blend_span<BLEND>( - buf, applyColor(expand_mask(buf, textureLinearUnpackedR8( - sampler, - ivec2(clamp(uv, min_uv, max_uv)))), - color)); - } - return span; -} - -// Commit an entire span with linear filtering while expanding from R8 to RGBA8 -#define swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, color) \ - do { \ - auto packed_color = packColor(swgl_OutRGBA8, color); \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendTextureLinearR8<true>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_OutRGBA8); \ - } else { \ - drawn = blendTextureLinearR8<false>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_OutRGBA8); \ - } \ - swgl_OutRGBA8 += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureLinearR8ToRGBA8(s, p, uv_rect) \ - swgl_commitTextureLinearColorR8ToRGBA8(s, p, uv_rect, NoColor()) - -// Compute repeating UVs, possibly constrained by tile repeat limits -static inline vec2 tileRepeatUV(vec2 uv, const vec2_scalar& tile_repeat) { - if (tile_repeat.x > 0.0f) { - // Clamp to a number slightly less than the tile repeat limit so that - // it results in a number close to but not equal to 1 after fract(). - // This avoids fract() yielding 0 if the limit was left as whole integer. - uv = clamp(uv, vec2_scalar(0.0f), tile_repeat - 1.0e-6f); - } - return fract(uv); -} - -// Compute the number of non-repeating steps before we need to potentially -// repeat the UVs. -static inline int computeNoRepeatSteps(Float uv, float uv_step, - float tile_repeat, int steps) { - if (uv.w < uv.x) { - // Ensure the UV taps are ordered low to high. - uv = uv.wzyx; - } - // Check if the samples cross the boundary of the next whole integer or the - // tile repeat limit, whichever is lower. - float limit = floor(uv.x) + 1.0f; - if (tile_repeat > 0.0f) { - limit = min(limit, tile_repeat); - } - return uv.x >= 0.0f && uv.w < limit - ? (uv_step != 0.0f - ? int(min(float(steps), (limit - uv.x) / uv_step)) - : steps) - : 0; -} - -// Blends an entire span of texture with linear filtering and repeating UVs. -template <bool BLEND, typename S, typename C, typename P> -static int blendTextureLinearRepeat(S sampler, vec2 uv, int span, - const vec2_scalar& tile_repeat, - const vec4_scalar& uv_repeat, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - vec2_scalar uv_scale = {uv_repeat.z - uv_repeat.x, uv_repeat.w - uv_repeat.y}; - vec2_scalar uv_offset = {uv_repeat.x, uv_repeat.y}; - // Choose a linear filter to use for no-repeat sub-spans - LinearFilter filter = - needsTextureLinear(sampler, uv * uv_scale + uv_offset, span); - // We need to step UVs unscaled and unquantized so that we can modulo them - // with fract. We use uv_scale and uv_offset to map them into the correct - // range. 
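As a worked example of the no-repeat step count above: with the lowest of the four UV taps at u = 0.3, a per-chunk step of 0.1, and no tile-repeat limit, the wrap boundary is floor(0.3) + 1 = 1.0, so at most int((1.0 - 0.3) / 0.1) = 7 chunks can be emitted with plain clamped sampling before fract() has to be reapplied to the UVs.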
- vec2_scalar uv_step = - float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; - uv_scale = swgl_linearQuantizeStep(sampler, uv_scale); - uv_offset = swgl_linearQuantize(sampler, uv_offset); - vec2_scalar min_uv = max( - swgl_linearQuantize(sampler, vec2_scalar{uv_rect.x, uv_rect.y}), 0.0f); - vec2_scalar max_uv = max( - swgl_linearQuantize(sampler, vec2_scalar{uv_rect.z, uv_rect.w}), min_uv); - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - int steps = int(end - buf) / swgl_StepSize; - // Find the sub-span before UVs repeat to avoid expensive repeat math - steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); - if (steps > 0) { - steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); - if (steps > 0) { - buf = blendTextureLinearDispatch<BLEND>( - sampler, fract(uv) * uv_scale + uv_offset, steps * swgl_StepSize, - uv_step * uv_scale, min_uv, max_uv, color, buf, filter); - if (buf >= end) { - break; - } - uv += steps * uv_step; - } - } - // UVs might repeat within this step, so explicitly compute repeated UVs - vec2 repeated_uv = clamp( - tileRepeatUV(uv, tile_repeat) * uv_scale + uv_offset, min_uv, max_uv); - commit_blend_span<BLEND>( - buf, applyColor(textureLinearUnpacked(buf, sampler, ivec2(repeated_uv)), - color)); - } - return span; -} - -// Commit an entire span with linear filtering and repeating UVs -#define swgl_commitTextureLinearRepeat(format, s, p, tile_repeat, uv_repeat, \ - uv_rect, color) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendTextureLinearRepeat<true>(s, p, swgl_SpanLength, \ - tile_repeat, uv_repeat, uv_rect, \ - packed_color, swgl_Out##format); \ - } else { \ - drawn = blendTextureLinearRepeat<false>(s, p, swgl_SpanLength, \ - tile_repeat, uv_repeat, uv_rect, \ - packed_color, swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureLinearRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ - uv_rect) \ - swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ - NoColor()) -#define swgl_commitTextureLinearRepeatColorRGBA8(s, p, tile_repeat, uv_repeat, \ - uv_rect, color) \ - swgl_commitTextureLinearRepeat(RGBA8, s, p, tile_repeat, uv_repeat, uv_rect, \ - color) - -template <typename S> -static ALWAYS_INLINE PackedRGBA8 textureNearestPacked(UNUSED uint32_t* buf, - S sampler, ivec2 i) { - return textureNearestPackedRGBA8(sampler, i); -} - -// Blends an entire span of texture with nearest filtering and either -// repeated or clamped UVs. -template <bool BLEND, bool REPEAT, typename S, typename C, typename P> -static int blendTextureNearestRepeat(S sampler, vec2 uv, int span, - const vec2_scalar& tile_repeat, - const vec4_scalar& uv_rect, C color, - P* buf) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - if (!REPEAT) { - // If clamping, then we step pre-scaled to the sampler. For repeat modes, - // this will be accomplished via uv_scale instead. - uv = samplerScale(sampler, uv); - } - vec2_scalar uv_step = - float(swgl_StepSize) * vec2_scalar{uv.x.y - uv.x.x, uv.y.y - uv.y.x}; - vec2_scalar min_uv = samplerScale(sampler, vec2_scalar{uv_rect.x, uv_rect.y}); - vec2_scalar max_uv = samplerScale(sampler, vec2_scalar{uv_rect.z, uv_rect.w}); - vec2_scalar uv_scale = max_uv - min_uv; - // If the effective sampling area of this texture is only a single pixel, then - // treat it as a solid span. 
For repeat modes, the bounds are specified on - // pixel boundaries, whereas for clamp modes, bounds are on pixel centers, so - // the test varies depending on which. If the sample range on an axis is - // greater than one pixel, we can still check if we don't move far enough from - // the pixel center on that axis to hit the next pixel. - if ((int(min_uv.x) + (REPEAT ? 1 : 0) >= int(max_uv.x) || - (uv_step.x * span * (REPEAT ? uv_scale.x : 1.0f) < 0.5f)) && - (int(min_uv.y) + (REPEAT ? 1 : 0) >= int(max_uv.y) || - (uv_step.y * span * (REPEAT ? uv_scale.y : 1.0f) < 0.5f))) { - vec2 repeated_uv = REPEAT - ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv - : clamp(uv, min_uv, max_uv); - commit_solid_span<BLEND>(buf, - applyColor(unpack(textureNearestPacked( - buf, sampler, ivec2(repeated_uv))), - color), - span); - } else { - for (P* end = buf + span; buf < end; buf += swgl_StepSize, uv += uv_step) { - if (REPEAT) { - int steps = int(end - buf) / swgl_StepSize; - // Find the sub-span before UVs repeat to avoid expensive repeat math - steps = computeNoRepeatSteps(uv.x, uv_step.x, tile_repeat.x, steps); - if (steps > 0) { - steps = computeNoRepeatSteps(uv.y, uv_step.y, tile_repeat.y, steps); - if (steps > 0) { - vec2 inside_uv = fract(uv) * uv_scale + min_uv; - vec2 inside_step = uv_step * uv_scale; - for (P* outside = &buf[steps * swgl_StepSize]; buf < outside; - buf += swgl_StepSize, inside_uv += inside_step) { - commit_blend_span<BLEND>( - buf, applyColor( - textureNearestPacked(buf, sampler, ivec2(inside_uv)), - color)); - } - if (buf >= end) { - break; - } - uv += steps * uv_step; - } - } - } - - // UVs might repeat within this step, so explicitly compute repeated UVs - vec2 repeated_uv = REPEAT - ? tileRepeatUV(uv, tile_repeat) * uv_scale + min_uv - : clamp(uv, min_uv, max_uv); - commit_blend_span<BLEND>( - buf, - applyColor(textureNearestPacked(buf, sampler, ivec2(repeated_uv)), - color)); - } - } - return span; -} - -// Determine if we can use the fast nearest filter for the given nearest mode. -// If the Y coordinate varies more than half a pixel over -// the span (which might cause the texel to alias to the next one), or the span -// needs X scaling, then we have to use the fallback. 
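That decision boils down to two checks: the X step must reproduce one texel per pixel to within roughly 2^-7 over the whole span, and Y must drift by less than half a texel. A hedged sketch of just that test (the real code additionally distinguishes the upscale, 2x downscale, and subpixel-offset fast paths):

#include <cmath>

static bool canUseNearest1to1(float x_step_per_pixel, float y_step_per_pixel,
                              int span) {
  // Pad the span to a multiple of 128 pixels so a per-pixel error on the
  // order of 2^-7 accumulates into a detectable difference after rounding.
  int padded = (span & ~127) + 128;
  bool xIsUnit = int(std::round(x_step_per_pixel * padded)) == padded;
  // If Y moves less than half a texel across the span, every pixel still
  // snaps to the same source row.
  bool yIsFlat = y_step_per_pixel * span < 0.5f;
  return xIsUnit && yIsFlat;
}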
-template <typename S, typename T> -static ALWAYS_INLINE bool needsNearestFallback(S sampler, T P, int span) { - P = samplerScale(sampler, P); - return (P.y.y - P.y.x) * span >= 0.5f || spanNeedsScale(span, P); -} - -// Commit an entire span with nearest filtering and either clamped or repeating -// UVs -#define swgl_commitTextureNearest(format, s, p, uv_rect, color) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int drawn = 0; \ - if (needsNearestFallback(s, p, swgl_SpanLength)) { \ - if (blend_key) { \ - drawn = blendTextureNearestRepeat<true, false>( \ - s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ - swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestRepeat<false, false>( \ - s, p, swgl_SpanLength, 0.0f, uv_rect, packed_color, \ - swgl_Out##format); \ - } \ - } else if (blend_key) { \ - drawn = blendTextureNearestFast<true>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestFast<false>(s, p, swgl_SpanLength, uv_rect, \ - packed_color, swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureNearestRGBA8(s, p, uv_rect) \ - swgl_commitTextureNearest(RGBA8, s, p, uv_rect, NoColor()) -#define swgl_commitTextureNearestColorRGBA8(s, p, uv_rect, color) \ - swgl_commitTextureNearest(RGBA8, s, p, uv_rect, color) - -#define swgl_commitTextureNearestRepeat(format, s, p, tile_repeat, uv_rect, \ - color) \ - do { \ - auto packed_color = packColor(swgl_Out##format, color); \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendTextureNearestRepeat<true, true>( \ - s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ - swgl_Out##format); \ - } else { \ - drawn = blendTextureNearestRepeat<false, true>( \ - s, p, swgl_SpanLength, tile_repeat, uv_rect, packed_color, \ - swgl_Out##format); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitTextureNearestRepeatRGBA8(s, p, tile_repeat, uv_repeat, \ - uv_rect) \ - swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, \ - NoColor()) -#define swgl_commitTextureNearestRepeatColorRGBA8(s, p, tile_repeat, \ - uv_repeat, uv_rect, color) \ - swgl_commitTextureNearestRepeat(RGBA8, s, p, tile_repeat, uv_repeat, color) - -// Commit an entire span of texture with filtering determined by sampler state. -#define swgl_commitTexture(format, s, ...) \ - do { \ - if (s->filter == TextureFilter::LINEAR) { \ - swgl_commitTextureLinear##format(s, __VA_ARGS__); \ - } else { \ - swgl_commitTextureNearest##format(s, __VA_ARGS__); \ - } \ - } while (0) -#define swgl_commitTextureRGBA8(...) swgl_commitTexture(RGBA8, __VA_ARGS__) -#define swgl_commitTextureColorRGBA8(...) \ - swgl_commitTexture(ColorRGBA8, __VA_ARGS__) -#define swgl_commitTextureRepeatRGBA8(...) \ - swgl_commitTexture(RepeatRGBA8, __VA_ARGS__) -#define swgl_commitTextureRepeatColorRGBA8(...) \ - swgl_commitTexture(RepeatColorRGBA8, __VA_ARGS__) - -// Commit an entire span of a separable pass of a Gaussian blur that falls -// within the given radius scaled by supplied coefficients, clamped to uv_rect -// bounds. 
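The blur commit below runs one axis of a separable Gaussian: a 1-D kernel along X in one pass and along Y in another. The exact meaning of the two coefficients handed to the sampler helpers is an internal detail not spelled out in this file, so as a reference point only, a plain horizontal pass with explicit, pre-normalized weights looks like this:

#include <algorithm>
#include <vector>

static void blurRowHorizontal(const float* src, float* dst, int width,
                              const std::vector<float>& weights, int radius) {
  // weights has 2 * radius + 1 entries summing to 1.
  for (int x = 0; x < width; ++x) {
    float sum = 0.0f;
    for (int k = -radius; k <= radius; ++k) {
      int sx = std::clamp(x + k, 0, width - 1);  // clamp taps to the row
      sum += src[sx] * weights[k + radius];
    }
    dst[x] = sum;
  }
}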
-template <bool BLEND, typename S, typename P> -static int blendGaussianBlur(S sampler, vec2 uv, const vec4_scalar& uv_rect, - P* buf, int span, bool hori, int radius, - vec2_scalar coeffs) { - if (!matchTextureFormat(sampler, buf)) { - return 0; - } - vec2_scalar size = {float(sampler->width), float(sampler->height)}; - ivec2_scalar curUV = make_ivec2(force_scalar(uv) * size); - ivec4_scalar bounds = make_ivec4(uv_rect * make_vec4(size, size)); - int startX = curUV.x; - int endX = min(bounds.z, curUV.x + span); - if (hori) { - for (; curUV.x + swgl_StepSize <= endX; - buf += swgl_StepSize, curUV.x += swgl_StepSize) { - commit_blend_span<BLEND>( - buf, gaussianBlurHorizontal<P>(sampler, curUV, bounds.x, bounds.z, - radius, coeffs.x, coeffs.y)); - } - } else { - for (; curUV.x + swgl_StepSize <= endX; - buf += swgl_StepSize, curUV.x += swgl_StepSize) { - commit_blend_span<BLEND>( - buf, gaussianBlurVertical<P>(sampler, curUV, bounds.y, bounds.w, - radius, coeffs.x, coeffs.y)); - } - } - return curUV.x - startX; -} - -#define swgl_commitGaussianBlur(format, s, p, uv_rect, hori, radius, coeffs) \ - do { \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendGaussianBlur<true>(s, p, uv_rect, swgl_Out##format, \ - swgl_SpanLength, hori, radius, coeffs); \ - } else { \ - drawn = blendGaussianBlur<false>(s, p, uv_rect, swgl_Out##format, \ - swgl_SpanLength, hori, radius, coeffs); \ - } \ - swgl_Out##format += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) -#define swgl_commitGaussianBlurRGBA8(s, p, uv_rect, hori, radius, coeffs) \ - swgl_commitGaussianBlur(RGBA8, s, p, uv_rect, hori, radius, coeffs) -#define swgl_commitGaussianBlurR8(s, p, uv_rect, hori, radius, coeffs) \ - swgl_commitGaussianBlur(R8, s, p, uv_rect, hori, radius, coeffs) - -// Convert and pack planar YUV samples to RGB output using a color space -static ALWAYS_INLINE PackedRGBA8 convertYUV(int colorSpace, U16 y, U16 u, - U16 v) { - auto yy = V8<int16_t>(zip(y, y)); - auto uv = V8<int16_t>(zip(u, v)); - return yuvMatrix[colorSpace].convert(yy, uv); -} - -// Helper functions to sample from planar YUV textures before converting to RGB -template <typename S0> -static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, - int colorSpace, - UNUSED int rescaleFactor) { - switch (sampler0->format) { - case TextureFormat::RGBA8: { - auto planar = textureLinearPlanarRGBA8(sampler0, uv0); - return convertYUV(colorSpace, highHalf(planar.rg), lowHalf(planar.rg), - lowHalf(planar.ba)); - } - case TextureFormat::YUV422: { - auto planar = textureLinearPlanarYUV422(sampler0, uv0); - return convertYUV(colorSpace, planar.y, planar.u, planar.v); - } - default: - assert(false); - return PackedRGBA8(0); - } -} - -template <bool BLEND, typename S0, typename P, typename C = NoColor> -static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, int colorSpace, - int rescaleFactor, C color = C()) { - if (!swgl_isTextureLinear(sampler0)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - auto c = packColor(buf, color); - auto* end = buf + span; - for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0) { - commit_blend_span<BLEND>( - buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), - colorSpace, rescaleFactor), - c)); - } - return span; -} - -template <typename S0, typename S1> -static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, - ivec2 uv1, int colorSpace, - UNUSED int rescaleFactor) { - switch (sampler1->format) { 
- case TextureFormat::RG8: { - assert(sampler0->format == TextureFormat::R8); - auto y = textureLinearUnpackedR8(sampler0, uv0); - auto planar = textureLinearPlanarRG8(sampler1, uv1); - return convertYUV(colorSpace, y, lowHalf(planar.rg), highHalf(planar.rg)); - } - case TextureFormat::RGBA8: { - assert(sampler0->format == TextureFormat::R8); - auto y = textureLinearUnpackedR8(sampler0, uv0); - auto planar = textureLinearPlanarRGBA8(sampler1, uv1); - return convertYUV(colorSpace, y, lowHalf(planar.ba), highHalf(planar.rg)); - } - default: - assert(false); - return PackedRGBA8(0); - } -} - -template <bool BLEND, typename S0, typename S1, typename P, - typename C = NoColor> -static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, - const vec4_scalar& uv_rect1, int colorSpace, - int rescaleFactor, C color = C()) { - if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); - auto c = packColor(buf, color); - auto* end = buf + span; - for (; buf < end; buf += swgl_StepSize, uv0 += uv_step0, uv1 += uv_step1) { - commit_blend_span<BLEND>( - buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), - sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), - colorSpace, rescaleFactor), - c)); - } - return span; -} - -template <typename S0, typename S1, typename S2> -static ALWAYS_INLINE PackedRGBA8 sampleYUV(S0 sampler0, ivec2 uv0, S1 sampler1, - ivec2 uv1, S2 sampler2, ivec2 uv2, - int colorSpace, int rescaleFactor) { - assert(sampler0->format == sampler1->format && - sampler0->format == sampler2->format); - switch (sampler0->format) { - case TextureFormat::R8: { - auto y = textureLinearUnpackedR8(sampler0, uv0); - auto u = textureLinearUnpackedR8(sampler1, uv1); - auto v = textureLinearUnpackedR8(sampler2, uv2); - return convertYUV(colorSpace, y, u, v); - } - case TextureFormat::R16: { - // The rescaling factor represents how many bits to add to renormalize the - // texture to 16 bits, and so the color depth is actually 16 minus the - // rescaling factor. - // Need to right shift the sample by the amount of bits over 8 it - // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit - // of precision at the low end already, hence 1 is subtracted from the - // color depth. - int colorDepth = 16 - rescaleFactor; - int rescaleBits = (colorDepth - 1) - 8; - auto y = textureLinearUnpackedR16(sampler0, uv0) >> rescaleBits; - auto u = textureLinearUnpackedR16(sampler1, uv1) >> rescaleBits; - auto v = textureLinearUnpackedR16(sampler2, uv2) >> rescaleBits; - return convertYUV(colorSpace, U16(y), U16(u), U16(v)); - } - default: - assert(false); - return PackedRGBA8(0); - } -} - -// Fallback helper for when we can't specifically accelerate YUV with -// composition. 
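All of these plane layouts funnel into the same per-pixel conversion that the packed yuvMatrix performs in SIMD form. A scalar reference, shown with the standard BT.601 limited-range coefficients (the actual matrix is selected per color space by yuvMatrix[colorSpace]):

#include <algorithm>
#include <cstdint>

static inline uint8_t clamp255(float v) {
  return uint8_t(std::min(std::max(v, 0.0f), 255.0f));
}

static void yuvToRgb601(uint8_t y, uint8_t u, uint8_t v, uint8_t* rgb) {
  float yf = 1.164f * (y - 16);  // expand limited-range luma
  float uf = u - 128.0f;         // center the chroma samples
  float vf = v - 128.0f;
  rgb[0] = clamp255(yf + 1.596f * vf);                // R
  rgb[1] = clamp255(yf - 0.391f * uf - 0.813f * vf);  // G
  rgb[2] = clamp255(yf + 2.018f * uf);                // B
}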
-template <bool BLEND, typename S0, typename S1, typename S2, typename P, - typename C> -static void blendYUVFallback(P* buf, int span, S0 sampler0, vec2 uv0, - vec2_scalar uv_step0, vec2_scalar min_uv0, - vec2_scalar max_uv0, S1 sampler1, vec2 uv1, - vec2_scalar uv_step1, vec2_scalar min_uv1, - vec2_scalar max_uv1, S2 sampler2, vec2 uv2, - vec2_scalar uv_step2, vec2_scalar min_uv2, - vec2_scalar max_uv2, int colorSpace, - int rescaleFactor, C color) { - for (auto* end = buf + span; buf < end; buf += swgl_StepSize, uv0 += uv_step0, - uv1 += uv_step1, uv2 += uv_step2) { - commit_blend_span<BLEND>( - buf, applyColor(sampleYUV(sampler0, ivec2(clamp(uv0, min_uv0, max_uv0)), - sampler1, ivec2(clamp(uv1, min_uv1, max_uv1)), - sampler2, ivec2(clamp(uv2, min_uv2, max_uv2)), - colorSpace, rescaleFactor), - color)); - } -} - -template <bool BLEND, typename S0, typename S1, typename S2, typename P, - typename C = NoColor> -static int blendYUV(P* buf, int span, S0 sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, S1 sampler1, vec2 uv1, - const vec4_scalar& uv_rect1, S2 sampler2, vec2 uv2, - const vec4_scalar& uv_rect2, int colorSpace, - int rescaleFactor, C color = C()) { - if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || - !swgl_isTextureLinear(sampler2)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); - LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); - auto c = packColor(buf, color); - blendYUVFallback<BLEND>(buf, span, sampler0, uv0, uv_step0, min_uv0, max_uv0, - sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, - uv2, uv_step2, min_uv2, max_uv2, colorSpace, - rescaleFactor, c); - return span; -} - -// A variant of the blendYUV that attempts to reuse the inner loops from the -// CompositeYUV infrastructure. CompositeYUV imposes stricter requirements on -// the source data, which in turn allows it to be much faster than blendYUV. -// At a minimum, we need to ensure that we are outputting to a BGRA8 framebuffer -// and that no color scaling is applied, which we can accomplish via template -// specialization. We need to further validate inside that texture formats -// and dimensions are sane for video and that the video is axis-aligned before -// acceleration can proceed. -template <bool BLEND> -static int blendYUV(uint32_t* buf, int span, sampler2DRect sampler0, vec2 uv0, - const vec4_scalar& uv_rect0, sampler2DRect sampler1, - vec2 uv1, const vec4_scalar& uv_rect1, - sampler2DRect sampler2, vec2 uv2, - const vec4_scalar& uv_rect2, int colorSpace, - int rescaleFactor, NoColor noColor = NoColor()) { - if (!swgl_isTextureLinear(sampler0) || !swgl_isTextureLinear(sampler1) || - !swgl_isTextureLinear(sampler2)) { - return 0; - } - LINEAR_QUANTIZE_UV(sampler0, uv0, uv_step0, uv_rect0, min_uv0, max_uv0); - LINEAR_QUANTIZE_UV(sampler1, uv1, uv_step1, uv_rect1, min_uv1, max_uv1); - LINEAR_QUANTIZE_UV(sampler2, uv2, uv_step2, uv_rect2, min_uv2, max_uv2); - auto* end = buf + span; - // CompositeYUV imposes further restrictions on the source textures, such that - // the the Y/U/V samplers must all have a matching format, the U/V samplers - // must have matching sizes and sample coordinates, and there must be no - // change in row across the entire span. 
- if (sampler0->format == sampler1->format && - sampler1->format == sampler2->format && - sampler1->width == sampler2->width && - sampler1->height == sampler2->height && uv_step0.y == 0 && - uv_step0.x > 0 && uv_step1.y == 0 && uv_step1.x > 0 && - uv_step1 == uv_step2 && uv1.x.x == uv2.x.x && uv1.y.x == uv2.y.x) { - // CompositeYUV does not support a clamp rect, so we must take care to - // advance till we're inside the bounds of the clamp rect. - int outside = min(int(ceil(max((min_uv0.x - uv0.x.x) / uv_step0.x, - (min_uv1.x - uv1.x.x) / uv_step1.x))), - (end - buf) / swgl_StepSize); - if (outside > 0) { - blendYUVFallback<BLEND>( - buf, outside * swgl_StepSize, sampler0, uv0, uv_step0, min_uv0, - max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, sampler2, uv2, - uv_step2, min_uv2, max_uv2, colorSpace, rescaleFactor, noColor); - buf += outside * swgl_StepSize; - uv0.x += outside * uv_step0.x; - uv1.x += outside * uv_step1.x; - uv2.x += outside * uv_step2.x; - } - // Find the amount of chunks inside the clamp rect before we hit the - // maximum. If there are any chunks inside, we can finally dispatch to - // CompositeYUV. - int inside = min(int(min((max_uv0.x - uv0.x.x) / uv_step0.x, - (max_uv1.x - uv1.x.x) / uv_step1.x)), - (end - buf) / swgl_StepSize); - if (inside > 0) { - // We need the color depth, which is relative to the texture format and - // rescale factor. - int colorDepth = - (sampler0->format == TextureFormat::R16 ? 16 : 8) - rescaleFactor; - // Finally, call the inner loop of CompositeYUV. - linear_row_yuv<BLEND>( - buf, inside * swgl_StepSize, sampler0, force_scalar(uv0), - uv_step0.x / swgl_StepSize, sampler1, sampler2, force_scalar(uv1), - uv_step1.x / swgl_StepSize, colorDepth, yuvMatrix[colorSpace]); - // Now that we're done, advance past the processed inside portion. - buf += inside * swgl_StepSize; - uv0.x += inside * uv_step0.x; - uv1.x += inside * uv_step1.x; - uv2.x += inside * uv_step2.x; - } - } - // We either got here because we have some samples outside the clamp rect, or - // because some of the preconditions were not satisfied. Process whatever is - // left of the span. - blendYUVFallback<BLEND>(buf, end - buf, sampler0, uv0, uv_step0, min_uv0, - max_uv0, sampler1, uv1, uv_step1, min_uv1, max_uv1, - sampler2, uv2, uv_step2, min_uv2, max_uv2, colorSpace, - rescaleFactor, noColor); - return span; -} - -// Commit a single chunk of a YUV surface represented by multiple planar -// textures. This requires a color space specifier selecting how to convert -// from YUV to RGB output. In the case of HDR formats, a rescaling factor -// selects how many bits of precision must be utilized on conversion. See the -// sampleYUV dispatcher functions for the various supported plane -// configurations this intrinsic accepts. -#define swgl_commitTextureLinearYUV(...) \ - do { \ - int drawn = 0; \ - if (blend_key) { \ - drawn = blendYUV<true>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ - } else { \ - drawn = blendYUV<false>(swgl_OutRGBA8, swgl_SpanLength, __VA_ARGS__); \ - } \ - swgl_OutRGBA8 += drawn; \ - swgl_SpanLength -= drawn; \ - } while (0) - -// Commit a single chunk of a YUV surface scaled by a color. -#define swgl_commitTextureLinearColorYUV(...) \ - swgl_commitTextureLinearYUV(__VA_ARGS__) - -// Each gradient stops entry is a pair of RGBA32F start color and end step. -struct GradientStops { - Float startColor; - union { - Float stepColor; - vec4_scalar stepData; - }; - - // Whether this gradient entry can be merged with an adjacent entry. 
The - // step will be equal with the adjacent step if and only if they can be - // merged, or rather, that the stops are actually part of a single larger - // gradient. - bool can_merge(const GradientStops& next) const { - return stepData == next.stepData; - } - - // Get the interpolated color within the entry based on the offset from its - // start. - Float interpolate(float offset) const { - return startColor + stepColor * offset; - } - - // Get the end color of the entry where interpolation stops. - Float end_color() const { return startColor + stepColor; } -}; - -// Checks if a gradient table of the specified size exists at the UV coords of -// the address within an RGBA32F texture. If so, a linear address within the -// texture is returned that may be used to sample the gradient table later. If -// the address doesn't describe a valid gradient, then a negative value is -// returned. -static inline int swgl_validateGradient(sampler2D sampler, ivec2_scalar address, - int entries) { - return sampler->format == TextureFormat::RGBA32F && address.y >= 0 && - address.y < int(sampler->height) && address.x >= 0 && - address.x < int(sampler->width) && entries > 0 && - address.x + - int(sizeof(GradientStops) / sizeof(Float)) * entries <= - int(sampler->width) - ? address.y * sampler->stride + address.x * 4 - : -1; -} - -static inline WideRGBA8 sampleGradient(sampler2D sampler, int address, - Float entry) { - assert(sampler->format == TextureFormat::RGBA32F); - assert(address >= 0 && address < int(sampler->height * sampler->stride)); - // Get the integer portion of the entry index to find the entry colors. - I32 index = cast(entry); - // Use the fractional portion of the entry index to control blending between - // entry colors. - Float offset = entry - cast(index); - // Every entry is a pair of colors blended by the fractional offset. - assert(test_all(index >= 0 && - index * int(sizeof(GradientStops) / sizeof(Float)) < - int(sampler->width))); - GradientStops* stops = (GradientStops*)&sampler->buf[address]; - // Blend between the colors for each SIMD lane, then pack them to RGBA8 - // result. Since the layout of the RGBA8 framebuffer is actually BGRA while - // the gradient table has RGBA colors, swizzling is required. - return combine( - packRGBA8(round_pixel(stops[index.x].interpolate(offset.x).zyxw), - round_pixel(stops[index.y].interpolate(offset.y).zyxw)), - packRGBA8(round_pixel(stops[index.z].interpolate(offset.z).zyxw), - round_pixel(stops[index.w].interpolate(offset.w).zyxw))); -} - -// Samples a gradient entry from the gradient at the provided linearized -// address. The integer portion of the entry index is used to find the entry -// within the table whereas the fractional portion is used to blend between -// adjacent table entries. -#define swgl_commitGradientRGBA8(sampler, address, entry) \ - swgl_commitChunk(RGBA8, sampleGradient(sampler, address, entry)) - -// Variant that allows specifying a color multiplier of the gradient result. -#define swgl_commitGradientColorRGBA8(sampler, address, entry, color) \ - swgl_commitChunk(RGBA8, applyColor(sampleGradient(sampler, address, entry), \ - packColor(swgl_OutRGBA, color))) - -// Samples an entire span of a linear gradient by crawling the gradient table -// and looking for consecutive stops that can be merged into a single larger -// gradient, then interpolating between those larger gradients within the span. 
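The crawl described above relies on one observation: adjacent table entries belong to a single larger linear segment exactly when they carry the same per-entry step color, so the walk extends an index range while that holds and then interpolates across the merged range in one pass. An illustrative sketch that only loosely mirrors GradientStops:

struct StopEntry {
  float startColor[4];
  float stepColor[4];
};

static bool sameStep(const StopEntry& a, const StopEntry& b) {
  for (int i = 0; i < 4; ++i) {
    if (a.stepColor[i] != b.stepColor[i]) return false;
  }
  return true;
}

// Walk forward while the next entry continues the same linear ramp; returns
// the last entry of the merged segment starting at index.
static int extendMergedRange(const StopEntry* stops, int index, int maxEntry) {
  while (index + 1 <= maxEntry && sameStep(stops[index], stops[index + 1])) {
    ++index;
  }
  return index;
}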
-template <bool BLEND> -static bool commitLinearGradient(sampler2D sampler, int address, float size, - bool repeat, Float offset, uint32_t* buf, - int span) { - assert(sampler->format == TextureFormat::RGBA32F); - assert(address >= 0 && address < int(sampler->height * sampler->stride)); - GradientStops* stops = (GradientStops*)&sampler->buf[address]; - // Get the chunk delta from the difference in offset steps. This represents - // how far within the gradient table we advance for every step in output, - // normalized to gradient table size. - float delta = (offset.y - offset.x) * 4.0f; - if (!isfinite(delta)) { - return false; - } - for (; span > 0;) { - // If repeat is desired, we need to limit the offset to a fractional value. - if (repeat) { - offset = fract(offset); - } - // Try to process as many chunks as are within the span if possible. - float chunks = 0.25f * span; - // To properly handle both clamping and repeating of the table offset, we - // need to ensure we don't run past the 0 and 1 points. Here we compute the - // intercept points depending on whether advancing forwards or backwards in - // the gradient table to ensure the chunk count is limited by the amount - // before intersection. If there is no delta, then we compute no intercept. - float startEntry; - int minIndex, maxIndex; - if (offset.x < 0) { - // If we're below the gradient table, use the first color stop. We can - // only intercept the table if walking forward. - startEntry = 0; - minIndex = int(startEntry); - maxIndex = minIndex; - if (delta > 0) { - chunks = min(chunks, -offset.x / delta); - } - } else if (offset.x < 1) { - // Otherwise, we're inside the gradient table. Depending on the direction - // we're walking the the table, we may intersect either the 0 or 1 offset. - // Compute the start entry based on our initial offset, and compute the - // end entry based on the available chunks limited by intercepts. Clamp - // them into the valid range of the table. - startEntry = 1.0f + offset.x * size; - if (delta < 0) { - chunks = min(chunks, -offset.x / delta); - } else if (delta > 0) { - chunks = min(chunks, (1 - offset.x) / delta); - } - float endEntry = clamp(1.0f + (offset.x + delta * int(chunks)) * size, - 0.0f, 1.0f + size); - // Now that we know the range of entries we need to sample, we want to - // find the largest possible merged gradient within that range. Depending - // on which direction we are advancing in the table, we either walk up or - // down the table trying to merge the current entry with the adjacent - // entry. We finally limit the chunks to only sample from this merged - // gradient. - minIndex = int(startEntry); - maxIndex = minIndex; - if (delta > 0) { - while (maxIndex + 1 < endEntry && - stops[maxIndex].can_merge(stops[maxIndex + 1])) { - maxIndex++; - } - chunks = min(chunks, (maxIndex + 1 - startEntry) / (delta * size)); - } else if (delta < 0) { - while (minIndex - 1 > endEntry && - stops[minIndex - 1].can_merge(stops[minIndex])) { - minIndex--; - } - chunks = min(chunks, (minIndex - startEntry) / (delta * size)); - } - } else { - // If we're above the gradient table, use the last color stop. We can - // only intercept the table if walking backward. 
- startEntry = 1.0f + size; - minIndex = int(startEntry); - maxIndex = minIndex; - if (delta < 0) { - chunks = min(chunks, (1 - offset.x) / delta); - } - } - // If there are any amount of whole chunks of a merged gradient found, - // then we want to process that as a single gradient span with the start - // and end colors from the min and max entries. - if (chunks >= 1.0f) { - int inside = int(chunks); - // Sample the start color from the min entry and the end color from the - // max entry of the merged gradient. These are scaled to a range of - // 0..0xFF00, as that is the largest shifted value that can fit in a U16. - // Since we are only doing addition with the step value, we can still - // represent negative step values without having to use an explicit sign - // bit, as the result will still come out the same, allowing us to gain an - // extra bit of precision. We will later shift these into 8 bit output - // range while committing the span, but stepping with higher precision to - // avoid banding. We convert from RGBA to BGRA here to avoid doing this in - // the inner loop. - auto minColorF = stops[minIndex].startColor.zyxw * float(0xFF00); - auto maxColorF = stops[maxIndex].end_color().zyxw * float(0xFF00); - // Get the color range of the merged gradient, normalized to its size. - auto colorRangeF = - (maxColorF - minColorF) * (1.0f / (maxIndex + 1 - minIndex)); - // Compute the actual starting color of the current start offset within - // the merged gradient. The value 0.5 is added to the low bits (0x80) so - // that the color will effective round to the nearest increment below. - auto colorF = - minColorF + colorRangeF * (startEntry - minIndex) + float(0x80); - // Compute the portion of the color range that we advance on each chunk. - Float deltaColorF = colorRangeF * (delta * size); - // Quantize the color delta and current color. These have already been - // scaled to the 0..0xFF00 range, so we just need to round them to U16. - auto deltaColor = repeat4(CONVERT(round_pixel(deltaColorF, 1), U16)); - auto color = - combine(CONVERT(round_pixel(colorF, 1), U16), - CONVERT(round_pixel(colorF + deltaColorF * 0.25f, 1), U16), - CONVERT(round_pixel(colorF + deltaColorF * 0.5f, 1), U16), - CONVERT(round_pixel(colorF + deltaColorF * 0.75f, 1), U16)); - // Finally, step the current color through the output chunks, shifting - // it into 8 bit range and outputting as we go. - for (auto* end = buf + inside * 4; buf < end; buf += 4) { - commit_blend_span<BLEND>(buf, bit_cast<WideRGBA8>(color >> 8)); - color += deltaColor; - } - // Deduct the number of chunks inside the gradient from the remaining - // overall span. If we exhausted the span, bail out. - span -= inside * 4; - if (span <= 0) { - break; - } - // Otherwise, assume we're in a transitional section of the gradient that - // will probably require per-sample table lookups, so fall through below. - offset += inside * delta; - if (repeat) { - offset = fract(offset); - } - } - // If we get here, there were no whole chunks of a merged gradient found - // that we could process, but we still have a non-zero amount of span left. - // That means we have segments of gradient that begin or end at the current - // entry we're on. For this case, we just fall back to sampleGradient which - // will calculate a table entry for each sample, assuming the samples may - // have different table entries. 
- Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); - commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry)); - span -= 4; - buf += 4; - offset += delta; - } - return true; -} - -// Commits an entire span of a linear gradient, given the address of a table -// previously resolved with swgl_validateGradient. The size of the inner portion -// of the table is given, assuming the table start and ends with a single entry -// each to deal with clamping. Repeating will be handled if necessary. The -// initial offset within the table is used to designate where to start the span -// and how to step through the gradient table. -#define swgl_commitLinearGradientRGBA8(sampler, address, size, repeat, offset) \ - do { \ - bool drawn = false; \ - if (blend_key) { \ - drawn = \ - commitLinearGradient<true>(sampler, address, size, repeat, offset, \ - swgl_OutRGBA8, swgl_SpanLength); \ - } else { \ - drawn = \ - commitLinearGradient<false>(sampler, address, size, repeat, offset, \ - swgl_OutRGBA8, swgl_SpanLength); \ - } \ - if (drawn) { \ - swgl_OutRGBA8 += swgl_SpanLength; \ - swgl_SpanLength = 0; \ - } \ - } while (0) - -template <bool CLAMP, typename V> -static ALWAYS_INLINE V fastSqrt(V v) { -#if USE_SSE2 || USE_NEON - // Clamp to avoid zero in inversesqrt. - return v * inversesqrt(CLAMP ? max(v, V(1.0e-10f)) : v); -#else - return sqrt(v); -#endif -} - -template <bool CLAMP, typename V> -static ALWAYS_INLINE auto fastLength(V v) { - return fastSqrt<CLAMP>(dot(v, v)); -} - -// Samples an entire span of a radial gradient by crawling the gradient table -// and looking for consecutive stops that can be merged into a single larger -// gradient, then interpolating between those larger gradients within the span -// based on the computed position relative to a radius. -template <bool BLEND> -static bool commitRadialGradient(sampler2D sampler, int address, float size, - bool repeat, vec2 pos, float radius, - uint32_t* buf, int span) { - assert(sampler->format == TextureFormat::RGBA32F); - assert(address >= 0 && address < int(sampler->height * sampler->stride)); - GradientStops* stops = (GradientStops*)&sampler->buf[address]; - // clang-format off - // Given position p, delta d, and radius r, we need to repeatedly solve the - // following quadratic for the pixel offset t: - // length(p + t*d) = r - // (px + t*dx)^2 + (py + t*dy)^2 = r^2 - // Rearranged into quadratic equation form (t^2*a + t*b + c = 0) this is: - // t^2*(dx^2+dy^2) + t*2*(dx*px+dy*py) + (px^2+py^2-r^2) = 0 - // t^2*d.d + t*2*d.p + (p.p-r^2) = 0 - // The solution of the quadratic formula t=(-b+-sqrt(b^2-4ac))/2a reduces to: - // t = -d.p/d.d +- sqrt((d.p/d.d)^2 - (p.p-r^2)/d.d) - // Note that d.p, d.d, p.p, and r^2 are constant across the gradient, and so - // we cache them below for faster computation. - // - // The quadratic has two solutions, representing the span intersecting the - // given radius of gradient, which can occur at two offsets. If there is only - // one solution (where b^2-4ac = 0), this represents the point at which the - // span runs tangent to the radius. This middle point is significant in that - // before it, we walk down the gradient ramp, and after it, we walk up the - // ramp. 
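Restating that quadratic directly as code, purely for illustration (the real loop below instead caches the dot products and accumulates them incrementally rather than solving per pixel):

#include <cmath>

struct Vec2f { float x, y; };
static inline float dot2(Vec2f a, Vec2f b) { return a.x * b.x + a.y * b.y; }

// Given span start position p, per-pixel delta d, and a radius r, writes the
// two pixel offsets (t0 <= t1) at which the span crosses that radius, or
// returns false if it never reaches it.
static bool radiusCrossings(Vec2f p, Vec2f d, float r, float* t0, float* t1) {
  float dd = dot2(d, d);
  if (dd <= 0.0f) return false;  // degenerate: position is invariant
  float mid = -dot2(d, p) / dd;  // offset of closest approach (the mid-point)
  float disc = mid * mid - (dot2(p, p) - r * r) / dd;
  if (disc < 0.0f) return false;  // span stays entirely inside or outside r
  float s = std::sqrt(disc);
  *t0 = mid - s;
  *t1 = mid + s;
  return true;
}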
- // clang-format on - vec2_scalar pos0 = {pos.x.x, pos.y.x}; - vec2_scalar delta = {pos.x.y - pos.x.x, pos.y.y - pos.y.x}; - float deltaDelta = dot(delta, delta); - if (!isfinite(deltaDelta) || !isfinite(radius)) { - return false; - } - float invDelta, middleT, middleB; - if (deltaDelta > 0) { - invDelta = 1.0f / deltaDelta; - middleT = -dot(delta, pos0) * invDelta; - middleB = middleT * middleT - dot(pos0, pos0) * invDelta; - } else { - // If position is invariant, just set the coefficients so the quadratic - // always reduces to the end of the span. - invDelta = 0.0f; - middleT = float(span); - middleB = 0.0f; - } - // We only want search for merged gradients up to the minimum of either the - // mid-point or the span length. Cache those offsets here as they don't vary - // in the inner loop. - Float middleEndRadius = fastLength<true>( - pos0 + delta * (Float){middleT, float(span), 0.0f, 0.0f}); - float middleRadius = span < middleT ? middleEndRadius.y : middleEndRadius.x; - float endRadius = middleEndRadius.y; - // Convert delta to change in position per chunk. - delta *= 4; - deltaDelta *= 4 * 4; - // clang-format off - // Given current position p and delta d, we reduce: - // length(p) = sqrt(dot(p,p)) = dot(p,p) * invsqrt(dot(p,p)) - // where dot(p+d,p+d) can be accumulated as: - // (x+dx)^2+(y+dy)^2 = (x^2+y^2) + 2(x*dx+y*dy) + (dx^2+dy^2) - // = p.p + 2p.d + d.d - // Since p increases by d every loop iteration, p.d increases by d.d, and thus - // we can accumulate d.d to calculate 2p.d, then allowing us to get the next - // dot-product by adding it to dot-product p.p of the prior iteration. This - // saves us some multiplications and an expensive sqrt inside the inner loop. - // clang-format on - Float dotPos = dot(pos, pos); - Float dotPosDelta = 2.0f * dot(pos, delta) + deltaDelta; - float deltaDelta2 = 2.0f * deltaDelta; - for (int t = 0; t < span;) { - // Compute the gradient table offset from the current position. - Float offset = fastSqrt<true>(dotPos) - radius; - float startRadius = radius; - // If repeat is desired, we need to limit the offset to a fractional value. - if (repeat) { - // The non-repeating radius at which the gradient table actually starts, - // radius + floor(offset) = radius + (offset - fract(offset)). - startRadius += offset.x; - offset = fract(offset); - startRadius -= offset.x; - } - // We need to find the min/max index in the table of the gradient we want to - // use as well as the intercept point where we leave this gradient. - float intercept = -1; - int minIndex = 0; - int maxIndex = int(1.0f + size); - if (offset.x < 0) { - // If inside the inner radius of the gradient table, then use the first - // stop. Set the intercept to advance forward to the start of the gradient - // table. - maxIndex = minIndex; - if (t >= middleT) { - intercept = radius; - } - } else if (offset.x < 1) { - // Otherwise, we're inside the valid part of the gradient table. - minIndex = int(1.0f + offset.x * size); - maxIndex = minIndex; - // Find the offset in the gradient that corresponds to the search limit. - // We only search up to the minimum of either the mid-point or the span - // length. Get the table index that corresponds to this offset, clamped so - // that we avoid hitting the beginning (0) or end (1 + size) of the table. - float searchOffset = - (t >= middleT ? endRadius : middleRadius) - startRadius; - int searchIndex = int(clamp(1.0f + size * searchOffset, 1.0f, size)); - // If we are past the mid-point, walk up the gradient table trying to - // merge stops. 
If we're below the mid-point, we need to walk down the - // table. We note the table index at which we need to look for an - // intercept to determine a valid span. - if (t >= middleT) { - while (maxIndex + 1 <= searchIndex && - stops[maxIndex].can_merge(stops[maxIndex + 1])) { - maxIndex++; - } - intercept = maxIndex + 1; - } else { - while (minIndex - 1 >= searchIndex && - stops[minIndex - 1].can_merge(stops[minIndex])) { - minIndex--; - } - intercept = minIndex; - } - // Convert from a table index into units of radius from the center of the - // gradient. - intercept = clamp((intercept - 1.0f) / size, 0.0f, 1.0f) + startRadius; - } else { - // If outside the outer radius of the gradient table, then use the last - // stop. Set the intercept to advance toward the valid part of the - // gradient table if going in, or just run to the end of the span if going - // away from the gradient. - minIndex = maxIndex; - if (t < middleT) { - intercept = radius + 1; - } - } - // Solve the quadratic for t to find where the merged gradient ends. If no - // intercept is found, just go to the middle or end of the span. - float endT = t >= middleT ? span : min(span, int(middleT)); - if (intercept >= 0) { - float b = middleB + intercept * intercept * invDelta; - if (b > 0) { - b = fastSqrt<false>(b); - endT = min(endT, t >= middleT ? middleT + b : middleT - b); - } - } - // Figure out how many chunks are actually inside the merged gradient. - if (t + 4.0f <= endT) { - int inside = int(endT - t) & ~3; - // Convert start and end colors to BGRA and scale to 0..255 range later. - auto minColorF = stops[minIndex].startColor.zyxw * 255.0f; - auto maxColorF = stops[maxIndex].end_color().zyxw * 255.0f; - // Compute the change in color per change in gradient offset. - auto deltaColorF = - (maxColorF - minColorF) * (size / (maxIndex + 1 - minIndex)); - // Subtract off the color difference of the beginning of the current span - // from the beginning of the gradient. - Float colorF = - minColorF - deltaColorF * (startRadius + (minIndex - 1) / size); - // Finally, walk over the span accumulating the position dot product and - // getting its sqrt as an offset into the color ramp. Since we're already - // in BGRA format and scaled to 255, we just need to round to an integer - // and pack down to pixel format. - for (auto* end = buf + inside; buf < end; buf += 4) { - Float offsetG = fastSqrt<false>(dotPos); - commit_blend_span<BLEND>( - buf, - combine( - packRGBA8(round_pixel(colorF + deltaColorF * offsetG.x, 1), - round_pixel(colorF + deltaColorF * offsetG.y, 1)), - packRGBA8(round_pixel(colorF + deltaColorF * offsetG.z, 1), - round_pixel(colorF + deltaColorF * offsetG.w, 1)))); - dotPos += dotPosDelta; - dotPosDelta += deltaDelta2; - } - // Advance past the portion of gradient we just processed. - t += inside; - // If we hit the end of the span, exit out now. - if (t >= span) { - break; - } - // Otherwise, we are most likely in a transitional section of the gradient - // between stops that will likely require doing per-sample table lookups. - // Rather than having to redo all the searching above to figure that out, - // just assume that to be the case and fall through below to doing the - // table lookups to hopefully avoid an iteration. - offset = fastSqrt<true>(dotPos) - radius; - if (repeat) { - offset = fract(offset); - } - } - // If we got here, that means we still have span left to process but did not - // have any whole chunks that fell within a merged gradient. 
Just fall back - // to doing a table lookup for each sample. - Float entry = clamp(offset * size + 1.0f, 0.0f, 1.0f + size); - commit_blend_span<BLEND>(buf, sampleGradient(sampler, address, entry)); - buf += 4; - t += 4; - dotPos += dotPosDelta; - dotPosDelta += deltaDelta2; - } - return true; -} - -// Commits an entire span of a radial gradient similar to -// swglcommitLinearGradient, but given a varying 2D position scaled to -// gradient-space and a radius at which the distance from the origin maps to the -// start of the gradient table. -#define swgl_commitRadialGradientRGBA8(sampler, address, size, repeat, pos, \ - radius) \ - do { \ - bool drawn = false; \ - if (blend_key) { \ - drawn = \ - commitRadialGradient<true>(sampler, address, size, repeat, pos, \ - radius, swgl_OutRGBA8, swgl_SpanLength); \ - } else { \ - drawn = \ - commitRadialGradient<false>(sampler, address, size, repeat, pos, \ - radius, swgl_OutRGBA8, swgl_SpanLength); \ - } \ - if (drawn) { \ - swgl_OutRGBA8 += swgl_SpanLength; \ - swgl_SpanLength = 0; \ - } \ - } while (0) - -// Extension to set a clip mask image to be sampled during blending. The offset -// specifies the positioning of the clip mask image relative to the viewport -// origin. The bounding box specifies the rectangle relative to the clip mask's -// origin that constrains sampling within the clip mask. Blending must be -// enabled for this to work. -static sampler2D swgl_ClipMask = nullptr; -static IntPoint swgl_ClipMaskOffset = {0, 0}; -static IntRect swgl_ClipMaskBounds = {0, 0, 0, 0}; -#define swgl_clipMask(mask, offset, bb_origin, bb_size) \ - do { \ - if (bb_size != vec2_scalar(0.0f, 0.0f)) { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_MASK; \ - swgl_ClipMask = mask; \ - swgl_ClipMaskOffset = make_ivec2(offset); \ - swgl_ClipMaskBounds = \ - IntRect(make_ivec2(bb_origin), make_ivec2(bb_size)); \ - } \ - } while (0) - -// Extension to enable anti-aliasing for the given edges of a quad. -// Blending must be enable for this to work. -static int swgl_AAEdgeMask = 0; - -static ALWAYS_INLINE int calcAAEdgeMask(bool on) { return on ? 0xF : 0; } -static ALWAYS_INLINE int calcAAEdgeMask(int mask) { return mask; } -static ALWAYS_INLINE int calcAAEdgeMask(bvec4_scalar mask) { - return (mask.x ? 1 : 0) | (mask.y ? 2 : 0) | (mask.z ? 4 : 0) | - (mask.w ? 8 : 0); -} - -#define swgl_antiAlias(edges) \ - do { \ - swgl_AAEdgeMask = calcAAEdgeMask(edges); \ - if (swgl_AAEdgeMask) { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_AA; \ - } \ - } while (0) - -#define swgl_blendDropShadow(color) \ - do { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ - swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_DROP_SHADOW); \ - swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \ - } while (0) - -#define swgl_blendSubpixelText(color) \ - do { \ - swgl_ClipFlags |= SWGL_CLIP_FLAG_BLEND_OVERRIDE; \ - swgl_BlendOverride = BLEND_KEY(SWGL_BLEND_SUBPIXEL_TEXT); \ - swgl_BlendColorRGBA8 = packColor<uint32_t>(color); \ - swgl_BlendAlphaRGBA8 = alphas(swgl_BlendColorRGBA8); \ - } while (0) - -// Dispatch helper used by the GLSL translator to swgl_drawSpan functions. -// The number of pixels committed is tracked by checking for the difference in -// swgl_SpanLength. Any varying interpolants used will be advanced past the -// committed part of the span in case the fragment shader must be executed for -// any remaining pixels that were not committed by the span shader. 
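// [Editor's sketch, not SWGL source] Minimal model of the dispatch pattern the
// macro below expands to: the span function consumes some prefix of the span,
// the caller measures how much was committed by the drop in the remaining
// length, and advances the interpolated inputs by that amount so the regular
// per-fragment path can finish whatever is left.
#include <cstdio>

struct FakeShader {
  int swgl_SpanLength = 13;
  int interpStep = 0;
  void drawSpan() { swgl_SpanLength -= swgl_SpanLength & ~3; }  // whole chunks
  void step_interp_inputs(int n) { interpStep += n; }
};

static int dispatchDrawSpan(FakeShader* self) {
  int total = self->swgl_SpanLength;
  self->drawSpan();
  int drawn = total - self->swgl_SpanLength;
  if (drawn) self->step_interp_inputs(drawn);
  return drawn;
}

int main() {
  FakeShader shader;
  printf("drawn=%d remaining=%d\n", dispatchDrawSpan(&shader),
         shader.swgl_SpanLength);
  return 0;
}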
-#define DISPATCH_DRAW_SPAN(self, format) \ - do { \ - int total = self->swgl_SpanLength; \ - self->swgl_drawSpan##format(); \ - int drawn = total - self->swgl_SpanLength; \ - if (drawn) self->step_interp_inputs(drawn); \ - return drawn; \ - } while (0) diff --git a/third_party/webrender/swgl/src/swgl_fns.rs b/third_party/webrender/swgl/src/swgl_fns.rs index fdb55058afe..0cb60c6d4c8 100644 --- a/third_party/webrender/swgl/src/swgl_fns.rs +++ b/third_party/webrender/swgl/src/swgl_fns.rs @@ -14,12 +14,8 @@ macro_rules! debug { ($($x:tt)*) => {}; } -#[repr(C)] -struct LockedTexture { - _private: [u8; 0], -} +extern "C" {} -#[allow(dead_code)] extern "C" { fn ActiveTexture(texture: GLenum); fn BindTexture(target: GLenum, texture: GLuint); @@ -65,7 +61,19 @@ extern "C" { level: GLint, ); fn CheckFramebufferStatus(target: GLenum) -> GLenum; - fn InvalidateFramebuffer(target: GLenum, num_attachments: GLsizei, attachments: *const GLenum); + fn InvalidateFramebuffer( + target: GLenum, + num_attachments: GLsizei, + attachments: *const GLenum, + ); + fn TexStorage3D( + target: GLenum, + levels: GLint, + internal_format: GLenum, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + ); fn TexImage2D( target: GLenum, level: GLint, @@ -77,6 +85,18 @@ extern "C" { ty: GLenum, data: *const c_void, ); + fn TexImage3D( + target: GLenum, + level: GLint, + internal_format: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + border: GLint, + format: GLenum, + ty: GLenum, + data: *const c_void, + ); fn TexSubImage2D( target: GLenum, level: GLint, @@ -88,6 +108,19 @@ extern "C" { ty: GLenum, data: *const c_void, ); + fn TexSubImage3D( + target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + width: GLsizei, + height: GLsizei, + depth: GLsizei, + format: GLenum, + ty: GLenum, + data: *const c_void, + ); fn GenerateMipmap(target: GLenum); fn GetUniformLocation(program: GLuint, name: *const GLchar) -> GLint; fn BindAttribLocation(program: GLuint, index: GLuint, name: *const GLchar); @@ -119,19 +152,26 @@ extern "C" { transpose: GLboolean, value: *const GLfloat, ); + fn DrawElementsInstanced( mode: GLenum, count: GLsizei, type_: GLenum, - indices: GLintptr, + indices: *const c_void, instancecount: GLsizei, ); fn EnableVertexAttribArray(index: GLuint); fn VertexAttribDivisor(index: GLuint, divisor: GLuint); fn LinkProgram(program: GLuint); - fn GetLinkStatus(program: GLuint) -> GLint; fn UseProgram(program: GLuint); fn SetViewport(x: GLint, y: GLint, width: GLsizei, height: GLsizei); + fn FramebufferTextureLayer( + target: GLenum, + attachment: GLenum, + texture: GLuint, + level: GLint, + layer: GLint, + ); fn FramebufferRenderbuffer( target: GLenum, attachment: GLenum, @@ -145,31 +185,6 @@ extern "C" { fn ClearColor(r: GLfloat, g: GLfloat, b: GLfloat, a: GLfloat); fn ClearDepth(depth: GLdouble); fn Clear(mask: GLbitfield); - fn ClearTexSubImage( - target: GLenum, - level: GLint, - xoffset: GLint, - yoffset: GLint, - zoffset: GLint, - width: GLsizei, - height: GLsizei, - depth: GLsizei, - format: GLenum, - ty: GLenum, - data: *const c_void, - ); - fn ClearTexImage(target: GLenum, level: GLint, format: GLenum, ty: GLenum, data: *const c_void); - fn ClearColorRect( - fbo: GLuint, - xoffset: GLint, - yoffset: GLint, - width: GLsizei, - height: GLsizei, - r: GLfloat, - g: GLfloat, - b: GLfloat, - a: GLfloat, - ); fn PixelStorei(name: GLenum, param: GLint); fn ReadPixels( x: GLint, @@ -210,6 +225,17 @@ extern "C" { width: GLsizei, height: GLsizei, ); + fn CopyTexSubImage3D( + 
target: GLenum, + level: GLint, + xoffset: GLint, + yoffset: GLint, + zoffset: GLint, + x: GLint, + y: GLint, + width: GLsizei, + height: GLsizei, + ); fn BlitFramebuffer( src_x0: GLint, src_y0: GLint, @@ -227,33 +253,22 @@ extern "C" { fn GetString(name: GLenum) -> *const c_char; fn GetStringi(name: GLenum, index: GLuint) -> *const c_char; fn GetError() -> GLenum; - fn InitDefaultFramebuffer( - x: i32, - y: i32, - width: i32, - height: i32, - stride: i32, - buf: *mut c_void, - ); + fn InitDefaultFramebuffer(width: i32, height: i32); fn GetColorBuffer( fbo: GLuint, flush: GLboolean, width: *mut i32, height: *mut i32, - stride: *mut i32, ) -> *mut c_void; - fn ResolveFramebuffer(fbo: GLuint); fn SetTextureBuffer( tex: GLuint, internal_format: GLenum, width: GLsizei, height: GLsizei, - stride: GLsizei, buf: *mut c_void, min_width: GLsizei, min_height: GLsizei, ); - fn SetTextureParameter(tex: GLuint, pname: GLenum, param: GLint); fn DeleteTexture(n: GLuint); fn DeleteRenderbuffer(n: GLuint); fn DeleteFramebuffer(n: GLuint); @@ -262,64 +277,23 @@ extern "C" { fn DeleteQuery(n: GLuint); fn DeleteShader(shader: GLuint); fn DeleteProgram(program: GLuint); - fn LockFramebuffer(fbo: GLuint) -> *mut LockedTexture; - fn LockTexture(tex: GLuint) -> *mut LockedTexture; - fn LockResource(resource: *mut LockedTexture); - fn UnlockResource(resource: *mut LockedTexture); - fn GetResourceBuffer( - resource: *mut LockedTexture, - width: *mut i32, - height: *mut i32, - stride: *mut i32, - ) -> *mut c_void; fn Composite( - locked_dst: *mut LockedTexture, - locked_src: *mut LockedTexture, + src_id: GLuint, src_x: GLint, src_y: GLint, src_width: GLsizei, src_height: GLsizei, dst_x: GLint, dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, opaque: GLboolean, flip: GLboolean, - filter: GLenum, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, - ); - fn CompositeYUV( - locked_dst: *mut LockedTexture, - locked_y: *mut LockedTexture, - locked_u: *mut LockedTexture, - locked_v: *mut LockedTexture, - color_space: YUVColorSpace, - color_depth: GLuint, - src_x: GLint, - src_y: GLint, - src_width: GLsizei, - src_height: GLsizei, - dst_x: GLint, - dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, - flip: GLboolean, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, ); fn CreateContext() -> *mut c_void; - fn ReferenceContext(ctx: *mut c_void); fn DestroyContext(ctx: *mut c_void); fn MakeCurrent(ctx: *mut c_void); - fn ReportMemory(size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize) -> usize; } -#[derive(Clone, Copy)] +#[derive(Clone)] pub struct Context(*mut c_void); impl Context { @@ -327,12 +301,6 @@ impl Context { Context(unsafe { CreateContext() }) } - pub fn reference(&self) { - unsafe { - ReferenceContext(self.0); - } - } - pub fn destroy(&self) { unsafe { DestroyContext(self.0); @@ -345,56 +313,18 @@ impl Context { } } - pub fn init_default_framebuffer( - &self, - x: i32, - y: i32, - width: i32, - height: i32, - stride: i32, - buf: *mut c_void, - ) { + pub fn init_default_framebuffer(&self, width: i32, height: i32) { unsafe { - InitDefaultFramebuffer(x, y, width, height, stride, buf); + InitDefaultFramebuffer(width, height); } } - pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32, i32) { + pub fn get_color_buffer(&self, fbo: GLuint, flush: bool) -> (*mut c_void, i32, i32) { unsafe { let mut width: i32 = 0; let mut height: i32 = 0; - let mut stride: i32 = 0; - let data_ptr = GetColorBuffer( - fbo, 
- flush as GLboolean, - &mut width, - &mut height, - &mut stride, - ); - (data_ptr, width, height, stride) - } - } - - pub fn resolve_framebuffer(&self, fbo: GLuint) { - unsafe { - ResolveFramebuffer(fbo); - } - } - - pub fn clear_color_rect( - &self, - fbo: GLuint, - xoffset: GLint, - yoffset: GLint, - width: GLsizei, - height: GLsizei, - r: f32, - g: f32, - b: f32, - a: f32, - ) { - unsafe { - ClearColorRect(fbo, xoffset, yoffset, width, height, r, g, b, a); + let data_ptr = GetColorBuffer(fbo, flush as GLboolean, &mut width, &mut height); + (data_ptr, width, height) } } @@ -404,7 +334,6 @@ impl Context { internal_format: GLenum, width: GLsizei, height: GLsizei, - stride: GLsizei, buf: *mut c_void, min_width: GLsizei, min_height: GLsizei, @@ -415,7 +344,6 @@ impl Context { internal_format, width, height, - stride, buf, min_width, min_height, @@ -423,37 +351,32 @@ impl Context { } } - pub fn set_texture_parameter(&self, tex: GLuint, pname: GLenum, param: GLint) { - unsafe { - SetTextureParameter(tex, pname, param); - } - } - - pub fn lock_framebuffer(&self, fbo: GLuint) -> Option<LockedResource> { - unsafe { - let resource = LockFramebuffer(fbo); - if resource != ptr::null_mut() { - Some(LockedResource(resource)) - } else { - None - } - } - } - - pub fn lock_texture(&self, tex: GLuint) -> Option<LockedResource> { + pub fn composite( + &self, + src_id: GLuint, + src_x: GLint, + src_y: GLint, + src_width: GLsizei, + src_height: GLint, + dst_x: GLint, + dst_y: GLint, + opaque: bool, + flip: bool, + ) { unsafe { - let resource = LockTexture(tex); - if resource != ptr::null_mut() { - Some(LockedResource(resource)) - } else { - None - } + Composite( + src_id, + src_x, + src_y, + src_width, + src_height, + dst_x, + dst_y, + opaque as GLboolean, + flip as GLboolean, + ); } } - - pub fn report_memory(size_of_op: unsafe extern "C" fn(ptr: *const c_void) -> usize) -> usize { - unsafe { ReportMemory(size_of_op) } - } } impl From<*mut c_void> for Context { @@ -488,7 +411,6 @@ fn calculate_length(width: GLsizei, height: GLsizei, format: GLenum, pixel_type: UNSIGNED_SHORT => 2, SHORT => 2, FLOAT => 4, - UNSIGNED_INT_8_8_8_8_REV => 1, _ => panic!("unsupported pixel_type for read_pixels: {:?}", pixel_type), }; @@ -563,8 +485,8 @@ impl Gl for Context { let u = str::from_utf8(s).unwrap(); const PREFIX: &'static str = "// shader: "; if let Some(start) = u.find(PREFIX) { - if let Some(end) = u[start..].find('\n') { - let name = u[start + PREFIX.len()..start + end].trim(); + if let Some(end) = u[start ..].find('\n') { + let name = u[start + PREFIX.len() .. start + end].trim(); debug!("shader name: {}", name); unsafe { let c_string = CString::new(name).unwrap(); @@ -1033,6 +955,7 @@ impl Gl for Context { panic!(); } + // FIXME: Does not verify buffer size -- unsafe! 
fn tex_image_3d( &self, target: GLenum, @@ -1046,7 +969,24 @@ impl Gl for Context { ty: GLenum, opt_data: Option<&[u8]>, ) { - panic!(); + unsafe { + let pdata = match opt_data { + Some(data) => data.as_ptr() as *const GLvoid, + None => ptr::null(), + }; + TexImage3D( + target, + level, + internal_format, + width, + height, + depth, + border, + format, + ty, + pdata, + ); + } } fn copy_tex_image_2d( @@ -1091,7 +1031,11 @@ impl Gl for Context { width: GLsizei, height: GLsizei, ) { - panic!(); + unsafe { + CopyTexSubImage3D( + target, level, xoffset, yoffset, zoffset, x, y, width, height, + ); + } } fn tex_sub_image_2d( @@ -1173,7 +1117,22 @@ impl Gl for Context { data: &[u8], ) { debug!("tex_sub_image_3d"); - panic!(); + //panic!(); + unsafe { + TexSubImage3D( + target, + level, + xoffset, + yoffset, + zoffset, + width, + height, + depth, + format, + ty, + data.as_ptr() as *const c_void, + ); + } } fn tex_sub_image_3d_pbo( @@ -1190,7 +1149,21 @@ impl Gl for Context { ty: GLenum, offset: usize, ) { - panic!(); + unsafe { + TexSubImage3D( + target, + level, + xoffset, + yoffset, + zoffset, + width, + height, + depth, + format, + ty, + offset as *const c_void, + ); + } } fn tex_storage_2d( @@ -1216,7 +1189,10 @@ impl Gl for Context { height: GLsizei, depth: GLsizei, ) { - panic!(); + //panic!(); + unsafe { + TexStorage3D(target, levels, internal_format, width, height, depth); + } } fn get_tex_image_into_buffer( @@ -1376,7 +1352,10 @@ impl Gl for Context { "framebuffer_texture_layer {} {} {} {} {}", target, attachment, texture, level, layer ); - panic!(); + //panic!(); + unsafe { + FramebufferTextureLayer(target, attachment, texture, level, layer); + } } fn blit_framebuffer( @@ -1498,9 +1477,7 @@ impl Gl for Context { } fn draw_arrays(&self, mode: GLenum, first: GLint, count: GLsizei) { - unsafe { - DrawElementsInstanced(mode, count, NONE, first as GLintptr, 1); - } + panic!(); } fn draw_arrays_instanced( @@ -1510,9 +1487,7 @@ impl Gl for Context { count: GLsizei, primcount: GLsizei, ) { - unsafe { - DrawElementsInstanced(mode, count, NONE, first as GLintptr, primcount); - } + panic!(); } fn draw_elements( @@ -1528,7 +1503,13 @@ impl Gl for Context { ); //panic!(); unsafe { - DrawElementsInstanced(mode, count, element_type, indices_offset as GLintptr, 1); + DrawElementsInstanced( + mode, + count, + element_type, + indices_offset as *const c_void, + 1, + ); } } @@ -1550,7 +1531,7 @@ impl Gl for Context { mode, count, element_type, - indices_offset as GLintptr, + indices_offset as *const c_void, primcount, ); } @@ -1843,8 +1824,8 @@ impl Gl for Context { } fn get_program_info_log(&self, program: GLuint) -> String { - debug!("get_program_info_log {}", program); - String::new() + panic!(); + //String::new() } #[inline] @@ -1854,7 +1835,7 @@ impl Gl for Context { assert!(!result.is_empty()); //#define GL_LINK_STATUS 0x8B82 if pname == 0x8b82 { - result[0] = GetLinkStatus(program); + result[0] = 1; } } @@ -2118,7 +2099,7 @@ impl Gl for Context { //ptr::null() } - fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) -> GLenum { + fn client_wait_sync(&self, sync: GLsync, flags: GLbitfield, timeout: GLuint64) { panic!(); } @@ -2191,7 +2172,7 @@ impl Gl for Context { // GL_KHR_blend_equation_advanced fn blend_barrier_khr(&self) { - // No barrier required, so nothing to do + panic!(); } // GL_CHROMIUM_copy_texture @@ -2269,158 +2250,4 @@ impl Gl for Context { ) { unimplemented!("Not supported by SWGL"); } - - fn buffer_storage( - &self, - target: GLenum, - size: GLsizeiptr, - 
data: *const GLvoid, - flags: GLbitfield, - ) { - unimplemented!("Not supported by SWGL"); - } - - fn flush_mapped_buffer_range(&self, target: GLenum, offset: GLintptr, length: GLsizeiptr) { - unimplemented!("Not supported by SWGL"); - } -} - -/// A resource that is intended for sharing between threads. -/// Locked resources such as textures or framebuffers will -/// not allow any further modifications while it remains -/// locked. The resource will be unlocked when LockedResource -/// is dropped. -pub struct LockedResource(*mut LockedTexture); - -unsafe impl Send for LockedResource {} -unsafe impl Sync for LockedResource {} - -#[repr(C)] -pub enum YUVColorSpace { - Rec601 = 0, - Rec709, - Rec2020, - Identity, -} - -impl LockedResource { - /// Composites from a locked resource to another locked resource. The band - /// offset and height are relative to the destination rectangle and specify - /// how to clip the composition into appropriate range for this band. - pub fn composite( - &self, - locked_src: &LockedResource, - src_x: GLint, - src_y: GLint, - src_width: GLsizei, - src_height: GLsizei, - dst_x: GLint, - dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, - opaque: bool, - flip: bool, - filter: GLenum, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, - ) { - unsafe { - Composite( - self.0, - locked_src.0, - src_x, - src_y, - src_width, - src_height, - dst_x, - dst_y, - dst_width, - dst_height, - opaque as GLboolean, - flip as GLboolean, - filter, - clip_x, - clip_y, - clip_width, - clip_height, - ); - } - } - - /// Composites from locked resources representing YUV planes - pub fn composite_yuv( - &self, - locked_y: &LockedResource, - locked_u: &LockedResource, - locked_v: &LockedResource, - color_space: YUVColorSpace, - color_depth: GLuint, - src_x: GLint, - src_y: GLint, - src_width: GLsizei, - src_height: GLsizei, - dst_x: GLint, - dst_y: GLint, - dst_width: GLsizei, - dst_height: GLsizei, - flip: bool, - clip_x: GLint, - clip_y: GLint, - clip_width: GLsizei, - clip_height: GLsizei, - ) { - unsafe { - CompositeYUV( - self.0, - locked_y.0, - locked_u.0, - locked_v.0, - color_space, - color_depth, - src_x, - src_y, - src_width, - src_height, - dst_x, - dst_y, - dst_width, - dst_height, - flip as GLboolean, - clip_x, - clip_y, - clip_width, - clip_height, - ); - } - } - - /// Get the underlying buffer for a locked resource - pub fn get_buffer(&self) -> (*mut c_void, i32, i32, i32) { - unsafe { - let mut width: i32 = 0; - let mut height: i32 = 0; - let mut stride: i32 = 0; - let data_ptr = GetResourceBuffer(self.0, &mut width, &mut height, &mut stride); - (data_ptr, width, height, stride) - } - } -} - -impl Clone for LockedResource { - fn clone(&self) -> Self { - unsafe { - LockResource(self.0); - } - LockedResource(self.0) - } -} - -impl Drop for LockedResource { - fn drop(&mut self) { - unsafe { - UnlockResource(self.0); - } - } } diff --git a/third_party/webrender/swgl/src/texture.h b/third_party/webrender/swgl/src/texture.h index fdace241eb5..0219d078bcf 100644 --- a/third_party/webrender/swgl/src/texture.h +++ b/third_party/webrender/swgl/src/texture.h @@ -2,884 +2,19 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ -namespace glsl { - -using PackedRGBA8 = V16<uint8_t>; -using WideRGBA8 = V16<uint16_t>; -using HalfRGBA8 = V8<uint16_t>; - -SI WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } - -template <int N> -UNUSED SI VectorType<uint8_t, N> genericPackWide(VectorType<uint16_t, N> p) { - typedef VectorType<uint8_t, N> packed_type; - // Generic conversions only mask off the low byte without actually clamping - // like a real pack. First force the word to all 1s if it overflows, and then - // add on the sign bit to cause it to roll over to 0 if it was negative. - p = (p | (p > 255)) + (p >> 15); - return CONVERT(p, packed_type); -} - -SI PackedRGBA8 pack(WideRGBA8 p) { -#if USE_SSE2 - return _mm_packus_epi16(lowHalf(p), highHalf(p)); -#elif USE_NEON - return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p))); -#else - return genericPackWide(p); -#endif -} - -using PackedR8 = V4<uint8_t>; -using WideR8 = V4<uint16_t>; - -SI WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } - -SI PackedR8 pack(WideR8 p) { -#if USE_SSE2 - auto m = expand(p); - auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m)); - return SHUFFLE(r, r, 0, 1, 2, 3); -#elif USE_NEON - return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p)))); -#else - return genericPackWide(p); -#endif -} - -using PackedRG8 = V8<uint8_t>; -using WideRG8 = V8<uint16_t>; - -SI PackedRG8 pack(WideRG8 p) { -#if USE_SSE2 - return lowHalf(bit_cast<V16<uint8_t>>(_mm_packus_epi16(p, p))); -#elif USE_NEON - return bit_cast<V8<uint8_t>>(vqmovn_u16(p)); -#else - return genericPackWide(p); -#endif -} - -SI I32 clampCoord(I32 coord, int limit, int base = 0) { -#if USE_SSE2 - return _mm_min_epi16(_mm_max_epi16(coord, _mm_set1_epi32(base)), - _mm_set1_epi32(limit - 1)); -#else - return clamp(coord, base, limit - 1); -#endif -} - -SI int clampCoord(int coord, int limit, int base = 0) { - return min(max(coord, base), limit - 1); -} - -template <typename T, typename S> -SI T clamp2D(T P, S sampler) { - return T{clampCoord(P.x, sampler->width), clampCoord(P.y, sampler->height)}; -} - -SI float to_float(uint32_t x) { return x * (1.f / 255.f); } - -SI vec4 pixel_to_vec4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - U32 pixels = {a, b, c, d}; - return vec4(cast((pixels >> 16) & 0xFF), cast((pixels >> 8) & 0xFF), - cast(pixels & 0xFF), cast(pixels >> 24)) * - (1.0f / 255.0f); -} - -SI vec4 pixel_float_to_vec4(Float a, Float b, Float c, Float d) { - return vec4(Float{a.x, b.x, c.x, d.x}, Float{a.y, b.y, c.y, d.y}, - Float{a.z, b.z, c.z, d.z}, Float{a.w, b.w, c.w, d.w}); -} - -SI ivec4 pixel_int_to_ivec4(I32 a, I32 b, I32 c, I32 d) { - return ivec4(I32{a.x, b.x, c.x, d.x}, I32{a.y, b.y, c.y, d.y}, - I32{a.z, b.z, c.z, d.z}, I32{a.w, b.w, c.w, d.w}); -} - -SI vec4_scalar pixel_to_vec4(uint32_t p) { - U32 i = {(p >> 16) & 0xFF, (p >> 8) & 0xFF, p & 0xFF, p >> 24}; - Float f = cast(i) * (1.0f / 255.0f); - return vec4_scalar(f.x, f.y, f.z, f.w); -} - -template <typename S> -SI vec4 fetchOffsetsRGBA8(S sampler, I32 offset) { - return pixel_to_vec4(sampler->buf[offset.x], sampler->buf[offset.y], - sampler->buf[offset.z], sampler->buf[offset.w]); -} - -template <typename S> -vec4 texelFetchRGBA8(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return fetchOffsetsRGBA8(sampler, offset); -} - -template <typename S> -SI Float fetchOffsetsR8(S sampler, I32 offset) { - U32 i = { - ((uint8_t*)sampler->buf)[offset.x], ((uint8_t*)sampler->buf)[offset.y], - ((uint8_t*)sampler->buf)[offset.z], ((uint8_t*)sampler->buf)[offset.w]}; - 
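// [Editor's sketch, not SWGL source] Scalar illustration of the branchless
// clamp used by genericPackWide above: lanes that overflow 255 are forced to
// all ones, and lanes whose sign bit is set (negative intermediates) roll over
// to zero, so keeping only the low byte behaves like a saturating pack.
#include <cassert>
#include <cstdint>

static uint8_t packSat(uint16_t p) {
  uint16_t mask = (p > 255) ? 0xFFFF : 0;  // vector compare yields all ones
  p = uint16_t((p | mask) + (p >> 15));    // overflow -> 0xFFFF, negative -> 0
  return uint8_t(p);                       // keep the low byte
}

int main() {
  assert(packSat(0) == 0);
  assert(packSat(200) == 200);
  assert(packSat(300) == 255);                  // clamped high
  assert(packSat(uint16_t(int16_t(-5))) == 0);  // clamped low
  return 0;
}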
return cast(i) * (1.0f / 255.0f); -} - -template <typename S> -vec4 texelFetchR8(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return vec4(fetchOffsetsR8(sampler, offset), 0.0f, 0.0f, 1.0f); -} - -template <typename S> -SI vec4 fetchOffsetsRG8(S sampler, I32 offset) { - uint16_t* buf = (uint16_t*)sampler->buf; - U16 pixels = {buf[offset.x], buf[offset.y], buf[offset.z], buf[offset.w]}; - Float r = CONVERT(pixels & 0xFF, Float) * (1.0f / 255.0f); - Float g = CONVERT(pixels >> 8, Float) * (1.0f / 255.0f); - return vec4(r, g, 0.0f, 1.0f); -} - -template <typename S> -vec4 texelFetchRG8(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return fetchOffsetsRG8(sampler, offset); -} - template <typename S> -SI Float fetchOffsetsR16(S sampler, I32 offset) { - U32 i = { - ((uint16_t*)sampler->buf)[offset.x], ((uint16_t*)sampler->buf)[offset.y], - ((uint16_t*)sampler->buf)[offset.z], ((uint16_t*)sampler->buf)[offset.w]}; - return cast(i) * (1.0f / 65535.0f); -} - -template <typename S> -vec4 texelFetchR16(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return vec4(fetchOffsetsR16(sampler, offset), 0.0f, 0.0f, 1.0f); -} - -template <typename S> -SI vec4 fetchOffsetsFloat(S sampler, I32 offset) { - return pixel_float_to_vec4( - *(Float*)&sampler->buf[offset.x], *(Float*)&sampler->buf[offset.y], - *(Float*)&sampler->buf[offset.z], *(Float*)&sampler->buf[offset.w]); -} - -vec4 texelFetchFloat(sampler2D sampler, ivec2 P) { - I32 offset = P.x * 4 + P.y * sampler->stride; - return fetchOffsetsFloat(sampler, offset); -} - -template <typename S> -SI vec4 fetchOffsetsYUV422(S sampler, I32 offset) { - // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. - // Offset is aligned to a chunk rather than a pixel, and selector specifies - // pixel within the chunk. 
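// [Editor's sketch, not SWGL source] Scalar illustration of the YUV422 chunk
// decode used by the fetches below: each 4-byte chunk holds two pixels packed
// as G0, B, G1, R, so a pixel index splits into a chunk offset plus a selector
// choosing which green sample to read; B and R are shared by both pixels.
#include <cstdint>
#include <cstdio>

int main() {
  // Bytes in memory are G0, B, G1, R; loaded little-endian that is 0xRRG1BBG0.
  uint32_t chunk = 0xD0C0B0A0u;  // G0=0xA0, B=0xB0, G1=0xC0, R=0xD0
  for (int pixel = 0; pixel < 2; ++pixel) {
    int selector = pixel & 1;                               // pixel in chunk
    uint8_t b = (chunk >> 8) & 0xFF;                        // shared B
    uint8_t r = (chunk >> 24) & 0xFF;                       // shared R
    uint8_t g = (selector ? (chunk >> 16) : chunk) & 0xFF;  // G1 or G0
    printf("pixel %d: g=%02X b=%02X r=%02X\n", pixel, unsigned(g), unsigned(b),
           unsigned(r));
  }
  return 0;
}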
- I32 selector = offset & 1; - offset &= ~1; - uint16_t* buf = (uint16_t*)sampler->buf; - U32 pixels = {*(uint32_t*)&buf[offset.x], *(uint32_t*)&buf[offset.y], - *(uint32_t*)&buf[offset.z], *(uint32_t*)&buf[offset.w]}; - Float b = CONVERT((pixels >> 8) & 0xFF, Float) * (1.0f / 255.0f); - Float r = CONVERT((pixels >> 24), Float) * (1.0f / 255.0f); - Float g = - CONVERT(if_then_else(-selector, pixels >> 16, pixels) & 0xFF, Float) * - (1.0f / 255.0f); - return vec4(r, g, b, 1.0f); -} - -template <typename S> -vec4 texelFetchYUV422(S sampler, ivec2 P) { - I32 offset = P.x + P.y * sampler->stride; - return fetchOffsetsYUV422(sampler, offset); -} - -vec4 texelFetch(sampler2D sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - switch (sampler->format) { - case TextureFormat::RGBA32F: - return texelFetchFloat(sampler, P); - case TextureFormat::RGBA8: - return texelFetchRGBA8(sampler, P); - case TextureFormat::R8: - return texelFetchR8(sampler, P); - case TextureFormat::RG8: - return texelFetchRG8(sampler, P); - case TextureFormat::R16: - return texelFetchR16(sampler, P); - case TextureFormat::YUV422: - return texelFetchYUV422(sampler, P); - default: - assert(false); - return vec4(); - } -} - -vec4 texelFetch(sampler2DRGBA32F sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32F); - return texelFetchFloat(sampler, P); -} - -vec4 texelFetch(sampler2DRGBA8 sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); +static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i, int zoffset) { assert(sampler->format == TextureFormat::RGBA8); - return texelFetchRGBA8(sampler, P); -} - -vec4 texelFetch(sampler2DR8 sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::R8); - return texelFetchR8(sampler, P); -} - -vec4 texelFetch(sampler2DRG8 sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RG8); - return texelFetchRG8(sampler, P); -} - -vec4_scalar texelFetch(sampler2D sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - if (sampler->format == TextureFormat::RGBA32F) { - return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; - } else { - assert(sampler->format == TextureFormat::RGBA8); - return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); - } -} - -vec4_scalar texelFetch(sampler2DRGBA32F sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32F); - return *(vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -vec4_scalar texelFetch(sampler2DRGBA8 sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA8); - return pixel_to_vec4(sampler->buf[P.x + P.y * sampler->stride]); -} - -vec4_scalar texelFetch(sampler2DR8 sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::R8); - return vec4_scalar{ - to_float(((uint8_t*)sampler->buf)[P.x + P.y * sampler->stride]), 0.0f, - 0.0f, 1.0f}; -} - -vec4_scalar texelFetch(sampler2DRG8 sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RG8); - uint16_t pixel = ((uint16_t*)sampler->buf)[P.x + P.y * sampler->stride]; - return vec4_scalar{to_float(pixel & 0xFF), 
to_float(pixel >> 8), 0.0f, 1.0f}; -} - -vec4 texelFetch(sampler2DRect sampler, ivec2 P) { - P = clamp2D(P, sampler); - switch (sampler->format) { - case TextureFormat::RGBA8: - return texelFetchRGBA8(sampler, P); - case TextureFormat::R8: - return texelFetchR8(sampler, P); - case TextureFormat::RG8: - return texelFetchRG8(sampler, P); - case TextureFormat::R16: - return texelFetchR16(sampler, P); - case TextureFormat::YUV422: - return texelFetchYUV422(sampler, P); - default: - assert(false); - return vec4(); - } -} - -template <typename S> -SI ivec4 fetchOffsetsInt(S sampler, I32 offset) { - return pixel_int_to_ivec4( - *(I32*)&sampler->buf[offset.x], *(I32*)&sampler->buf[offset.y], - *(I32*)&sampler->buf[offset.z], *(I32*)&sampler->buf[offset.w]); -} - -ivec4 texelFetch(isampler2D sampler, ivec2 P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32I); - I32 offset = P.x * 4 + P.y * sampler->stride; - return fetchOffsetsInt(sampler, offset); -} - -ivec4_scalar texelFetch(isampler2D sampler, ivec2_scalar P, int lod) { - assert(lod == 0); - P = clamp2D(P, sampler); - assert(sampler->format == TextureFormat::RGBA32I); - return *(ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -SI vec4_scalar* texelFetchPtr(sampler2D sampler, ivec2_scalar P, int min_x, - int max_x, int min_y, int max_y) { - P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); - P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); - assert(sampler->format == TextureFormat::RGBA32F); - return (vec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -SI ivec4_scalar* texelFetchPtr(isampler2D sampler, ivec2_scalar P, int min_x, - int max_x, int min_y, int max_y) { - P.x = min(max(P.x, -min_x), int(sampler->width) - 1 - max_x); - P.y = min(max(P.y, -min_y), int(sampler->height) - 1 - max_y); - assert(sampler->format == TextureFormat::RGBA32I); - return (ivec4_scalar*)&sampler->buf[P.x * 4 + P.y * sampler->stride]; -} - -template <typename S> -SI I32 texelFetchPtr(S sampler, ivec2 P, int min_x, int max_x, int min_y, - int max_y) { - P.x = clampCoord(P.x, int(sampler->width) - max_x, -min_x); - P.y = clampCoord(P.y, int(sampler->height) - max_y, -min_y); - return P.x * 4 + P.y * sampler->stride; -} - -template <typename S, typename P> -SI P texelFetchUnchecked(S sampler, P* ptr, int x, int y = 0) { - return ptr[x + y * (sampler->stride >> 2)]; -} - -SI vec4 texelFetchUnchecked(sampler2D sampler, I32 offset, int x, int y = 0) { - assert(sampler->format == TextureFormat::RGBA32F); - return fetchOffsetsFloat(sampler, offset + (x * 4 + y * sampler->stride)); -} - -SI ivec4 texelFetchUnchecked(isampler2D sampler, I32 offset, int x, int y = 0) { - assert(sampler->format == TextureFormat::RGBA32I); - return fetchOffsetsInt(sampler, offset + (x * 4 + y * sampler->stride)); -} - -#define texelFetchOffset(sampler, P, lod, offset) \ - texelFetch(sampler, (P) + (offset), lod) - -// Scale texture coords for quantization, subtract offset for filtering -// (assuming coords already offset to texel centers), and round to nearest -// 1/scale increment -template <typename T> -SI T linearQuantize(T P, float scale) { - return P * scale + (0.5f - 0.5f * scale); -} - -// Helper version that also scales normalized texture coords for sampler -template <typename T, typename S> -SI T samplerScale(S sampler, T P) { - P.x *= sampler->width; - P.y *= sampler->height; - return P; -} - -template <typename T> -SI T samplerScale(UNUSED sampler2DRect sampler, T P) 
{ - return P; -} - -template <typename T, typename S> -SI T linearQuantize(T P, float scale, S sampler) { - return linearQuantize(samplerScale(sampler, P), scale); -} - -// Compute clamped offset of first row for linear interpolation -template <typename S, typename I> -SI auto computeRow(S sampler, I i, size_t margin = 1) -> decltype(i.x) { - return clampCoord(i.x, sampler->width - margin) + - clampCoord(i.y, sampler->height) * sampler->stride; -} - -// Compute clamped offset of second row for linear interpolation from first row -template <typename S, typename I> -SI auto computeNextRowOffset(S sampler, I i) -> decltype(i.x) { - return if_then_else(i.y >= 0 && i.y < int32_t(sampler->height) - 1, - sampler->stride, 0); -} - -// Convert X coordinate to a 2^7 scale fraction for interpolation -template <typename S> -SI I16 computeFracX(S sampler, ivec2 i, ivec2 frac) { - auto overread = i.x > int32_t(sampler->width) - 2; - return CONVERT((((frac.x & (i.x >= 0)) | overread) & 0x7F) - overread, I16); -} - -// Convert Y coordinate to a 2^7 scale fraction for interpolation -SI I16 computeFracNoClamp(I32 frac) { return CONVERT(frac & 0x7F, I16); } -SI I16 computeFracY(ivec2 frac) { return computeFracNoClamp(frac.y); } - -struct WidePlanarRGBA8 { - V8<uint16_t> rg; - V8<uint16_t> ba; -}; - -template <typename S> -SI WidePlanarRGBA8 textureLinearPlanarRGBA8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RGBA8); - - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); - - auto a0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>); - auto a1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.x]), V8<int16_t>); - a0 += ((a1 - a0) * fracy.x) >> 7; - - auto b0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.y]), V8<int16_t>); - auto b1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.y]), V8<int16_t>); - b0 += ((b1 - b0) * fracy.y) >> 7; - - auto abl = zipLow(a0, b0); - auto abh = zipHigh(a0, b0); - abl += ((abh - abl) * fracx.xyxyxyxy) >> 7; - - auto c0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.z]), V8<int16_t>); - auto c1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.z]), V8<int16_t>); - c0 += ((c1 - c0) * fracy.z) >> 7; - - auto d0 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.w]), V8<int16_t>); - auto d1 = - CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row1.w]), V8<int16_t>); - d0 += ((d1 - d0) * fracy.w) >> 7; - - auto cdl = zipLow(c0, d0); - auto cdh = zipHigh(c0, d0); - cdl += ((cdh - cdl) * fracx.zwzwzwzw) >> 7; - - auto rg = V8<uint16_t>(zip2Low(abl, cdl)); - auto ba = V8<uint16_t>(zip2High(abl, cdl)); - return WidePlanarRGBA8{rg, ba}; -} - -template <typename S> -vec4 textureLinearRGBA8(S sampler, vec2 P) { - ivec2 i(linearQuantize(P, 128, sampler)); - auto planar = textureLinearPlanarRGBA8(sampler, i); - auto rg = CONVERT(planar.rg, V8<float>); - auto ba = CONVERT(planar.ba, V8<float>); - auto r = lowHalf(rg); - auto g = highHalf(rg); - auto b = lowHalf(ba); - auto a = highHalf(ba); - return vec4(b, g, r, a) * (1.0f / 255.0f); -} - -template <typename S> -static inline U16 textureLinearUnpackedR8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::R8); - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - 
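// [Editor's sketch, not SWGL source] Scalar illustration of the 7-bit
// fixed-point bilinear blend these samplers build on: coordinates are
// quantized to 1/128ths of a texel, the low 7 bits become the interpolation
// fraction, and each lerp is a + (((b - a) * frac) >> 7).
#include <cstdio>

static int lerp7(int a, int b, int frac) {  // frac in [0, 127]
  return a + (((b - a) * frac) >> 7);
}

int main() {
  int coord = (5 << 7) + 96;  // texel 5 plus 96/128 (0.75) of a texel
  int i = coord >> 7;         // integer texel index
  int frac = coord & 0x7F;    // 7-bit fraction

  // Blend neighbouring 8-bit samples horizontally on two rows, then blend
  // the rows vertically with a fraction of 64 (0.5).
  int row0 = lerp7(/*texel i*/ 10, /*texel i+1*/ 200, frac);
  int row1 = lerp7(100, 220, frac);
  int result = lerp7(row0, row1, 64);
  printf("texel=%d frac=%d bilinear=%d\n", i, frac, result);
  return 0;
}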
I16 fracy = computeFracY(frac); - - uint8_t* buf = (uint8_t*)sampler->buf; - auto a0 = unaligned_load<V2<uint8_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V2<uint8_t>>(&buf[row0.y]); - auto c0 = unaligned_load<V2<uint8_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V2<uint8_t>>(&buf[row0.w]); - auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>); - - auto a1 = unaligned_load<V2<uint8_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V2<uint8_t>>(&buf[row1.y]); - auto c1 = unaligned_load<V2<uint8_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V2<uint8_t>>(&buf[row1.w]); - auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>); - - abcd0 += ((abcd1 - abcd0) * fracy.xxyyzzww) >> 7; - - abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); - auto abcdl = lowHalf(abcd0); - auto abcdh = highHalf(abcd0); - abcdl += ((abcdh - abcdl) * fracx) >> 7; - - return U16(abcdl); -} - -template <typename S> -vec4 textureLinearR8(S sampler, vec2 P) { - assert(sampler->format == TextureFormat::R8); - - ivec2 i(linearQuantize(P, 128, sampler)); - Float r = CONVERT(textureLinearUnpackedR8(sampler, i), Float); - return vec4(r * (1.0f / 255.0f), 0.0f, 0.0f, 1.0f); -} - -struct WidePlanarRG8 { - V8<uint16_t> rg; -}; - -template <typename S> -SI WidePlanarRG8 textureLinearPlanarRG8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RG8); - - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); - - uint16_t* buf = (uint16_t*)sampler->buf; - - // Load RG bytes for two adjacent pixels - rgRG - auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]); - auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>); - // Load two pixels for next row - auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]); - auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>); - // Blend rows - ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7; - - auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]); - auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>); - auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]); - auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>); - // Blend rows - cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7; - - // ab = a.rgRG,b.rgRG - // cd = c.rgRG,d.rgRG - // ... ac = ar,cr,ag,cg,aR,cR,aG,cG - // ... bd = br,dr,bg,dg,bR,dR,bG,dG - auto ac = zipLow(ab0, cd0); - auto bd = zipHigh(ab0, cd0); - // ar,br,cr,dr,ag,bg,cg,dg - // aR,bR,cR,dR,aG,bG,cG,dG - auto abcdl = zipLow(ac, bd); - auto abcdh = zipHigh(ac, bd); - // Blend columns - abcdl += ((abcdh - abcdl) * fracx.xyzwxyzw) >> 7; - - auto rg = V8<uint16_t>(abcdl); - return WidePlanarRG8{rg}; -} - -template <typename S> -vec4 textureLinearRG8(S sampler, vec2 P) { - ivec2 i(linearQuantize(P, 128, sampler)); - auto planar = textureLinearPlanarRG8(sampler, i); - auto rg = CONVERT(planar.rg, V8<float>) * (1.0f / 255.0f); - auto r = lowHalf(rg); - auto g = highHalf(rg); - return vec4(r, g, 0.0f, 1.0f); -} - -// Samples R16 texture with linear filtering and returns results packed as -// signed I16. One bit of precision is shifted away from the bottom end to -// accommodate the sign bit, so only 15 bits of precision is left. 
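// [Editor's sketch, not SWGL source] Scalar illustration of the signed 15-bit
// blend used by the R16 path below: samples and fractions are kept as Q15
// values, the high half of the 32-bit product (>> 16) gives a Q14 result, and
// one doubling shift restores Q15. This mirrors the SSE2 path
// (_mm_mulhi_epi16 plus a left shift); the NEON vqrdmulhq_s16 instruction
// folds the doubling into the multiply.
#include <cstdint>
#include <cstdio>

static int16_t lerpQ15(int16_t a, int16_t b, int16_t fracQ15) {
  int32_t diff = int32_t(b) - int32_t(a);
  int32_t hi = (diff * fracQ15) >> 16;  // high half of the product, Q14
  return int16_t(a + (hi << 1));        // doubled back to Q15
}

int main() {
  // 0x4000 is 0.5 in Q15, so blending 0 toward 0x7FFE lands at 0x3FFE.
  printf("0x%04X\n", unsigned(uint16_t(lerpQ15(0, 0x7FFE, 0x4000))));
  return 0;
}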
-template <typename S> -static inline I16 textureLinearUnpackedR16(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::R16); - - ivec2 frac = i; + ivec2 frac = i & 0x7F; i >>= 7; - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - + I32 row0 = clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset; + I32 row1 = row0 + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) & + I32(sampler->stride)); I16 fracx = - CONVERT( - ((frac.x & (i.x >= 0)) | (i.x > int32_t(sampler->width) - 2)) & 0x7F, - I16) - << 8; - I16 fracy = computeFracY(frac) << 8; - - // Sample the 16 bit data for both rows - uint16_t* buf = (uint16_t*)sampler->buf; - auto a0 = unaligned_load<V2<uint16_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V2<uint16_t>>(&buf[row0.y]); - auto c0 = unaligned_load<V2<uint16_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V2<uint16_t>>(&buf[row0.w]); - auto abcd0 = CONVERT(combine(a0, b0, c0, d0) >> 1, V8<int16_t>); - - auto a1 = unaligned_load<V2<uint16_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V2<uint16_t>>(&buf[row1.y]); - auto c1 = unaligned_load<V2<uint16_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V2<uint16_t>>(&buf[row1.w]); - auto abcd1 = CONVERT(combine(a1, b1, c1, d1) >> 1, V8<int16_t>); - - // The samples occupy 15 bits and the fraction occupies 15 bits, so that when - // they are multiplied together, the new scaled sample will fit in the high - // 14 bits of the result. It is left shifted once to make it 15 bits again - // for the final multiply. -#if USE_SSE2 - abcd0 += bit_cast<V8<int16_t>>(_mm_mulhi_epi16(abcd1 - abcd0, fracy.xxyyzzww)) - << 1; -#elif USE_NEON - // NEON has a convenient instruction that does both the multiply and the - // doubling, so doesn't need an extra shift. 
- abcd0 += bit_cast<V8<int16_t>>(vqrdmulhq_s16(abcd1 - abcd0, fracy.xxyyzzww)); -#else - abcd0 += CONVERT((CONVERT(abcd1 - abcd0, V8<int32_t>) * - CONVERT(fracy.xxyyzzww, V8<int32_t>)) >> - 16, - V8<int16_t>) - << 1; -#endif - - abcd0 = SHUFFLE(abcd0, abcd0, 0, 2, 4, 6, 1, 3, 5, 7); - auto abcdl = lowHalf(abcd0); - auto abcdh = highHalf(abcd0); -#if USE_SSE2 - abcdl += lowHalf(bit_cast<V8<int16_t>>( - _mm_mulhi_epi16(expand(abcdh - abcdl), expand(fracx)))) - << 1; -#elif USE_NEON - abcdl += bit_cast<V4<int16_t>>(vqrdmulh_s16(abcdh - abcdl, fracx)); -#else - abcdl += CONVERT((CONVERT(abcdh - abcdl, V4<int32_t>) * - CONVERT(fracx, V4<int32_t>)) >> - 16, - V4<int16_t>) - << 1; -#endif - - return abcdl; -} - -template <typename S> -vec4 textureLinearR16(S sampler, vec2 P) { - assert(sampler->format == TextureFormat::R16); - - ivec2 i(linearQuantize(P, 128, sampler)); - Float r = CONVERT(textureLinearUnpackedR16(sampler, i), Float); - return vec4(r * (1.0f / 32767.0f), 0.0f, 0.0f, 1.0f); -} - -using PackedRGBA32F = V16<float>; -using WideRGBA32F = V16<float>; - -template <typename S> -vec4 textureLinearRGBA32F(S sampler, vec2 P) { - assert(sampler->format == TextureFormat::RGBA32F); - P = samplerScale(sampler, P); - P -= 0.5f; - vec2 f = floor(P); - vec2 r = P - f; - ivec2 i(f); - ivec2 c(clampCoord(i.x, sampler->width - 1), - clampCoord(i.y, sampler->height)); - r.x = if_then_else(i.x >= 0, if_then_else(i.x < sampler->width - 1, r.x, 1.0), - 0.0f); - I32 offset0 = c.x * 4 + c.y * sampler->stride; - I32 offset1 = offset0 + computeNextRowOffset(sampler, i); - - Float c0 = mix(mix(*(Float*)&sampler->buf[offset0.x], - *(Float*)&sampler->buf[offset0.x + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.x], - *(Float*)&sampler->buf[offset1.x + 4], r.x), - r.y); - Float c1 = mix(mix(*(Float*)&sampler->buf[offset0.y], - *(Float*)&sampler->buf[offset0.y + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.y], - *(Float*)&sampler->buf[offset1.y + 4], r.x), - r.y); - Float c2 = mix(mix(*(Float*)&sampler->buf[offset0.z], - *(Float*)&sampler->buf[offset0.z + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.z], - *(Float*)&sampler->buf[offset1.z + 4], r.x), - r.y); - Float c3 = mix(mix(*(Float*)&sampler->buf[offset0.w], - *(Float*)&sampler->buf[offset0.w + 4], r.x), - mix(*(Float*)&sampler->buf[offset1.w], - *(Float*)&sampler->buf[offset1.w + 4], r.x), - r.y); - return pixel_float_to_vec4(c0, c1, c2, c3); -} - -struct WidePlanarYUV8 { - U16 y; - U16 u; - U16 v; -}; - -template <typename S> -SI WidePlanarYUV8 textureLinearPlanarYUV422(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::YUV422); - - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i, 2); - // Layout is 2 pixel chunks (occupying 4 bytes) organized as: G0, B, G1, R. - // Get the selector for the pixel within the chunk. - I32 selector = row0 & 1; - // Align the row index to the chunk. - row0 &= ~1; - I32 row1 = row0 + computeNextRowOffset(sampler, i); - // G only needs to be clamped to a pixel boundary for safe interpolation, - // whereas the BR fraction needs to be clamped 1 extra pixel inside to a chunk - // boundary. - frac.x &= (i.x >= 0); - auto fracx = - CONVERT(combine(frac.x | (i.x > int32_t(sampler->width) - 3), - (frac.x >> 1) | (i.x > int32_t(sampler->width) - 3)) & - 0x7F, - V8<int16_t>); - I16 fracy = computeFracY(frac); - - uint16_t* buf = (uint16_t*)sampler->buf; - - // Load bytes for two adjacent chunks - g0,b,g1,r,G0,B,G1,R - // We always need to interpolate between (b,r) and (B,R). 
- // Depending on selector we need to either interpolate between g0 and g1 - // or between g1 and G0. So for now we just interpolate both cases for g - // and will select the appropriate one on output. - auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.x]), V8<int16_t>); - auto a1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.x]), V8<int16_t>); - // Combine with next row. - a0 += ((a1 - a0) * fracy.x) >> 7; - - auto b0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.y]), V8<int16_t>); - auto b1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.y]), V8<int16_t>); - b0 += ((b1 - b0) * fracy.y) >> 7; - - auto c0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.z]), V8<int16_t>); - auto c1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.z]), V8<int16_t>); - c0 += ((c1 - c0) * fracy.z) >> 7; - - auto d0 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row0.w]), V8<int16_t>); - auto d1 = CONVERT(unaligned_load<V8<uint8_t>>(&buf[row1.w]), V8<int16_t>); - d0 += ((d1 - d0) * fracy.w) >> 7; - - // Shuffle things around so we end up with g0,g0,g0,g0,b,b,b,b and - // g1,g1,g1,g1,r,r,r,r. - auto abl = zipLow(a0, b0); - auto cdl = zipLow(c0, d0); - auto g0b = zip2Low(abl, cdl); - auto g1r = zip2High(abl, cdl); - - // Need to zip g1,B,G0,R. Instead of using a bunch of complicated masks and - // and shifts, just shuffle here instead... We finally end up with - // g1,g1,g1,g1,B,B,B,B and G0,G0,G0,G0,R,R,R,R. - auto abh = SHUFFLE(a0, b0, 2, 10, 5, 13, 4, 12, 7, 15); - auto cdh = SHUFFLE(c0, d0, 2, 10, 5, 13, 4, 12, 7, 15); - auto g1B = zip2Low(abh, cdh); - auto G0R = zip2High(abh, cdh); - - // Finally interpolate between adjacent columns. - g0b += ((g1B - g0b) * fracx) >> 7; - g1r += ((G0R - g1r) * fracx) >> 7; - - // Choose either g0 or g1 based on selector. - return WidePlanarYUV8{ - U16(if_then_else(CONVERT(-selector, I16), lowHalf(g1r), lowHalf(g0b))), - U16(highHalf(g0b)), U16(highHalf(g1r))}; -} - -template <typename S> -vec4 textureLinearYUV422(S sampler, vec2 P) { - ivec2 i(linearQuantize(P, 128, sampler)); - auto planar = textureLinearPlanarYUV422(sampler, i); - auto y = CONVERT(planar.y, Float) * (1.0f / 255.0f); - auto u = CONVERT(planar.u, Float) * (1.0f / 255.0f); - auto v = CONVERT(planar.v, Float) * (1.0f / 255.0f); - return vec4(v, y, u, 1.0f); -} - -SI vec4 texture(sampler2D sampler, vec2 P) { - if (sampler->filter == TextureFilter::LINEAR) { - switch (sampler->format) { - case TextureFormat::RGBA32F: - return textureLinearRGBA32F(sampler, P); - case TextureFormat::RGBA8: - return textureLinearRGBA8(sampler, P); - case TextureFormat::R8: - return textureLinearR8(sampler, P); - case TextureFormat::RG8: - return textureLinearRG8(sampler, P); - case TextureFormat::R16: - return textureLinearR16(sampler, P); - case TextureFormat::YUV422: - return textureLinearYUV422(sampler, P); - default: - assert(false); - return vec4(); - } - } else { - ivec2 coord(roundzero(P.x, sampler->width), - roundzero(P.y, sampler->height)); - return texelFetch(sampler, coord, 0); - } -} - -vec4 texture(sampler2DRect sampler, vec2 P) { - if (sampler->filter == TextureFilter::LINEAR) { - switch (sampler->format) { - case TextureFormat::RGBA8: - return textureLinearRGBA8(sampler, P); - case TextureFormat::R8: - return textureLinearR8(sampler, P); - case TextureFormat::RG8: - return textureLinearRG8(sampler, P); - case TextureFormat::R16: - return textureLinearR16(sampler, P); - case TextureFormat::YUV422: - return textureLinearYUV422(sampler, P); - default: - assert(false); - return vec4(); - } - } else { - ivec2 
coord(roundzero(P.x, 1.0f), roundzero(P.y, 1.0f)); - return texelFetch(sampler, coord); - } -} - -template <typename S> -vec4_scalar texture(S sampler, vec2_scalar P) { - return force_scalar(texture(sampler, vec2(P))); -} - -ivec2_scalar textureSize(sampler2D sampler, int) { - return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; -} - -ivec2_scalar textureSize(sampler2DRect sampler) { - return ivec2_scalar{int32_t(sampler->width), int32_t(sampler->height)}; -} - -template <typename S> -static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RGBA8); - ivec2 frac = i; - i >>= 7; - - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); + CONVERT(frac.x & (i.x >= 0 && i.x < int32_t(sampler->width) - 1), I16); + I16 fracy = CONVERT(frac.y, I16); auto a0 = CONVERT(unaligned_load<V8<uint8_t>>(&sampler->buf[row0.x]), V8<int16_t>); @@ -913,233 +48,80 @@ static WideRGBA8 textureLinearUnpackedRGBA8(S sampler, ivec2 i) { auto cdh = combine(highHalf(c0), highHalf(d0)); cdl += ((cdh - cdl) * fracx.zzzzwwww) >> 7; - return combine(HalfRGBA8(abl), HalfRGBA8(cdl)); + return pack(combine(HalfRGBA8(abl), HalfRGBA8(cdl))); } template <typename S> -static PackedRGBA8 textureLinearPackedRGBA8(S sampler, ivec2 i) { - return pack(textureLinearUnpackedRGBA8(sampler, i)); +static inline void textureLinearCommit4(S sampler, ivec2 i, int zoffset, + uint32_t* buf) { + commit_span(buf, textureLinearPackedRGBA8(sampler, i, zoffset)); } template <typename S> -static PackedRGBA8 textureNearestPackedRGBA8(S sampler, ivec2 i) { +static void textureLinearCommit8(S sampler, ivec2_scalar i, int zoffset, + uint32_t* buf) { assert(sampler->format == TextureFormat::RGBA8); - I32 row = computeRow(sampler, i, 0); - return combine(unaligned_load<V4<uint8_t>>(&sampler->buf[row.x]), - unaligned_load<V4<uint8_t>>(&sampler->buf[row.y]), - unaligned_load<V4<uint8_t>>(&sampler->buf[row.z]), - unaligned_load<V4<uint8_t>>(&sampler->buf[row.w])); -} - -template <typename S> -static PackedR8 textureLinearPackedR8(S sampler, ivec2 i) { - return pack(textureLinearUnpackedR8(sampler, i)); -} - -template <typename S> -static WideRG8 textureLinearUnpackedRG8(S sampler, ivec2 i) { - assert(sampler->format == TextureFormat::RG8); - ivec2 frac = i & 0x7F; + ivec2_scalar frac = i & 0x7F; i >>= 7; - I32 row0 = computeRow(sampler, i); - I32 row1 = row0 + computeNextRowOffset(sampler, i); - I16 fracx = computeFracX(sampler, i, frac); - I16 fracy = computeFracY(frac); - - uint16_t* buf = (uint16_t*)sampler->buf; - - // Load RG bytes for two adjacent pixels - rgRG - auto a0 = unaligned_load<V4<uint8_t>>(&buf[row0.x]); - auto b0 = unaligned_load<V4<uint8_t>>(&buf[row0.y]); - auto ab0 = CONVERT(combine(a0, b0), V8<int16_t>); - // Load two pixels for next row - auto a1 = unaligned_load<V4<uint8_t>>(&buf[row1.x]); - auto b1 = unaligned_load<V4<uint8_t>>(&buf[row1.y]); - auto ab1 = CONVERT(combine(a1, b1), V8<int16_t>); - // Blend rows - ab0 += ((ab1 - ab0) * fracy.xxxxyyyy) >> 7; - - auto c0 = unaligned_load<V4<uint8_t>>(&buf[row0.z]); - auto d0 = unaligned_load<V4<uint8_t>>(&buf[row0.w]); - auto cd0 = CONVERT(combine(c0, d0), V8<int16_t>); - auto c1 = unaligned_load<V4<uint8_t>>(&buf[row1.z]); - auto d1 = unaligned_load<V4<uint8_t>>(&buf[row1.w]); - auto cd1 = CONVERT(combine(c1, d1), V8<int16_t>); - // Blend rows - cd0 += ((cd1 - cd0) * fracy.zzzzwwww) >> 7; - - // ab = 
a.rgRG,b.rgRG - // cd = c.rgRG,d.rgRG - // ... ac = a.rg,c.rg,a.RG,c.RG - // ... bd = b.rg,d.rg,b.RG,d.RG - auto ac = zip2Low(ab0, cd0); - auto bd = zip2High(ab0, cd0); - // a.rg,b.rg,c.rg,d.rg - // a.RG,b.RG,c.RG,d.RG - auto abcdl = zip2Low(ac, bd); - auto abcdh = zip2High(ac, bd); - // Blend columns - abcdl += ((abcdh - abcdl) * fracx.xxyyzzww) >> 7; - - return WideRG8(abcdl); -} - -template <typename S> -static PackedRG8 textureLinearPackedRG8(S sampler, ivec2 i) { - return pack(textureLinearUnpackedRG8(sampler, i)); -} - -template <int N> -static ALWAYS_INLINE VectorType<uint16_t, N> addsat(VectorType<uint16_t, N> x, - VectorType<uint16_t, N> y) { - auto r = x + y; - return r | (r < x); -} - -template <typename P, typename S> -static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurHorizontal( - S sampler, const ivec2_scalar& i, int minX, int maxX, int radius, - float coeff, float coeffStep) { - // Packed and unpacked vectors for a chunk of the given pixel type. - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - - // Pre-scale the coefficient by 8 bits of fractional precision, so that when - // the sample is multiplied by it, it will yield a 16 bit unsigned integer - // that will use all 16 bits of precision to accumulate the sum. - coeff *= 1 << 8; - float coeffStep2 = coeffStep * coeffStep; - - int row = computeRow(sampler, i); - P* buf = (P*)sampler->buf; - auto pixelsRight = unaligned_load<V4<P>>(&buf[row]); - auto pixelsLeft = pixelsRight; - auto sum = CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) * - uint16_t(coeff + 0.5f); - - // Here we use some trickery to reuse the pixels within a chunk, shifted over - // by one pixel, to get the next sample for the entire chunk. This allows us - // to sample only one pixel for each offset across the entire chunk in both - // the left and right directions. To avoid clamping within the loop to the - // texture bounds, we compute the valid radius that doesn't require clamping - // and fall back to a slower clamping loop outside of that valid radius. - int offset = 1; - // The left bound is how much we can offset the sample before the start of - // the row bounds. - int leftBound = i.x - max(minX, 0); - // The right bound is how much we can offset the sample before the end of the - // row bounds. - int rightBound = min(maxX, sampler->width - 1) - i.x; - int validRadius = min(radius, min(leftBound, rightBound - (4 - 1))); - for (; offset <= validRadius; offset++) { - // Overwrite the pixel that needs to be shifted out with the new pixel, and - // shift it into the correct location. - pixelsRight.x = unaligned_load<P>(&buf[row + offset + 4 - 1]); - pixelsRight = pixelsRight.yzwx; - pixelsLeft = pixelsLeft.wxyz; - pixelsLeft.x = unaligned_load<P>(&buf[row - offset]); - - // Accumulate the Gaussian coefficients step-wise. - coeff *= coeffStep; - coeffStep *= coeffStep2; - - // Both left and right samples at this offset use the same coefficient. 
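// [Editor's sketch, not SWGL source] Standalone illustration of the step-wise
// Gaussian coefficient accumulation used in these blur loops: the weights
// exp(-x^2 / (2*sigma^2)) at successive integer offsets can be generated with
// two running multipliers, because the ratio between adjacent weights itself
// decays by the constant factor exp(-1 / sigma^2).
#include <cassert>
#include <cmath>

int main() {
  const float sigma = 3.0f;
  float coeff = 1.0f;                                   // weight at offset 0
  float coeffStep = std::exp(-0.5f / (sigma * sigma));  // ratio g(1)/g(0)
  const float coeffStep2 = coeffStep * coeffStep;       // constant ratio decay

  for (int offset = 1; offset <= 10; ++offset) {
    coeff *= coeffStep;       // now equals exp(-offset^2 / (2*sigma^2))
    coeffStep *= coeffStep2;  // ratio toward the next offset
    float direct = std::exp(-(offset * offset) / (2.0f * sigma * sigma));
    assert(std::fabs(coeff - direct) < 1e-4f);
  }
  return 0;
}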
- sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) * - uint16_t(coeff + 0.5f)); - } - - for (; offset <= radius; offset++) { - pixelsRight.x = - unaligned_load<P>(&buf[row + min(offset + 4 - 1, rightBound)]); - pixelsRight = pixelsRight.yzwx; - pixelsLeft = pixelsLeft.wxyz; - pixelsLeft.x = unaligned_load<P>(&buf[row - min(offset, leftBound)]); - - coeff *= coeffStep; - coeffStep *= coeffStep2; - - sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsRight), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsLeft), unpacked_type)) * - uint16_t(coeff + 0.5f)); - } - - // Shift away the intermediate precision. - return sum >> 8; -} - -template <typename P, typename S> -static VectorType<uint16_t, 4 * sizeof(P)> gaussianBlurVertical( - S sampler, const ivec2_scalar& i, int minY, int maxY, int radius, - float coeff, float coeffStep) { - // Packed and unpacked vectors for a chunk of the given pixel type. - typedef VectorType<uint8_t, 4 * sizeof(P)> packed_type; - typedef VectorType<uint16_t, 4 * sizeof(P)> unpacked_type; - - // Pre-scale the coefficient by 8 bits of fractional precision, so that when - // the sample is multiplied by it, it will yield a 16 bit unsigned integer - // that will use all 16 bits of precision to accumulate the sum. - coeff *= 1 << 8; - float coeffStep2 = coeffStep * coeffStep; - - int rowAbove = computeRow(sampler, i); - int rowBelow = rowAbove; - P* buf = (P*)sampler->buf; - auto pixels = unaligned_load<V4<P>>(&buf[rowAbove]); - auto sum = CONVERT(bit_cast<packed_type>(pixels), unpacked_type) * - uint16_t(coeff + 0.5f); - - // For the vertical loop we can't be quite as creative with reusing old values - // as we were in the horizontal loop. We just do the obvious implementation of - // loading a chunk from each row in turn and accumulating it into the sum. We - // compute a valid radius within which we don't need to clamp the sampled row - // and use that to avoid any clamping in the main inner loop. We fall back to - // a slower clamping loop outside of that valid radius. - int offset = 1; - int belowBound = i.y - max(minY, 0); - int aboveBound = min(maxY, sampler->height - 1) - i.y; - int validRadius = min(radius, min(belowBound, aboveBound)); - for (; offset <= validRadius; offset++) { - rowAbove += sampler->stride; - rowBelow -= sampler->stride; - auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]); - auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]); - - // Accumulate the Gaussian coefficients step-wise. - coeff *= coeffStep; - coeffStep *= coeffStep2; - - // Both above and below samples at this offset use the same coefficient. - sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) * - uint16_t(coeff + 0.5f)); + uint32_t* row0 = + &sampler + ->buf[clampCoord(i.x, sampler->width) + + clampCoord(i.y, sampler->height) * sampler->stride + zoffset]; + uint32_t* row1 = + row0 + + ((i.y >= 0 && i.y < int32_t(sampler->height) - 1) ? sampler->stride : 0); + int16_t fracx = i.x >= 0 && i.x < int32_t(sampler->width) - 1 ? 
frac.x : 0; + int16_t fracy = frac.y; + + U32 pix0 = unaligned_load<U32>(row0); + U32 pix0n = unaligned_load<U32>(row0 + 4); + uint32_t pix0x = row0[8]; + U32 pix1 = unaligned_load<U32>(row1); + U32 pix1n = unaligned_load<U32>(row1 + 4); + uint32_t pix1x = row1[8]; + + { + auto ab0 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0, pix0, 0, 1, 1, 2)), + V16<int16_t>); + auto ab1 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1, pix1, 0, 1, 1, 2)), + V16<int16_t>); + ab0 += ((ab1 - ab0) * fracy) >> 7; + + auto cd0 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0, pix0n, 2, 3, 3, 4)), + V16<int16_t>); + auto cd1 = CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1, pix1n, 2, 3, 3, 4)), + V16<int16_t>); + cd0 += ((cd1 - cd0) * fracy) >> 7; + + auto abcdl = combine(lowHalf(ab0), lowHalf(cd0)); + auto abcdh = combine(highHalf(ab0), highHalf(cd0)); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + commit_span(buf, pack(WideRGBA8(abcdl))); } - for (; offset <= radius; offset++) { - if (offset <= aboveBound) { - rowAbove += sampler->stride; - } - if (offset <= belowBound) { - rowBelow -= sampler->stride; - } - auto pixelsAbove = unaligned_load<V4<P>>(&buf[rowAbove]); - auto pixelsBelow = unaligned_load<V4<P>>(&buf[rowBelow]); - - coeff *= coeffStep; - coeffStep *= coeffStep2; - - sum = addsat(sum, - (CONVERT(bit_cast<packed_type>(pixelsAbove), unpacked_type) + - CONVERT(bit_cast<packed_type>(pixelsBelow), unpacked_type)) * - uint16_t(coeff + 0.5f)); + { + auto ab0 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0n, pix0n, 0, 1, 1, 2)), + V16<int16_t>); + auto ab1 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1n, pix1n, 0, 1, 1, 2)), + V16<int16_t>); + ab0 += ((ab1 - ab0) * fracy) >> 7; + + auto cd0 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix0n, U32(pix0x), 2, 3, 3, 4)), + V16<int16_t>); + auto cd1 = + CONVERT(bit_cast<V16<uint8_t>>(SHUFFLE(pix1n, U32(pix1x), 2, 3, 3, 4)), + V16<int16_t>); + cd0 += ((cd1 - cd0) * fracy) >> 7; + + auto abcdl = combine(lowHalf(ab0), lowHalf(cd0)); + auto abcdh = combine(highHalf(ab0), highHalf(cd0)); + abcdl += ((abcdh - abcdl) * fracx) >> 7; + + commit_span(buf + 4, pack(WideRGBA8(abcdl))); } - - // Shift away the intermediate precision. 
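The added span code in this hunk filters in 16-bit lanes with what appears to be a 7-bit fixed-point fraction: a + (((b - a) * frac) >> 7) interpolates between a and b, applied first vertically with fracy across the two rows and then horizontally with fracx across the columns. Below is a scalar sketch of that lerp on one channel of a 2x2 neighborhood; the helper name and sample values are illustrative, not taken from the diff.

// Illustrative sketch only; not part of the SWGL sources.
#include <cstdint>
#include <cstdio>

// Fixed-point lerp in the form used above: frac is treated as having 7
// fractional bits, so frac == 0 yields a and frac == 128 would yield b.
static int16_t lerp7(int16_t a, int16_t b, int16_t frac) {
  return int16_t(a + (((b - a) * frac) >> 7));
}

int main() {
  // One channel of a 2x2 pixel neighborhood.
  int16_t p00 = 10, p10 = 90;      // top-left, top-right
  int16_t p01 = 40, p11 = 200;     // bottom-left, bottom-right
  int16_t fracx = 32, fracy = 96;  // 32/128 = 0.25 and 96/128 = 0.75 (assumed)

  int16_t left = lerp7(p00, p01, fracy);       // blend rows, left column
  int16_t right = lerp7(p10, p11, fracy);      // blend rows, right column
  int16_t result = lerp7(left, right, fracx);  // then blend the columns
  std::printf("left=%d right=%d result=%d\n", left, right, result);
  return 0;
}

The vectorized version above does the same per lane across a whole chunk of pixels, and it forces the edge fraction to zero and reuses row0 for row1 at the texture border so the last column and row never read out of bounds.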
- return sum >> 8; } - -} // namespace glsl diff --git a/third_party/webrender/swgl/src/vector_type.h b/third_party/webrender/swgl/src/vector_type.h index 43364ffcce2..8ec5876c340 100644 --- a/third_party/webrender/swgl/src/vector_type.h +++ b/third_party/webrender/swgl/src/vector_type.h @@ -39,16 +39,6 @@ SI VectorType<T, 16> combine(VectorType<T, 8> a, VectorType<T, 8> b) { } template <typename T> -SI VectorType<T, 2> lowHalf(VectorType<T, 4> a) { - return __builtin_shufflevector(a, a, 0, 1); -} - -template <typename T> -SI VectorType<T, 2> highHalf(VectorType<T, 4> a) { - return __builtin_shufflevector(a, a, 2, 3); -} - -template <typename T> SI VectorType<T, 4> lowHalf(VectorType<T, 8> a) { return __builtin_shufflevector(a, a, 0, 1, 2, 3); } @@ -114,7 +104,7 @@ struct VectorType { }; }; - VectorType() : data{0} {} + VectorType() : data{0} { } constexpr VectorType(const VectorType& rhs) : data(rhs.data) {} // GCC vector extensions only support broadcasting scalars on arithmetic ops, @@ -315,27 +305,10 @@ struct VectorType { return VectorType<T, N * 2>::wrap(data, high.data); } -# define xxxx swizzle(0, 0, 0, 0) -# define yyyy swizzle(1, 1, 1, 1) -# define zzzz swizzle(2, 2, 2, 2) -# define wwww swizzle(3, 3, 3, 3) -# define xxyy swizzle(0, 0, 1, 1) -# define xxzz swizzle(0, 0, 2, 2) -# define yyww swizzle(1, 1, 3, 3) -# define zzww swizzle(2, 2, 3, 3) # define xyxy swizzle(0, 1, 0, 1) -# define xzxz swizzle(0, 2, 0, 2) -# define ywyw swizzle(1, 3, 1, 3) # define zwzw swizzle(2, 3, 2, 3) -# define zwxy swizzle(2, 3, 0, 1) # define zyxw swizzle(2, 1, 0, 3) -# define xxyz swizzle(0, 0, 1, 2) -# define xyyz swizzle(0, 1, 1, 2) # define xyzz swizzle(0, 1, 2, 2) -# define xzyw swizzle(0, 2, 1, 3) -# define yzwx swizzle(1, 2, 3, 0) -# define wxyz swizzle(3, 0, 1, 2) -# define wzyx swizzle(3, 2, 1, 0) # define xxxxyyyy XXXXYYYY() VectorType<T, 8> XXXXYYYY() const { return swizzle(0, 0, 0, 0).combine(swizzle(1, 1, 1, 1)); @@ -358,10 +331,6 @@ struct VectorType { VectorType<T, 8> XXYYZZWW() const { return swizzle(0, 0, 1, 1).combine(swizzle(2, 2, 3, 3)); } -# define xxxxyyyyzzzzwwww XXXXYYYYZZZZWWWW() - VectorType<T, 16> XXXXYYYYZZZZWWWW() { - return XXXXYYYY().combine(ZZZZWWWW()); - } }; template <typename T> @@ -374,17 +343,6 @@ struct VectorType<T, 2> { }; T elements[2]; }; - - SI VectorType wrap(const data_type& data) { - VectorType v; - v.data = data; - return v; - } - - VectorType operator&(VectorType x) const { return wrap(data & x.data); } - VectorType operator&(T x) const { return wrap(data & x); } - VectorType operator|(VectorType x) const { return wrap(data | x.data); } - VectorType operator|(T x) const { return wrap(data | x); } }; # define CONVERT(vector, type) ((type)(vector)) @@ -411,32 +369,6 @@ SI VectorType<T, N * 2> expand(VectorType<T, N> a) { } #endif -template <typename T, int N> -SI VectorType<T, N * 4> combine(VectorType<T, N> a, VectorType<T, N> b, - VectorType<T, N> c, VectorType<T, N> d) { - return combine(combine(a, b), combine(c, d)); -} - -template <typename T, int N> -SI VectorType<T, N> combineLow(VectorType<T, N> a, VectorType<T, N> b) { - return combine(lowHalf(a), lowHalf(b)); -} - -template <typename T, int N> -SI VectorType<T, N> combineHigh(VectorType<T, N> a, VectorType<T, N> b) { - return combine(highHalf(a), highHalf(b)); -} - -template <typename T, int N> -SI VectorType<T, N * 2> repeat2(VectorType<T, N> a) { - return combine(a, a); -} - -template <typename T, int N> -SI VectorType<T, N * 4> repeat4(VectorType<T, N> a) { - return combine(a, a, a, a); -} - 
template <typename T>
 SI VectorType<T, 4> zipLow(VectorType<T, 4> a, VectorType<T, 4> b) {
   return SHUFFLE(a, b, 0, 4, 1, 5);
 @@ -478,23 +410,6 @@ SI VectorType<T, 8> zip2High(VectorType<T, 8> a, VectorType<T, 8> b) {
   return SHUFFLE(a, b, 4, 5, 12, 13, 6, 7, 14, 15);
 }
 
-#ifdef __clang__
-template <typename T>
-SI VectorType<T, 8> zip(VectorType<T, 4> a, VectorType<T, 4> b) {
-  return SHUFFLE(a, b, 0, 4, 1, 5, 2, 6, 3, 7);
-}
-
-template <typename T>
-SI VectorType<T, 16> zip(VectorType<T, 8> a, VectorType<T, 8> b) {
-  return SHUFFLE(a, b, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
-}
-#else
-template <typename T, int N>
-SI VectorType<T, N * 2> zip(VectorType<T, N> a, VectorType<T, N> b) {
-  return combine(zipLow(a, b), zipHigh(a, b));
-}
-#endif
-
 template <typename T>
 struct Unaligned {
   template <typename P>
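The zip helpers kept above interleave lanes from two vectors via SHUFFLE (which, judging by the surrounding code, wraps __builtin_shufflevector), while the removed zip() overloads produced the full interleave in a single shuffle on clang, with a zipLow/zipHigh composition as the fallback. A small sketch of those interleave patterns; the type names and lane width are illustrative, and zipHigh's index set is the natural counterpart since its body is elided in the hunk above.

// Illustrative sketch only; not part of the SWGL sources.
#include <cstdint>
#include <cstdio>

typedef int16_t i16x4 __attribute__((vector_size(8)));
typedef int16_t i16x8 __attribute__((vector_size(16)));

// zipLow/zipHigh interleave single lanes: {a0,b0,a1,b1} and {a2,b2,a3,b3}.
static inline i16x4 zipLow(i16x4 a, i16x4 b) {
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}
static inline i16x4 zipHigh(i16x4 a, i16x4 b) {
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}
// A full 8-lane interleave, as the removed zip() overloads produced.
static inline i16x8 zip(i16x4 a, i16x4 b) {
  return __builtin_shufflevector(a, b, 0, 4, 1, 5, 2, 6, 3, 7);
}

int main() {
  i16x4 a = {1, 2, 3, 4}, b = {5, 6, 7, 8};
  i16x4 lo = zipLow(a, b);   // {1, 5, 2, 6}
  i16x4 hi = zipHigh(a, b);  // {3, 7, 4, 8}
  i16x8 z = zip(a, b);       // {1, 5, 2, 6, 3, 7, 4, 8}
  std::printf("lo={%d,%d,%d,%d} hi={%d,%d,%d,%d} z[4]=%d\n",
              lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3], z[4]);
  return 0;
}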