author     Mukilan Thiyagarajan <mukilan@igalia.com>  2023-09-14 15:00:42 +0530
committer  Mukilan Thiyagarajan <mukilan@igalia.com>  2023-09-14 15:00:42 +0530
commit     c385b3c9737c17d59cb02e520c3b68b232cb6497
tree       ad598ffbbdfbcecd6a4cf458abe2afc702d92c27 /third_party/webrender/swgl/src/gl.cc
parent     988e05a68b48c9e744bf49459faf41a1bd9b81d7

Revert "Upgrade WebRender to e491e1ae637b2eed1e7195855d88357e5eb3ddf9 (#30323)"revert-webrender
This reverts commit a9d37cb85ac2c55fc630fccffe1ba60ff00f555b.
Diffstat (limited to 'third_party/webrender/swgl/src/gl.cc')
-rw-r--r--  third_party/webrender/swgl/src/gl.cc  |  3164
1 file changed, 2186 insertions(+), 978 deletions(-)
diff --git a/third_party/webrender/swgl/src/gl.cc b/third_party/webrender/swgl/src/gl.cc index 6e214547421..f4a69752dde 100644 --- a/third_party/webrender/swgl/src/gl.cc +++ b/third_party/webrender/swgl/src/gl.cc @@ -22,65 +22,15 @@ # define debugf(...) printf(__VA_ARGS__) #endif -// #define PRINT_TIMINGS - #ifdef _WIN32 # define ALWAYS_INLINE __forceinline -# define NO_INLINE __declspec(noinline) - -// Including Windows.h brings a huge amount of namespace polution so just -// define a couple of things manually -typedef int BOOL; -# define WINAPI __stdcall -# define DECLSPEC_IMPORT __declspec(dllimport) -# define WINBASEAPI DECLSPEC_IMPORT -typedef unsigned long DWORD; -typedef long LONG; -typedef __int64 LONGLONG; -# define DUMMYSTRUCTNAME - -typedef union _LARGE_INTEGER { - struct { - DWORD LowPart; - LONG HighPart; - } DUMMYSTRUCTNAME; - struct { - DWORD LowPart; - LONG HighPart; - } u; - LONGLONG QuadPart; -} LARGE_INTEGER; -extern "C" { -WINBASEAPI BOOL WINAPI -QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount); - -WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); -} - #else -// GCC is slower when dealing with always_inline, especially in debug builds. -// When using Clang, use always_inline more aggressively. -# if defined(__clang__) || defined(NDEBUG) -# define ALWAYS_INLINE __attribute__((always_inline)) inline -# else -# define ALWAYS_INLINE inline -# endif -# define NO_INLINE __attribute__((noinline)) -#endif - -// Some functions may cause excessive binary bloat if inlined in debug or with -// GCC builds, so use PREFER_INLINE on these instead of ALWAYS_INLINE. -#if defined(__clang__) && defined(NDEBUG) -# define PREFER_INLINE ALWAYS_INLINE -#else -# define PREFER_INLINE inline +# define ALWAYS_INLINE __attribute__((always_inline)) inline #endif #define UNREACHABLE __builtin_unreachable() -#define UNUSED [[maybe_unused]] - -#define FALLTHROUGH [[fallthrough]] +#define UNUSED __attribute__((unused)) #ifdef MOZILLA_CLIENT # define IMPLICIT __attribute__((annotate("moz_implicit"))) @@ -91,32 +41,19 @@ WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); #include "gl_defs.h" #include "glsl.h" #include "program.h" -#include "texture.h" using namespace glsl; -typedef ivec2_scalar IntPoint; - struct IntRect { int x0; int y0; int x1; int y1; - IntRect() : x0(0), y0(0), x1(0), y1(0) {} - IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {} - IntRect(IntPoint origin, IntPoint size) - : x0(origin.x), - y0(origin.y), - x1(origin.x + size.x), - y1(origin.y + size.y) {} - int width() const { return x1 - x0; } int height() const { return y1 - y0; } bool is_empty() const { return width() <= 0 || height() <= 0; } - IntPoint origin() const { return IntPoint(x0, y0); } - bool same_size(const IntRect& o) const { return width() == o.width() && height() == o.height(); } @@ -133,12 +70,6 @@ struct IntRect { return *this; } - IntRect intersection(const IntRect& o) { - IntRect result = *this; - result.intersect(o); - return result; - } - // Scale from source-space to dest-space, optionally rounding inward IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight, bool roundIn = false) { @@ -156,60 +87,15 @@ struct IntRect { swap(y0, y1); } - IntRect& offset(const IntPoint& o) { - x0 += o.x; - y0 += o.y; - x1 += o.x; - y1 += o.y; + IntRect& offset(int dx, int dy) { + x0 += dx; + y0 += dy; + x1 += dx; + y1 += dy; return *this; } - - IntRect operator+(const IntPoint& o) const { - return IntRect(*this).offset(o); 
- } - IntRect operator-(const IntPoint& o) const { - return IntRect(*this).offset(-o); - } }; -typedef vec2_scalar Point2D; -typedef vec4_scalar Point3D; - -struct IntRange { - int start; - int end; - - int len() const { return end - start; } - - IntRange intersect(IntRange r) const { - return {max(start, r.start), min(end, r.end)}; - } -}; - -struct FloatRange { - float start; - float end; - - float clip(float x) const { return clamp(x, start, end); } - - FloatRange clip(FloatRange r) const { return {clip(r.start), clip(r.end)}; } - - FloatRange merge(FloatRange r) const { - return {min(start, r.start), max(end, r.end)}; - } - - IntRange round() const { - return {int(floor(start + 0.5f)), int(floor(end + 0.5f))}; - } - - IntRange round_out() const { return {int(floor(start)), int(ceil(end))}; } -}; - -template <typename P> -static inline FloatRange x_range(P p0, P p1) { - return {min(p0.x, p1.x), max(p0.x, p1.x)}; -} - struct VertexAttrib { size_t size = 0; // in bytes GLenum type = 0; @@ -237,18 +123,12 @@ static int bytes_for_internal_format(GLenum internal_format) { case GL_R8: case GL_RED: return 1; - case GL_RG8: - case GL_RG: - return 2; case GL_DEPTH_COMPONENT: case GL_DEPTH_COMPONENT16: + return 2; case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: return 4; - case GL_RGB_RAW_422_APPLE: - return 2; - case GL_R16: - return 2; default: debugf("internal format: %x\n", internal_format); assert(0); @@ -268,12 +148,6 @@ static TextureFormat gl_format_to_texture_format(int type) { return TextureFormat::RGBA8; case GL_R8: return TextureFormat::R8; - case GL_RG8: - return TextureFormat::RG8; - case GL_R16: - return TextureFormat::R16; - case GL_RGB_RAW_422_APPLE: - return TextureFormat::YUV422; default: assert(0); return TextureFormat::RGBA8; @@ -287,34 +161,19 @@ struct Query { struct Buffer { char* buf = nullptr; size_t size = 0; - size_t capacity = 0; bool allocate(size_t new_size) { - // If the size remains unchanged, don't allocate anything. - if (new_size == size) { - return false; - } - // If the new size is within the existing capacity of the buffer, just - // reuse the existing buffer. - if (new_size <= capacity) { - size = new_size; - return true; - } - // Otherwise we need to reallocate the buffer to hold up to the requested - // larger size. - char* new_buf = (char*)realloc(buf, new_size); - assert(new_buf); - if (!new_buf) { - // If we fail, null out the buffer rather than leave around the old - // allocation state. + if (new_size != size) { + char* new_buf = (char*)realloc(buf, new_size); + assert(new_buf); + if (new_buf) { + buf = new_buf; + size = new_size; + return true; + } cleanup(); - return false; } - // The reallocation succeeded, so install the buffer. - buf = new_buf; - size = new_size; - capacity = new_size; - return true; + return false; } void cleanup() { @@ -322,7 +181,6 @@ struct Buffer { free(buf); buf = nullptr; size = 0; - capacity = 0; } } @@ -331,6 +189,7 @@ struct Buffer { struct Framebuffer { GLuint color_attachment = 0; + GLint layer = 0; GLuint depth_attachment = 0; }; @@ -364,32 +223,17 @@ struct Texture { GLenum internal_format = 0; int width = 0; int height = 0; + int depth = 0; char* buf = nullptr; size_t buf_size = 0; - uint32_t buf_stride = 0; - uint8_t buf_bpp = 0; GLenum min_filter = GL_NEAREST; GLenum mag_filter = GL_LINEAR; - // The number of active locks on this texture. 
If this texture has any active - // locks, we need to disallow modifying or destroying the texture as it may - // be accessed by other threads where modifications could lead to races. - int32_t locked = 0; - // When used as an attachment of a framebuffer, rendering to the texture - // behaves as if it is located at the given offset such that the offset is - // subtracted from all transformed vertexes after the viewport is applied. - IntPoint offset; enum FLAGS { - // If the buffer is internally-allocated by SWGL SHOULD_FREE = 1 << 1, - // If the buffer has been cleared to initialize it. Currently this is only - // utilized by depth buffers which need to know when depth runs have reset - // to a valid row state. When unset, the depth runs may contain garbage. - CLEARED = 1 << 2, }; int flags = SHOULD_FREE; bool should_free() const { return bool(flags & SHOULD_FREE); } - bool cleared() const { return bool(flags & CLEARED); } void set_flag(int flag, bool val) { if (val) { @@ -398,14 +242,7 @@ struct Texture { flags &= ~flag; } } - void set_should_free(bool val) { - // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we - // might accidentally mistakenly realloc an externally allocated buffer as - // if it were an internally allocated one. - assert(!buf); - set_flag(SHOULD_FREE, val); - } - void set_cleared(bool val) { set_flag(CLEARED, val); } + void set_should_free(bool val) { set_flag(SHOULD_FREE, val); } // Delayed-clearing state. When a clear of an FB is requested, we don't // immediately clear each row, as the rows may be subsequently overwritten @@ -418,9 +255,6 @@ struct Texture { uint32_t clear_val = 0; uint32_t* cleared_rows = nullptr; - void init_depth_runs(uint32_t z); - void fill_depth_runs(uint32_t z, const IntRect& scissor); - void enable_delayed_clear(uint32_t val) { delay_clear = height; clear_val = val; @@ -441,88 +275,40 @@ struct Texture { } } - int bpp() const { return buf_bpp; } - void set_bpp() { buf_bpp = bytes_for_internal_format(internal_format); } + int bpp() const { return bytes_for_internal_format(internal_format); } - size_t stride() const { return buf_stride; } - void set_stride() { buf_stride = aligned_stride(buf_bpp * width); } - - // Set an external backing buffer of this texture. - void set_buffer(void* new_buf, size_t new_stride) { - assert(!should_free()); - // Ensure that the supplied stride is at least as big as the row data and - // is aligned to the smaller of either the BPP or word-size. We need to at - // least be able to sample data from within a row and sample whole pixels - // of smaller formats without risking unaligned access. - set_bpp(); - set_stride(); - assert(new_stride >= size_t(bpp() * width) && - new_stride % min(bpp(), sizeof(uint32_t)) == 0); + size_t stride(int b = 0, int min_width = 0) const { + return aligned_stride((b ? b : bpp()) * max(width, min_width)); + } - buf = (char*)new_buf; - buf_size = 0; - buf_stride = new_stride; + size_t layer_stride(int b = 0, int min_width = 0, int min_height = 0) const { + return stride(b ? b : bpp(), min_width) * max(height, min_height); } bool allocate(bool force = false, int min_width = 0, int min_height = 0) { - assert(!locked); // Locked textures shouldn't be reallocated - // If we get here, some GL API call that invalidates the texture was used. - // Mark the buffer as not-cleared to signal this. - set_cleared(false); - // Check if there is either no buffer currently or if we forced validation - // of the buffer size because some dimension might have changed. 
if ((!buf || force) && should_free()) { - // Initialize the buffer's BPP and stride, since they may have changed. - set_bpp(); - set_stride(); - // Compute new size based on the maximum potential stride, rather than - // the current stride, to hopefully avoid reallocations when size would - // otherwise change too much... - size_t max_stride = max(buf_stride, aligned_stride(buf_bpp * min_width)); - size_t size = max_stride * max(height, min_height); - if ((!buf && size > 0) || size > buf_size) { + size_t size = layer_stride(bpp(), min_width, min_height) * max(depth, 1); + if (!buf || size > buf_size) { // Allocate with a SIMD register-sized tail of padding at the end so we // can safely read or write past the end of the texture with SIMD ops. - // Currently only the flat Z-buffer texture needs this padding due to - // full-register loads and stores in check_depth and discard_depth. In - // case some code in the future accidentally uses a linear filter on a - // texture with less than 2 pixels per row, we also add this padding - // just to be safe. All other texture types and use-cases should be - // safe to omit padding. - size_t padding = - internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2 - ? sizeof(Float) - : 0; - char* new_buf = (char*)realloc(buf, size + padding); + char* new_buf = (char*)realloc(buf, size + sizeof(Float)); assert(new_buf); if (new_buf) { - // Successfully reallocated the buffer, so go ahead and set it. buf = new_buf; buf_size = size; return true; } - // Allocation failed, so ensure we don't leave stale buffer state. cleanup(); } } - // Nothing changed... return false; } void cleanup() { - assert(!locked); // Locked textures shouldn't be destroyed - if (buf) { - // If we need to toggle SHOULD_FREE state, ensure that buf is nulled out, - // regardless of whether we internally allocated it. This will prevent us - // from wrongly treating buf as having been internally allocated for when - // we go to realloc if it actually was externally allocted. - if (should_free()) { - free(buf); - } + if (buf && should_free()) { + free(buf); buf = nullptr; buf_size = 0; - buf_bpp = 0; - buf_stride = 0; } disable_delayed_clear(); } @@ -530,41 +316,44 @@ struct Texture { ~Texture() { cleanup(); } IntRect bounds() const { return IntRect{0, 0, width, height}; } - IntRect offset_bounds() const { return bounds() + offset; } // Find the valid sampling bounds relative to the requested region IntRect sample_bounds(const IntRect& req, bool invertY = false) const { - IntRect bb = bounds().intersect(req) - req.origin(); + IntRect bb = bounds().intersect(req).offset(-req.x0, -req.y0); if (invertY) bb.invert_y(req.height()); return bb; } // Get a pointer for sampling at the given offset - char* sample_ptr(int x, int y) const { - return buf + y * stride() + x * bpp(); + char* sample_ptr(int x, int y, int z, int bpp, size_t stride) const { + return buf + (height * z + y) * stride + x * bpp; + } + + char* sample_ptr(int x, int y, int z, int bpp) const { + return sample_ptr(x, y, z, bpp, stride(bpp)); + } + + char* sample_ptr(int x, int y, int z) const { + return sample_ptr(x, y, z, bpp()); } // Get a pointer for sampling the requested region and limit to the provided // sampling bounds - char* sample_ptr(const IntRect& req, const IntRect& bounds, + char* sample_ptr(const IntRect& req, const IntRect& bounds, int z, bool invertY = false) const { // Offset the sample pointer by the clamped bounds int x = req.x0 + bounds.x0; // Invert the Y offset if necessary int y = invertY ? 
req.y1 - 1 - bounds.y0 : req.y0 + bounds.y0; - return sample_ptr(x, y); + return sample_ptr(x, y, z); } }; -// The last vertex attribute is reserved as a null attribute in case a vertex -// attribute is used without being set. -#define MAX_ATTRIBS 17 -#define NULL_ATTRIB 16 +#define MAX_ATTRIBS 16 +#define NULL_ATTRIB 15 struct VertexArray { VertexAttrib attribs[MAX_ATTRIBS]; int max_attrib = -1; - // The GL spec defines element array buffer binding to be part of VAO state. - GLuint element_array_buffer_binding = 0; void validate(); }; @@ -580,67 +369,33 @@ struct Program { FragmentShaderImpl* frag_impl = nullptr; bool deleted = false; - ~Program() { delete impl; } + ~Program() { + delete impl; + } }; -// clang-format off -// Fully-expand GL defines while ignoring more than 4 suffixes +// for GL defines to fully expand #define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w -// Generate a blend key enum symbol -#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0) -#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_BLEND_KEY(...) CONCAT_KEY(AA_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_MASK_BLEND_KEY(...) CONCAT_KEY(AA_MASK_BLEND_, __VA_ARGS__, 0, 0, 0) - -// Utility macro to easily generate similar code for all implemented blend modes +#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0) #define FOR_EACH_BLEND_KEY(macro) \ - macro(GL_ONE, GL_ZERO, 0, 0) \ - macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE, 0, 0) \ - macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ - macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \ - macro(GL_MIN, 0, 0, 0) \ - macro(GL_MAX, 0, 0, 0) \ - macro(GL_MULTIPLY_KHR, 0, 0, 0) \ - macro(GL_SCREEN_KHR, 0, 0, 0) \ - macro(GL_OVERLAY_KHR, 0, 0, 0) \ - macro(GL_DARKEN_KHR, 0, 0, 0) \ - macro(GL_LIGHTEN_KHR, 0, 0, 0) \ - macro(GL_COLORDODGE_KHR, 0, 0, 0) \ - macro(GL_COLORBURN_KHR, 0, 0, 0) \ - macro(GL_HARDLIGHT_KHR, 0, 0, 0) \ - macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \ - macro(GL_DIFFERENCE_KHR, 0, 0, 0) \ - macro(GL_EXCLUSION_KHR, 0, 0, 0) \ - macro(GL_HSL_HUE_KHR, 0, 0, 0) \ - macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \ - macro(GL_HSL_COLOR_KHR, 0, 0, 0) \ - macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) \ - macro(SWGL_BLEND_DROP_SHADOW, 0, 0, 0) \ - macro(SWGL_BLEND_SUBPIXEL_TEXT, 0, 0, 0) + macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE) \ + macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) macro( \ + GL_ZERO, GL_SRC_COLOR, 0, 0) macro(GL_ONE, GL_ONE, 0, 0) \ + macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ + macro(GL_ONE, GL_ZERO, 0, 0) macro( \ + GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ + macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, \ + 0, 0) \ + macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) #define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__), -#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_BLEND_KEY(...) AA_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_MASK_BLEND_KEY(...) 
AA_MASK_BLEND_KEY(__VA_ARGS__), enum BlendKey : uint8_t { + BLEND_KEY_NONE = 0, FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_MASK_BLEND_KEY) - BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO), - MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO), - AA_BLEND_KEY_NONE = AA_BLEND_KEY(GL_ONE, GL_ZERO), - AA_MASK_BLEND_KEY_NONE = AA_MASK_BLEND_KEY(GL_ONE, GL_ZERO), }; -// clang-format on const size_t MAX_TEXTURE_UNITS = 16; @@ -704,10 +459,8 @@ struct ObjectStore { O* find(size_t i) const { return i < size ? objects[i] : nullptr; } - template <typename T> - void on_erase(T*, ...) {} - template <typename T> - void on_erase(T* o, decltype(&T::on_erase)) { + template <typename T> void on_erase(T*, ...) {} + template <typename T> void on_erase(T* o, decltype(&T::on_erase)) { o->on_erase(); } @@ -727,8 +480,6 @@ struct ObjectStore { }; struct Context { - int32_t references = 1; - ObjectStore<Query> queries; ObjectStore<Buffer> buffers; ObjectStore<Texture> textures; @@ -756,7 +507,7 @@ struct Context { bool scissortest = false; IntRect scissor = {0, 0, 0, 0}; - GLfloat clearcolor[4] = {0, 0, 0, 0}; + uint32_t clearcolor = 0; GLdouble cleardepth = 1; int unpack_row_length = 0; @@ -766,10 +517,14 @@ struct Context { struct TextureUnit { GLuint texture_2d_binding = 0; + GLuint texture_3d_binding = 0; + GLuint texture_2d_array_binding = 0; GLuint texture_rectangle_binding = 0; void unlink(GLuint n) { ::unlink(texture_2d_binding, n); + ::unlink(texture_3d_binding, n); + ::unlink(texture_2d_array_binding, n); ::unlink(texture_rectangle_binding, n); } }; @@ -784,6 +539,7 @@ struct Context { GLuint pixel_pack_buffer_binding = 0; GLuint pixel_unpack_buffer_binding = 0; GLuint array_buffer_binding = 0; + GLuint element_array_buffer_binding = 0; GLuint time_elapsed_query = 0; GLuint samples_passed_query = 0; GLuint renderbuffer_binding = 0; @@ -800,9 +556,13 @@ struct Context { case GL_ARRAY_BUFFER: return array_buffer_binding; case GL_ELEMENT_ARRAY_BUFFER: - return vertex_arrays[current_vertex_array].element_array_buffer_binding; + return element_array_buffer_binding; case GL_TEXTURE_2D: return texture_units[active_texture_unit].texture_2d_binding; + case GL_TEXTURE_2D_ARRAY: + return texture_units[active_texture_unit].texture_2d_array_binding; + case GL_TEXTURE_3D: + return texture_units[active_texture_unit].texture_3d_binding; case GL_TEXTURE_RECTANGLE: return texture_units[active_texture_unit].texture_rectangle_binding; case GL_TIME_ELAPSED: @@ -830,17 +590,16 @@ struct Context { return textures[texture_units[unit].texture_2d_binding]; } - Texture& get_texture(sampler2DRect, int unit) { - return textures[texture_units[unit].texture_rectangle_binding]; + Texture& get_texture(sampler2DArray, int unit) { + return textures[texture_units[unit].texture_2d_array_binding]; } - IntRect apply_scissor(IntRect bb, - const IntPoint& origin = IntPoint(0, 0)) const { - return scissortest ? bb.intersect(scissor - origin) : bb; + Texture& get_texture(sampler2DRect, int unit) { + return textures[texture_units[unit].texture_rectangle_binding]; } - IntRect apply_scissor(const Texture& t) const { - return apply_scissor(t.bounds(), t.offset); + IntRect apply_scissor(IntRect bb) const { + return scissortest ? 
bb.intersect(scissor) : bb; } }; static Context* ctx = nullptr; @@ -851,12 +610,14 @@ static BlendKey blend_key = BLEND_KEY_NONE; static void prepare_texture(Texture& t, const IntRect* skip = nullptr); template <typename S> +static inline void init_depth(S* s, Texture& t) { + s->depth = max(t.depth, 1); + s->height_stride = s->stride * t.height; +} + +template <typename S> static inline void init_filter(S* s, Texture& t) { - // If the width is not at least 2 pixels, then we can't safely sample the end - // of the row with a linear filter. In that case, just punt to using nearest - // filtering instead. - s->filter = t.width >= 2 ? gl_filter_to_texture_filter(t.mag_filter) - : TextureFilter::NEAREST; + s->filter = gl_filter_to_texture_filter(t.mag_filter); } template <typename S> @@ -864,44 +625,20 @@ static inline void init_sampler(S* s, Texture& t) { prepare_texture(t); s->width = t.width; s->height = t.height; - s->stride = t.stride(); int bpp = t.bpp(); - if (bpp >= 4) - s->stride /= 4; - else if (bpp == 2) - s->stride /= 2; - else - assert(bpp == 1); - // Use uint32_t* for easier sampling, but need to cast to uint8_t* or - // uint16_t* for formats with bpp < 4. + s->stride = t.stride(bpp); + if (bpp >= 4) s->stride /= 4; + // Use uint32_t* for easier sampling, but need to cast to uint8_t* for formats + // with bpp < 4. s->buf = (uint32_t*)t.buf; s->format = gl_format_to_texture_format(t.internal_format); } template <typename S> -static inline void null_sampler(S* s) { - // For null texture data, just make the sampler provide a 1x1 buffer that is - // transparent black. Ensure buffer holds at least a SIMD vector of zero data - // for SIMD padding of unaligned loads. - static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0}; - s->width = 1; - s->height = 1; - s->stride = s->width; - s->buf = (uint32_t*)zeroBuf; - s->format = TextureFormat::RGBA8; -} - -template <typename S> -static inline void null_filter(S* s) { - s->filter = TextureFilter::NEAREST; -} - -template <typename S> S* lookup_sampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); - null_filter(s); + *s = S(); } else { init_sampler(s, t); init_filter(s, t); @@ -913,13 +650,26 @@ template <typename S> S* lookup_isampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); + *s = S(); } else { init_sampler(s, t); } return s; } +template <typename S> +S* lookup_sampler_array(S* s, int texture) { + Texture& t = ctx->get_texture(s, texture); + if (!t.buf) { + *s = S(); + } else { + init_sampler(s, t); + init_depth(s, t); + init_filter(s, t); + } + return s; +} + int bytes_per_type(GLenum type) { switch (type) { case GL_INT: @@ -983,40 +733,21 @@ void load_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance, attrib = T(load_attrib_scalar<scalar_type>(va, src)); } else { // Specialized for WR's primitive vertex order/winding. + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. + // Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so that the + // points form a convex path that can be traversed by the rasterizer. if (!count) return; - assert(count >= 2 && count <= 4); + assert(count == 3 || count == 4); char* src = (char*)va.buf + va.stride * start + va.offset; - switch (count) { - case 2: { - // Lines must be indexed at offsets 0, 1. 
- // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0. - scalar_type lanes[2] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride)}; - attrib = (T){lanes[0], lanes[1], lanes[1], lanes[0]}; - break; - } - case 3: { - // Triangles must be indexed at offsets 0, 1, 2. - // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. - scalar_type lanes[3] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - attrib = (T){lanes[0], lanes[1], lanes[2], lanes[2]}; - break; - } - default: - // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, - // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so - // that the points form a convex path that can be traversed by the - // rasterizer. - attrib = (T){load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 3), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - break; - } + attrib = (T){ + load_attrib_scalar<scalar_type>(va, src), + load_attrib_scalar<scalar_type>(va, src + va.stride), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2 + + (count > 3 ? va.stride : 0)), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2) + }; } } @@ -1076,6 +807,7 @@ void Enable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = true; + blend_key = ctx->blend_key; break; case GL_DEPTH_TEST: ctx->depthtest = true; @@ -1090,6 +822,7 @@ void Disable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = false; + blend_key = BLEND_KEY_NONE; break; case GL_DEPTH_TEST: ctx->depthtest = false; @@ -1103,18 +836,10 @@ void Disable(GLenum cap) { GLenum GetError() { return GL_NO_ERROR; } static const char* const extensions[] = { - "GL_ARB_blend_func_extended", - "GL_ARB_clear_texture", - "GL_ARB_copy_image", - "GL_ARB_draw_instanced", - "GL_ARB_explicit_attrib_location", - "GL_ARB_instanced_arrays", - "GL_ARB_invalidate_subdata", - "GL_ARB_texture_storage", - "GL_EXT_timer_query", - "GL_KHR_blend_equation_advanced", - "GL_KHR_blend_equation_advanced_coherent", - "GL_APPLE_rgb_422", + "GL_ARB_blend_func_extended", "GL_ARB_copy_image", + "GL_ARB_draw_instanced", "GL_ARB_explicit_attrib_location", + "GL_ARB_instanced_arrays", "GL_ARB_invalidate_subdata", + "GL_ARB_texture_storage", "GL_EXT_timer_query", }; void GetIntegerv(GLenum pname, GLint* params) { @@ -1128,7 +853,7 @@ void GetIntegerv(GLenum pname, GLint* params) { params[0] = 1 << 15; break; case GL_MAX_ARRAY_TEXTURE_LAYERS: - params[0] = 0; + params[0] = 1 << 15; break; case GL_READ_FRAMEBUFFER_BINDING: params[0] = ctx->read_framebuffer_binding; @@ -1145,12 +870,6 @@ void GetIntegerv(GLenum pname, GLint* params) { case GL_NUM_EXTENSIONS: params[0] = sizeof(extensions) / sizeof(extensions[0]); break; - case GL_MAJOR_VERSION: - params[0] = 3; - break; - case GL_MINOR_VERSION: - params[0] = 2; - break; default: debugf("unhandled glGetIntegerv parameter %x\n", pname); assert(false); @@ -1177,8 +896,6 @@ const char* GetString(GLenum name) { return "Software WebRender"; case GL_VERSION: return "3.2"; - case GL_SHADING_LANGUAGE_VERSION: - return "1.50"; default: debugf("unhandled glGetString parameter %x\n", name); assert(false); @@ -1254,23 +971,17 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) { return a; } -// Generate a hashed blend key based on blend func and equation state. 
This -// allows all the blend state to be processed down to a blend key that can be -// dealt with inside a single switch statement. -static void hash_blend_key() { - GLenum srgb = ctx->blendfunc_srgb; - GLenum drgb = ctx->blendfunc_drgb; - GLenum sa = ctx->blendfunc_sa; - GLenum da = ctx->blendfunc_da; - GLenum equation = ctx->blend_equation; +void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { + ctx->blendfunc_srgb = srgb; + ctx->blendfunc_drgb = drgb; + sa = remap_blendfunc(srgb, sa); + da = remap_blendfunc(drgb, da); + ctx->blendfunc_sa = sa; + ctx->blendfunc_da = da; + #define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) - // Basic non-separate blend funcs used the two argument form int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); - // Separate alpha blend funcs use the 4 argument hash if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); - // Any other blend equation than the default func_add ignores the func and - // instead generates a one-argument hash based on the equation - if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0); switch (hash) { #define MAP_BLEND_KEY(...) \ case HASH_BLEND_KEY(__VA_ARGS__): \ @@ -1278,22 +989,14 @@ static void hash_blend_key() { break; FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) default: - debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb, - sa, da, equation); + debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da); assert(false); break; } -} -void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { - ctx->blendfunc_srgb = srgb; - ctx->blendfunc_drgb = drgb; - sa = remap_blendfunc(srgb, sa); - da = remap_blendfunc(drgb, da); - ctx->blendfunc_sa = sa; - ctx->blendfunc_da = da; - - hash_blend_key(); + if (ctx->blend) { + blend_key = ctx->blend_key; + } } void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { @@ -1302,12 +1005,8 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { } void BlendEquation(GLenum mode) { - assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX || - (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR)); - if (mode != ctx->blend_equation) { - ctx->blend_equation = mode; - hash_blend_key(); - } + assert(mode == GL_FUNC_ADD); + ctx->blend_equation = mode; } void DepthMask(GLboolean flag) { ctx->depthmask = flag; } @@ -1328,10 +1027,8 @@ void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) { } void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - ctx->clearcolor[0] = r; - ctx->clearcolor[1] = g; - ctx->clearcolor[2] = b; - ctx->clearcolor[3] = a; + I32 c = round_pixel((Float){b, g, r, a}); + ctx->clearcolor = bit_cast<uint32_t>(CONVERT(c, U8)); } void ClearDepth(GLdouble depth) { ctx->cleardepth = depth; } @@ -1369,6 +1066,7 @@ void DeleteBuffer(GLuint n) { unlink(ctx->pixel_pack_buffer_binding, n); unlink(ctx->pixel_unpack_buffer_binding, n); unlink(ctx->array_buffer_binding, n); + unlink(ctx->element_array_buffer_binding, n); } } @@ -1434,45 +1132,26 @@ void DeleteProgram(GLuint n) { void LinkProgram(GLuint program) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } assert(p.impl->interpolants_size() <= sizeof(Interpolants)); if (!p.vert_impl) p.vert_impl = p.impl->get_vertex_shader(); if (!p.frag_impl) p.frag_impl = p.impl->get_fragment_shader(); } -GLint GetLinkStatus(GLuint program) { - if (auto* p = ctx->programs.find(program)) { - return p->impl ? 
1 : 0; - } - return 0; -} - void BindAttribLocation(GLuint program, GLuint index, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } p.impl->bind_attrib(name, index); } GLint GetAttribLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } return p.impl->get_attrib(name); } GLint GetUniformLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } GLint loc = p.impl->get_uniform(name); // debugf("location: %d\n", loc); return loc; @@ -1482,15 +1161,7 @@ static uint64_t get_time_value() { #ifdef __MACH__ return mach_absolute_time(); #elif defined(_WIN32) - LARGE_INTEGER time; - static bool have_frequency = false; - static LARGE_INTEGER frequency; - if (!have_frequency) { - QueryPerformanceFrequency(&frequency); - have_frequency = true; - } - QueryPerformanceCounter(&time); - return time.QuadPart * 1000000000ULL / frequency.QuadPart; + return uint64_t(clock()) * (1000000000ULL / CLOCKS_PER_SEC); #else return ({ struct timespec tp; @@ -1583,113 +1254,60 @@ void PixelStorei(GLenum name, GLint param) { static GLenum remap_internal_format(GLenum format) { switch (format) { case GL_DEPTH_COMPONENT: - return GL_DEPTH_COMPONENT24; + return GL_DEPTH_COMPONENT16; case GL_RGBA: return GL_RGBA8; case GL_RED: return GL_R8; - case GL_RG: - return GL_RG8; - case GL_RGB_422_APPLE: - return GL_RGB_RAW_422_APPLE; default: return format; } } -} // extern "C" - -static bool format_requires_conversion(GLenum external_format, - GLenum internal_format) { - switch (external_format) { - case GL_RGBA: - return internal_format == GL_RGBA8; - default: - return false; - } -} - -static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src, - int width) { - for (; width >= 4; width -= 4, dest += 4, src += 4) { - U32 p = unaligned_load<U32>(src); - U32 rb = p & 0x00FF00FF; - unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); - } - for (; width > 0; width--, dest++, src++) { - uint32_t p = *src; - uint32_t rb = p & 0x00FF00FF; - *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); - } -} - -static void convert_copy(GLenum external_format, GLenum internal_format, - uint8_t* dst_buf, size_t dst_stride, - const uint8_t* src_buf, size_t src_stride, - size_t width, size_t height) { - switch (external_format) { - case GL_RGBA: - if (internal_format == GL_RGBA8) { - for (; height; height--) { - copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf, - width); - dst_buf += dst_stride; - src_buf += src_stride; - } - return; - } - break; - default: - break; - } - size_t row_bytes = width * bytes_for_internal_format(internal_format); - for (; height; height--) { - memcpy(dst_buf, src_buf, row_bytes); - dst_buf += dst_stride; - src_buf += src_stride; +void TexStorage3D(GLenum target, GLint levels, GLenum internal_format, + GLsizei width, GLsizei height, GLsizei depth) { + assert(levels == 1); + Texture& t = ctx->textures[ctx->get_binding(target)]; + internal_format = remap_internal_format(internal_format); + bool changed = false; + if (t.width != width || t.height != height || t.depth != depth || + t.internal_format != internal_format) { + changed = true; + t.internal_format = internal_format; + t.width = width; + t.height = height; + t.depth = depth; } + t.disable_delayed_clear(); + t.allocate(changed); } -static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width, - GLsizei height, void* buf = 
nullptr, - GLsizei stride = 0, GLsizei min_width = 0, - GLsizei min_height = 0) { - GLenum internal_format = remap_internal_format(external_format); +static void set_tex_storage(Texture& t, GLenum internal_format, + GLsizei width, GLsizei height, + bool should_free = true, void* buf = nullptr, + GLsizei min_width = 0, GLsizei min_height = 0) { + internal_format = remap_internal_format(internal_format); bool changed = false; - if (t.width != width || t.height != height || + if (t.width != width || t.height != height || t.depth != 0 || t.internal_format != internal_format) { changed = true; t.internal_format = internal_format; t.width = width; t.height = height; + t.depth = 0; } - // If we are changed from an internally managed buffer to an externally - // supplied one or vice versa, ensure that we clean up old buffer state. - // However, if we have to convert the data from a non-native format, then - // always treat it as internally managed since we will need to copy to an - // internally managed native format buffer. - bool should_free = buf == nullptr || format_requires_conversion( - external_format, internal_format); - if (t.should_free() != should_free) { - changed = true; - t.cleanup(); + if (t.should_free() != should_free || buf != nullptr) { + if (t.should_free()) { + t.cleanup(); + } t.set_should_free(should_free); - } - // If now an external buffer, explicitly set it... - if (!should_free) { - t.set_buffer(buf, stride); + t.buf = (char*)buf; + t.buf_size = 0; } t.disable_delayed_clear(); t.allocate(changed, min_width, min_height); - // If we have a buffer that needs format conversion, then do that now. - if (buf && should_free) { - convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(), - (const uint8_t*)buf, stride, width, height); - } } -extern "C" { - void TexStorage2D(GLenum target, GLint levels, GLenum internal_format, GLsizei width, GLsizei height) { assert(levels == 1); @@ -1701,19 +1319,12 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { if (format == GL_RED && ty == GL_UNSIGNED_BYTE) { return GL_R8; } else if ((format == GL_RGBA || format == GL_BGRA) && - (ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV)) { + ty == GL_UNSIGNED_BYTE) { return GL_RGBA8; } else if (format == GL_RGBA && ty == GL_FLOAT) { return GL_RGBA32F; } else if (format == GL_RGBA_INTEGER && ty == GL_INT) { return GL_RGBA32I; - } else if (format == GL_RG && ty == GL_UNSIGNED_BYTE) { - return GL_RG8; - } else if (format == GL_RGB_422_APPLE && - ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) { - return GL_RGB_RAW_422_APPLE; - } else if (format == GL_RED && ty == GL_UNSIGNED_SHORT) { - return GL_R16; } else { debugf("unknown internal format for format %x, type %x\n", format, ty); assert(false); @@ -1721,6 +1332,20 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { } } +static inline void copy_bgra8_to_rgba8(uint32_t* dest, uint32_t* src, + int width) { + for (; width >= 4; width -= 4, dest += 4, src += 4) { + U32 p = unaligned_load<U32>(src); + U32 rb = p & 0x00FF00FF; + unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); + } + for (; width > 0; width--, dest++, src++) { + uint32_t p = *src; + uint32_t rb = p & 0x00FF00FF; + *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); + } +} + static Buffer* get_pixel_pack_buffer() { return ctx->pixel_pack_buffer_binding ? 
&ctx->buffers[ctx->pixel_pack_buffer_binding] @@ -1750,10 +1375,7 @@ static void* get_pixel_unpack_buffer_data(void* data) { void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } data = get_pixel_unpack_buffer_data(data); if (!data) return; Texture& t = ctx->textures[ctx->get_binding(target)]; @@ -1765,33 +1387,84 @@ void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei row_length = ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width; assert(t.internal_format == internal_format_for_data(format, ty)); - int src_bpp = format_requires_conversion(format, t.internal_format) - ? bytes_for_internal_format(format) - : t.bpp(); - if (!src_bpp || !t.buf) return; - convert_copy(format, t.internal_format, - (uint8_t*)t.sample_ptr(xoffset, yoffset), t.stride(), - (const uint8_t*)data, row_length * src_bpp, width, height); + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + size_t dest_stride = t.stride(bpp); + char* dest = t.sample_ptr(xoffset, yoffset, 0, bpp, dest_stride); + char* src = (char*)data; + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } } void TexImage2D(GLenum target, GLint level, GLint internal_format, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } assert(border == 0); TexStorage2D(target, 1, internal_format, width, height); TexSubImage2D(target, 0, 0, 0, width, height, format, ty, data); } +void TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + data = get_pixel_unpack_buffer_data(data); + if (!data) return; + Texture& t = ctx->textures[ctx->get_binding(target)]; + prepare_texture(t); + assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width); + GLsizei row_length = + ctx->unpack_row_length != 0 ? 
ctx->unpack_row_length : width; + if (format == GL_BGRA) { + assert(ty == GL_UNSIGNED_BYTE); + assert(t.internal_format == GL_RGBA8); + } else { + assert(t.internal_format == internal_format_for_data(format, ty)); + } + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + char* src = (char*)data; + assert(xoffset + width <= t.width); + assert(yoffset + height <= t.height); + assert(zoffset + depth <= t.depth); + size_t dest_stride = t.stride(bpp); + for (int z = 0; z < depth; z++) { + char* dest = t.sample_ptr(xoffset, yoffset, zoffset + z, bpp, dest_stride); + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } + } +} + +void TexImage3D(GLenum target, GLint level, GLint internal_format, + GLsizei width, GLsizei height, GLsizei depth, GLint border, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + assert(border == 0); + TexStorage3D(target, 1, internal_format, width, height, depth); + TexSubImage3D(target, 0, 0, 0, 0, width, height, depth, format, ty, data); +} + void GenerateMipmap(UNUSED GLenum target) { // TODO: support mipmaps } -void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { - Texture& t = ctx->textures[texid]; +void TexParameteri(GLenum target, GLenum pname, GLint param) { + Texture& t = ctx->textures[ctx->get_binding(target)]; switch (pname) { case GL_TEXTURE_WRAP_S: assert(param == GL_CLAMP_TO_EDGE); @@ -1810,10 +1483,6 @@ void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { } } -void TexParameteri(GLenum target, GLenum pname, GLint param) { - SetTextureParameter(ctx->get_binding(target), pname, param); -} - void GenTextures(int n, GLuint* result) { for (int i = 0; i < n; i++) { Texture t; @@ -1839,7 +1508,9 @@ void GenRenderbuffers(int n, GLuint* result) { void Renderbuffer::on_erase() { for (auto* fb : ctx->framebuffers) { if (fb) { - unlink(fb->color_attachment, texture); + if (unlink(fb->color_attachment, texture)) { + fb->layer = 0; + } unlink(fb->depth_attachment, texture); } } @@ -1875,11 +1546,10 @@ void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width, } switch (internal_format) { case GL_DEPTH_COMPONENT: - case GL_DEPTH_COMPONENT16: case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: - // Force depth format to 24 bits... - internal_format = GL_DEPTH_COMPONENT24; + // Force depth format to 16 bits... 
+ internal_format = GL_DEPTH_COMPONENT16; break; } set_tex_storage(ctx->textures[r.texture], internal_format, width, height); @@ -1963,8 +1633,7 @@ void VertexAttribDivisor(GLuint index, GLuint divisor) { va.divisor = divisor; } -void BufferData(GLenum target, GLsizeiptr size, void* data, - UNUSED GLenum usage) { +void BufferData(GLenum target, GLsizeiptr size, void* data, UNUSED GLenum usage) { Buffer& b = ctx->buffers[ctx->get_binding(target)]; if (b.allocate(size)) { ctx->validate_vertex_array = true; @@ -2004,23 +1673,17 @@ GLboolean UnmapBuffer(GLenum target) { void Uniform1i(GLint location, GLint V0) { // debugf("tex: %d\n", (int)ctx->textures.size); - if (vertex_shader) { - vertex_shader->set_uniform_1i(location, V0); - } + vertex_shader->set_uniform_1i(location, V0); } void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) { assert(count == 1); - if (vertex_shader) { - vertex_shader->set_uniform_4fv(location, v); - } + vertex_shader->set_uniform_4fv(location, v); } void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat* value) { assert(count == 1); assert(!transpose); - if (vertex_shader) { - vertex_shader->set_uniform_matrix4fv(location, value); - } + vertex_shader->set_uniform_matrix4fv(location, value); } void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, @@ -2031,7 +1694,24 @@ void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = texture; + fb.layer = 0; + } else if (attachment == GL_DEPTH_ATTACHMENT) { + fb.depth_attachment = texture; + } else { + assert(0); + } +} + +void FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture, + GLint level, GLint layer) { + assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + assert(level == 0); + Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; + if (attachment == GL_COLOR_ATTACHMENT0) { + fb.color_attachment = texture; + fb.layer = layer; } else if (attachment == GL_DEPTH_ATTACHMENT) { + assert(layer == 0); fb.depth_attachment = texture; } else { assert(0); @@ -2046,6 +1726,7 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, Renderbuffer& rb = ctx->renderbuffers[renderbuffer]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = rb.texture; + fb.layer = 0; } else if (attachment == GL_DEPTH_ATTACHMENT) { fb.depth_attachment = rb.texture; } else { @@ -2055,18 +1736,11 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, } // extern "C" -static inline Framebuffer* get_framebuffer(GLenum target, - bool fallback = false) { +static inline Framebuffer* get_framebuffer(GLenum target) { if (target == GL_FRAMEBUFFER) { target = GL_DRAW_FRAMEBUFFER; } - Framebuffer* fb = ctx->framebuffers.find(ctx->get_binding(target)); - if (fallback && !fb) { - // If the specified framebuffer isn't found and a fallback is requested, - // use the default framebuffer. 
- fb = &ctx->framebuffers[0]; - } - return fb; + return ctx->framebuffers.find(ctx->get_binding(target)); } template <typename T> @@ -2092,7 +1766,9 @@ static inline uint32_t clear_chunk(uint16_t value) { return uint32_t(value) | (uint32_t(value) << 16); } -static inline uint32_t clear_chunk(uint32_t value) { return value; } +static inline uint32_t clear_chunk(uint32_t value) { + return value; +} template <typename T> static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { @@ -2115,22 +1791,20 @@ static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { } template <typename T> -static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, - int skip_end = 0) { +static void clear_buffer(Texture& t, T value, int layer, IntRect bb, + int skip_start = 0, int skip_end = 0) { if (!t.buf) return; skip_start = max(skip_start, bb.x0); skip_end = max(skip_end, skip_start); assert(sizeof(T) == t.bpp()); - size_t stride = t.stride(); - // When clearing multiple full-width rows, collapse them into a single large - // "row" to avoid redundant setup from clearing each row individually. Note - // that we can only safely do this if the stride is tightly packed. - if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end && - (t.should_free() || stride == t.width * sizeof(T))) { + size_t stride = t.stride(sizeof(T)); + // When clearing multiple full-width rows, collapse them into a single + // large "row" to avoid redundant setup from clearing each row individually. + if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end) { bb.x1 += (stride / sizeof(T)) * (bb.height() - 1); bb.y1 = bb.y0 + 1; } - T* buf = (T*)t.sample_ptr(bb.x0, bb.y0); + T* buf = (T*)t.sample_ptr(bb.x0, bb.y0, layer, sizeof(T), stride); uint32_t chunk = clear_chunk(value); for (int rows = bb.height(); rows > 0; rows--) { if (bb.x0 < skip_start) { @@ -2144,12 +1818,20 @@ static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, } template <typename T> +static inline void clear_buffer(Texture& t, T value, int layer = 0) { + IntRect bb = ctx->apply_scissor(t.bounds()); + if (bb.width() > 0) { + clear_buffer<T>(t, value, layer, bb); + } +} + +template <typename T> static inline void force_clear_row(Texture& t, int y, int skip_start = 0, int skip_end = 0) { assert(t.buf != nullptr); assert(sizeof(T) == t.bpp()); assert(skip_start <= skip_end); - T* buf = (T*)t.sample_ptr(0, y); + T* buf = (T*)t.sample_ptr(0, y, 0, sizeof(T)); uint32_t chunk = clear_chunk((T)t.clear_val); if (skip_start > 0) { clear_row<T>(buf, skip_start, t.clear_val, chunk); @@ -2188,9 +1870,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { while (mask) { int count = __builtin_ctz(mask); if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; start += count; mask >>= count; @@ -2201,9 +1883,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { } int count = (i + 1) * 32 - start; if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; } } @@ -2220,7 +1902,7 @@ static void prepare_texture(Texture& t, const IntRect* skip) { case GL_R8: 
force_clear<uint8_t>(t, skip); break; - case GL_RG8: + case GL_DEPTH_COMPONENT16: force_clear<uint16_t>(t, skip); break; default: @@ -2230,53 +1912,31 @@ static void prepare_texture(Texture& t, const IntRect* skip) { } } -// Setup a clear on a texture. This may either force an immediate clear or -// potentially punt to a delayed clear, if applicable. -template <typename T> -static void request_clear(Texture& t, T value, const IntRect& scissor) { - // If the clear would require a scissor, force clear anything outside - // the scissor, and then immediately clear anything inside the scissor. - if (!scissor.contains(t.offset_bounds())) { - IntRect skip = scissor - t.offset; - force_clear<T>(t, &skip); - clear_buffer<T>(t, value, skip.intersection(t.bounds())); - } else { - // Do delayed clear for 2D texture without scissor. - t.enable_delayed_clear(value); - } -} - -template <typename T> -static inline void request_clear(Texture& t, T value) { - // If scissoring is enabled, use the scissor rect. Otherwise, just scissor to - // the entire texture bounds. - request_clear(t, value, ctx->scissortest ? ctx->scissor : t.offset_bounds()); -} - extern "C" { -void InitDefaultFramebuffer(int x, int y, int width, int height, int stride, - void* buf) { +void InitDefaultFramebuffer(int width, int height) { Framebuffer& fb = ctx->framebuffers[0]; if (!fb.color_attachment) { GenTextures(1, &fb.color_attachment); + fb.layer = 0; } - // If the dimensions or buffer properties changed, we need to reallocate - // the underlying storage for the color buffer texture. Texture& colortex = ctx->textures[fb.color_attachment]; - set_tex_storage(colortex, GL_RGBA8, width, height, buf, stride); - colortex.offset = IntPoint(x, y); + if (colortex.width != width || colortex.height != height) { + colortex.cleanup(); + set_tex_storage(colortex, GL_RGBA8, width, height); + } if (!fb.depth_attachment) { GenTextures(1, &fb.depth_attachment); } - // Ensure dimensions of the depth buffer match the color buffer. Texture& depthtex = ctx->textures[fb.depth_attachment]; - set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height); - depthtex.offset = IntPoint(x, y); + if (depthtex.width != width || depthtex.height != height) { + depthtex.cleanup(); + set_tex_storage(depthtex, GL_DEPTH_COMPONENT16, width, height); + } } void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, - int32_t* height, int32_t* stride) { + int32_t* height) { Framebuffer* fb = ctx->framebuffers.find(fbo); if (!fb || !fb->color_attachment) { return nullptr; @@ -2285,33 +1945,16 @@ void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, if (flush) { prepare_texture(colortex); } - assert(colortex.offset == IntPoint(0, 0)); - if (width) { - *width = colortex.width; - } - if (height) { - *height = colortex.height; - } - if (stride) { - *stride = colortex.stride(); - } - return colortex.buf ? colortex.sample_ptr(0, 0) : nullptr; -} - -void ResolveFramebuffer(GLuint fbo) { - Framebuffer* fb = ctx->framebuffers.find(fbo); - if (!fb || !fb->color_attachment) { - return; - } - Texture& colortex = ctx->textures[fb->color_attachment]; - prepare_texture(colortex); + *width = colortex.width; + *height = colortex.height; + return colortex.buf ? 
colortex.sample_ptr(0, 0, fb->layer) : nullptr; } void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width, - GLsizei height, GLsizei stride, void* buf, - GLsizei min_width, GLsizei min_height) { + GLsizei height, void* buf, GLsizei min_width, + GLsizei min_height) { Texture& t = ctx->textures[texid]; - set_tex_storage(t, internal_format, width, height, buf, stride, min_width, + set_tex_storage(t, internal_format, width, height, !buf, buf, min_width, min_height); } @@ -2323,170 +1966,57 @@ GLenum CheckFramebufferStatus(GLenum target) { return GL_FRAMEBUFFER_COMPLETE; } -void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, - GLint zoffset, GLsizei width, GLsizei height, - GLsizei depth, GLenum format, GLenum type, - const void* data) { - if (level != 0) { - assert(false); - return; - } - Texture& t = ctx->textures[texture]; - assert(!t.locked); - if (width <= 0 || height <= 0 || depth <= 0) { - return; - } - assert(zoffset == 0 && depth == 1); - IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height}; - if (t.internal_format == GL_DEPTH_COMPONENT24) { - uint32_t value = 0xFFFFFF; - switch (format) { - case GL_DEPTH_COMPONENT: - switch (type) { - case GL_DOUBLE: - value = uint32_t(*(const GLdouble*)data * 0xFFFFFF); - break; - case GL_FLOAT: - value = uint32_t(*(const GLfloat*)data * 0xFFFFFF); - break; - default: - assert(false); - break; - } - break; - default: - assert(false); - break; - } - if (t.cleared() && !scissor.contains(t.offset_bounds())) { - // If we need to scissor the clear and the depth buffer was already - // initialized, then just fill runs for that scissor area. - t.fill_depth_runs(value, scissor); - } else { - // Otherwise, the buffer is either uninitialized or the clear would - // encompass the entire buffer. If uninitialized, we can safely fill - // the entire buffer with any value and thus ignore any scissoring. - t.init_depth_runs(value); - } - return; - } - - uint32_t color = 0xFF000000; - switch (type) { - case GL_FLOAT: { - const GLfloat* f = (const GLfloat*)data; - Float v = {0.0f, 0.0f, 0.0f, 1.0f}; - switch (format) { - case GL_RGBA: - v.w = f[3]; // alpha - FALLTHROUGH; - case GL_RGB: - v.z = f[2]; // blue - FALLTHROUGH; - case GL_RG: - v.y = f[1]; // green - FALLTHROUGH; - case GL_RED: - v.x = f[0]; // red - break; - default: - assert(false); - break; - } - color = bit_cast<uint32_t>(CONVERT(round_pixel(v), U8)); - break; - } - case GL_UNSIGNED_BYTE: { - const GLubyte* b = (const GLubyte*)data; - switch (format) { - case GL_RGBA: - color = (color & ~0xFF000000) | (uint32_t(b[3]) << 24); // alpha - FALLTHROUGH; - case GL_RGB: - color = (color & ~0x00FF0000) | (uint32_t(b[2]) << 16); // blue - FALLTHROUGH; - case GL_RG: - color = (color & ~0x0000FF00) | (uint32_t(b[1]) << 8); // green - FALLTHROUGH; - case GL_RED: - color = (color & ~0x000000FF) | uint32_t(b[0]); // red - break; - default: - assert(false); - break; - } - break; - } - default: - assert(false); - break; - } - - switch (t.internal_format) { - case GL_RGBA8: - // Clear color needs to swizzle to BGRA. 
- request_clear<uint32_t>(t, - (color & 0xFF00FF00) | - ((color << 16) & 0xFF0000) | - ((color >> 16) & 0xFF), - scissor); - break; - case GL_R8: - request_clear<uint8_t>(t, uint8_t(color & 0xFF), scissor); - break; - case GL_RG8: - request_clear<uint16_t>(t, uint16_t(color & 0xFFFF), scissor); - break; - default: - assert(false); - break; - } -} - -void ClearTexImage(GLuint texture, GLint level, GLenum format, GLenum type, - const void* data) { - Texture& t = ctx->textures[texture]; - IntRect scissor = t.offset_bounds(); - ClearTexSubImage(texture, level, scissor.x0, scissor.y0, 0, scissor.width(), - scissor.height(), 1, format, type, data); +static inline bool clear_requires_scissor(Texture& t) { + return ctx->scissortest && !ctx->scissor.contains(t.bounds()); } void Clear(GLbitfield mask) { - Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER); if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) { Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = ctx->scissortest - ? ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - ctx->clearcolor); + if (t.internal_format == GL_RGBA8) { + uint32_t color = ctx->clearcolor; + // If the clear would require a scissor, force clear anything outside + // the scissor, and then immediately clear anything inside the scissor. + if (clear_requires_scissor(t)) { + force_clear<uint32_t>(t, &ctx->scissor); + clear_buffer<uint32_t>(t, color, fb.layer); + } else if (t.depth > 1) { + // Delayed clear is not supported on texture arrays. + t.disable_delayed_clear(); + clear_buffer<uint32_t>(t, color, fb.layer); + } else { + // Do delayed clear for 2D texture without scissor. + t.enable_delayed_clear(color); + } + } else if (t.internal_format == GL_R8) { + uint8_t color = uint8_t((ctx->clearcolor >> 16) & 0xFF); + if (clear_requires_scissor(t)) { + force_clear<uint8_t>(t, &ctx->scissor); + clear_buffer<uint8_t>(t, color, fb.layer); + } else if (t.depth > 1) { + t.disable_delayed_clear(); + clear_buffer<uint8_t>(t, color, fb.layer); + } else { + t.enable_delayed_clear(color); + } + } else { + assert(false); + } } if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) { Texture& t = ctx->textures[fb.depth_attachment]; - IntRect scissor = ctx->scissortest - ? 
ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.depth_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_DEPTH_COMPONENT, - GL_DOUBLE, &ctx->cleardepth); + assert(t.internal_format == GL_DEPTH_COMPONENT16); + uint16_t depth = uint16_t(0xFFFF * ctx->cleardepth) - 0x8000; + if (clear_requires_scissor(t)) { + force_clear<uint16_t>(t, &ctx->scissor); + clear_buffer<uint16_t>(t, depth); + } else { + t.enable_delayed_clear(depth); + } } } -void ClearColorRect(GLuint fbo, GLint xoffset, GLint yoffset, GLsizei width, - GLsizei height, GLfloat r, GLfloat g, GLfloat b, - GLfloat a) { - GLfloat color[] = {r, g, b, a}; - Framebuffer& fb = ctx->framebuffers[fbo]; - Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = - IntRect{xoffset, yoffset, xoffset + width, yoffset + height}.intersection( - t.offset_bounds()); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - color); -} - void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, const GLenum* attachments) { Framebuffer* fb = get_framebuffer(target); @@ -2497,7 +2027,7 @@ void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, switch (attachments[i]) { case GL_DEPTH_ATTACHMENT: { Texture& t = ctx->textures[fb->depth_attachment]; - t.set_cleared(false); + t.disable_delayed_clear(); break; } case GL_COLOR_ATTACHMENT0: { @@ -2516,58 +2046,40 @@ void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER || - format == GL_BGRA || format == GL_RG); + format == GL_BGRA); Texture& t = ctx->textures[fb->color_attachment]; if (!t.buf) return; prepare_texture(t); // debugf("read pixels %d, %d, %d, %d from fb %d with format %x\n", x, y, // width, height, ctx->read_framebuffer_binding, t.internal_format); - x -= t.offset.x; - y -= t.offset.y; - assert(x >= 0 && y >= 0); assert(x + width <= t.width); assert(y + height <= t.height); if (internal_format_for_data(format, type) != t.internal_format) { debugf("mismatched format for read pixels: %x vs %x\n", t.internal_format, internal_format_for_data(format, type)); assert(false); - return; - } - // Only support readback conversions that are reversible - assert(!format_requires_conversion(format, t.internal_format) || - bytes_for_internal_format(format) == t.bpp()); - uint8_t* dest = (uint8_t*)data; - size_t destStride = width * t.bpp(); - if (y < 0) { - dest += -y * destStride; - height += y; - y = 0; - } - if (y + height > t.height) { - height = t.height - y; - } - if (x < 0) { - dest += -x * t.bpp(); - width += x; - x = 0; } - if (x + width > t.width) { - width = t.width - x; - } - if (width <= 0 || height <= 0) { - return; + int bpp = t.bpp(); + char* dest = (char*)data; + size_t src_stride = t.stride(bpp); + char* src = t.sample_ptr(x, y, fb->layer, bpp, src_stride); + for (; height > 0; height--) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += width * bpp; + src += src_stride; } - convert_copy(format, t.internal_format, dest, destStride, - (const uint8_t*)t.sample_ptr(x, y), t.stride(), width, height); } void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint 
dstName, - GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, - GLint dstY, GLint dstZ, GLsizei srcWidth, - GLsizei srcHeight, GLsizei srcDepth) { + GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, GLint dstY, + GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, + GLsizei srcDepth) { assert(srcLevel == 0 && dstLevel == 0); - assert(srcZ == 0 && srcDepth == 1 && dstZ == 0); if (srcTarget == GL_RENDERBUFFER) { Renderbuffer& rb = ctx->renderbuffers[srcName]; srcName = rb.texture; @@ -2581,44 +2093,532 @@ void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, prepare_texture(srctex); Texture& dsttex = ctx->textures[dstName]; if (!dsttex.buf) return; - assert(!dsttex.locked); IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; prepare_texture(dsttex, &skip); assert(srctex.internal_format == dsttex.internal_format); assert(srcWidth >= 0); assert(srcHeight >= 0); + assert(srcDepth >= 0); assert(srcX + srcWidth <= srctex.width); assert(srcY + srcHeight <= srctex.height); + assert(srcZ + srcDepth <= max(srctex.depth, 1)); assert(dstX + srcWidth <= dsttex.width); assert(dstY + srcHeight <= dsttex.height); + assert(dstZ + srcDepth <= max(dsttex.depth, 1)); int bpp = srctex.bpp(); - int src_stride = srctex.stride(); - int dest_stride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstX, dstY); - char* src = srctex.sample_ptr(srcX, srcY); - for (int y = 0; y < srcHeight; y++) { - memcpy(dest, src, srcWidth * bpp); - dest += dest_stride; - src += src_stride; + int src_stride = srctex.stride(bpp); + int dest_stride = dsttex.stride(bpp); + for (int z = 0; z < srcDepth; z++) { + char* dest = dsttex.sample_ptr(dstX, dstY, dstZ + z, bpp, dest_stride); + char* src = srctex.sample_ptr(srcX, srcY, srcZ + z, bpp, src_stride); + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } } } -void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, - GLint yoffset, GLint x, GLint y, GLsizei width, +void CopyTexSubImage3D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height) { assert(level == 0); Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; - CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D, 0, x, y, 0, - ctx->get_binding(target), GL_TEXTURE_2D, 0, xoffset, yoffset, - 0, width, height, 1); + CopyImageSubData(fb->color_attachment, GL_TEXTURE_3D, 0, x, y, fb->layer, + ctx->get_binding(target), GL_TEXTURE_3D, 0, xoffset, yoffset, + zoffset, width, height, 1); +} + +void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint x, GLint y, GLsizei width, GLsizei height) { + assert(level == 0); + Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!fb) return; + CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D_ARRAY, 0, x, y, + fb->layer, ctx->get_binding(target), GL_TEXTURE_2D_ARRAY, 0, + xoffset, yoffset, 0, width, height, 1); } } // extern "C" -#include "blend.h" -#include "composite.h" -#include "swgl_ext.h" +using PackedRGBA8 = V16<uint8_t>; +using WideRGBA8 = V16<uint16_t>; +using HalfRGBA8 = V8<uint16_t>; + +static inline WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } + +static inline PackedRGBA8 pack(WideRGBA8 p) { +#if USE_SSE2 + return _mm_packus_epi16(lowHalf(p), highHalf(p)); +#elif USE_NEON + return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p))); +#else + return CONVERT(p, PackedRGBA8); +#endif +} + 
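The unpack/pack pair introduced here is the workhorse of the blending code that follows: unpack zero-extends each 8-bit color channel into a 16-bit lane so intermediate blend math cannot wrap, and pack saturates the widened lanes back down to 8 bits. A rough scalar sketch of that round-trip (illustrative only; widen/narrow are hypothetical names, not part of swgl):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar stand-in for unpack(): zero-extend a channel to 16 bits.
static inline uint16_t widen(uint8_t c) { return c; }

// Scalar stand-in for pack(): saturating narrow back to 8 bits, as
// _mm_packus_epi16 and vqmovn_u16 do per SIMD lane.
static inline uint8_t narrow(uint16_t w) {
  return uint8_t(std::min<int>(w, 255));
}

int main() {
  uint16_t sum = widen(200) + widen(180);  // 380 survives in 16 bits
  printf("%d -> %d\n", sum, narrow(sum));  // prints "380 -> 255"
  return 0;
}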
+static inline HalfRGBA8 packRGBA8(I32 a, I32 b) { +#if USE_SSE2 + return _mm_packs_epi32(a, b); +#elif USE_NEON + return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); +#else + return CONVERT(combine(a, b), HalfRGBA8); +#endif +} + +using PackedR8 = V4<uint8_t>; +using WideR8 = V4<uint16_t>; + +static inline WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } + +static inline WideR8 packR8(I32 a) { +#if USE_SSE2 + return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovun_s32(a); +#else + return CONVERT(a, WideR8); +#endif +} + +static inline PackedR8 pack(WideR8 p) { +#if USE_SSE2 + auto m = expand(p); + auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m)); + return SHUFFLE(r, r, 0, 1, 2, 3); +#elif USE_NEON + return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p)))); +#else + return CONVERT(p, PackedR8); +#endif +} + +using ZMask4 = V4<int16_t>; +using ZMask8 = V8<int16_t>; + +static inline PackedRGBA8 unpack(ZMask4 mask, uint32_t*) { + return bit_cast<PackedRGBA8>(mask.xxyyzzww); +} + +static inline WideR8 unpack(ZMask4 mask, uint8_t*) { + return bit_cast<WideR8>(mask); +} + +#if USE_SSE2 +# define ZMASK_NONE_PASSED 0xFFFF +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask8 mask) { + return _mm_movemask_epi8(mask); +} +static inline uint32_t zmask_code(ZMask4 mask) { + return zmask_code(mask.xyzwxyzw); +} +#else +using ZMask4Code = V4<uint8_t>; +using ZMask8Code = V8<uint8_t>; +# define ZMASK_NONE_PASSED 0xFFFFFFFFU +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask4 mask) { + return bit_cast<uint32_t>(CONVERT(mask, ZMask4Code)); +} +static inline uint32_t zmask_code(ZMask8 mask) { + return zmask_code( + ZMask4((U16(lowHalf(mask)) >> 12) | (U16(highHalf(mask)) << 4))); +} +#endif + +template <int FUNC, bool MASK> +static ALWAYS_INLINE int check_depth8(uint16_t z, uint16_t* zbuf, + ZMask8& outmask) { + ZMask8 dest = unaligned_load<ZMask8>(zbuf); + ZMask8 src = int16_t(z); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask8 mask = FUNC == GL_LEQUAL ? + // GL_LEQUAL: Not(LessEqual) = Greater + ZMask8(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask8(src >= dest); + switch (zmask_code(mask)) { + case ZMASK_NONE_PASSED: + return 0; + case ZMASK_ALL_PASSED: + if (MASK) { + unaligned_store(zbuf, src); + } + return -1; + default: + if (MASK) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return 1; + } +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(ZMask4 src, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask4 mask = ctx->depthfunc == GL_LEQUAL + ? 
+ // GL_LEQUAL: Not(LessEqual) = Greater + ZMask4(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask4(src >= dest); + if (!FULL_SPANS) { + mask |= ZMask4(span) < ZMask4{1, 2, 3, 4}; + } + if (zmask_code(mask) == ZMASK_NONE_PASSED) { + return false; + } + if (!DISCARD && ctx->depthmask) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return true; +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(uint16_t z, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + return check_depth4<FULL_SPANS, DISCARD>(ZMask4(int16_t(z)), zbuf, outmask, + span); +} + +template <typename T> +static inline ZMask4 packZMask4(T a) { +#if USE_SSE2 + return lowHalf(bit_cast<ZMask8>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovn_s32(a); +#else + return CONVERT(a, ZMask4); +#endif +} + +static ALWAYS_INLINE ZMask4 packDepth() { + return packZMask4(cast(fragment_shader->gl_FragCoord.z * 0xFFFF) - 0x8000); +} + +static ALWAYS_INLINE void discard_depth(ZMask4 src, uint16_t* zbuf, + ZMask4 mask) { + if (ctx->depthmask) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + mask |= packZMask4(fragment_shader->isPixelDiscarded); + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } +} + +static ALWAYS_INLINE void discard_depth(uint16_t z, uint16_t* zbuf, + ZMask4 mask) { + discard_depth(ZMask4(int16_t(z)), zbuf, mask); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) { + ivec4 i = round_pixel(v); + HalfRGBA8 xz = packRGBA8(i.z, i.x); + HalfRGBA8 yw = packRGBA8(i.y, i.w); + HalfRGBA8 xy = zipLow(xz, yw); + HalfRGBA8 zw = zipHigh(xz, yw); + HalfRGBA8 lo = zip2Low(xy, zw); + HalfRGBA8 hi = zip2High(xy, zw); + return combine(lo, hi); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) { + I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}); + HalfRGBA8 c = packRGBA8(i, i); + return combine(c, c); +} + +static inline WideRGBA8 pack_pixels_RGBA8() { + return pack_pixels_RGBA8(fragment_shader->gl_FragColor); +} + +template <typename V> +static inline PackedRGBA8 pack_span(uint32_t*, const V& v) { + return pack(pack_pixels_RGBA8(v)); +} + +static inline PackedRGBA8 pack_span(uint32_t*) { + return pack(pack_pixels_RGBA8()); +} + +// (x*y + x) >> 8, cheap approximation of (x*y) / 255 +template <typename T> +static inline T muldiv255(T x, T y) { + return (x * y + x) >> 8; +} + +// Byte-wise addition for when x or y is a signed 8-bit value stored in the +// low byte of a larger type T only with zeroed-out high bits, where T is +// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used +// upon signed operands, using up all the precision in a 16 bit integer, and +// potentially losing the sign bit in the last >> 8 shift. Due to the +// properties of two's complement arithmetic, even though we've discarded the +// sign bit, we can still represent a negative number under addition (without +// requiring any extra sign bits), just that any negative number will behave +// like a large unsigned number under addition, generating a single carry bit +// on overflow that we need to discard. Thus, just doing a byte-wise add will +// overflow without the troublesome carry, giving us only the remaining 8 low +// bits we actually need while keeping the high bits at zero. 
+template <typename T> +static inline T addlow(T x, T y) { + typedef VectorType<uint8_t, sizeof(T)> bytes; + return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y)); +} + +static inline WideRGBA8 alphas(WideRGBA8 c) { + return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); +} + +static inline WideRGBA8 blend_pixels_RGBA8(PackedRGBA8 pdst, WideRGBA8 src) { + WideRGBA8 dst = unpack(pdst); + const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, + 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0}; + const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, + 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; + const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, + 0, 0, 0, 255, 0, 0, 0, 255}; + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE): + // dst + src.a*(src.rgb1 - dst.rgb0) + // use addlow for signed overflow + return addlow(dst, + muldiv255(alphas(src), (src | ALPHA_OPAQUE) - (dst & RGB_MASK))); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): + return dst - muldiv255(dst, src); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): + return dst - (muldiv255(dst, src) & RGB_MASK); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): + return dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - (muldiv255(dst, src) & ALPHA_MASK); + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + case BLEND_KEY(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): + // src*(1-dst.a) + dst*1 = src - src*dst.a + dst + return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); + case BLEND_KEY(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): + // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst) + // use addlow for signed overflow + return addlow(dst, + muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst)); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor); + return src + dst - muldiv255(dst, secondary); + } + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf, PackedRGBA8 mask) { + PackedRGBA8 dst = unaligned_load<PackedRGBA8>(buf); + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(dst, r); + if (DISCARD) mask |= bit_cast<PackedRGBA8>(fragment_shader->isPixelDiscarded); + unaligned_store(buf, (mask & dst) | (~mask & pack(r))); +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint32_t* buf) { + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), r); + unaligned_store(buf, pack(r)); +} + +static inline PackedRGBA8 span_mask_RGBA8(int span) { + return bit_cast<PackedRGBA8>(I32(span) < I32{1, 2, 3, 4}); +} + +static inline PackedRGBA8 span_mask(uint32_t*, int span) { + return span_mask_RGBA8(span); +} + +static inline WideR8 pack_pixels_R8(Float c) { + return packR8(round_pixel(c)); +} + +static inline WideR8 pack_pixels_R8() { + return pack_pixels_R8(fragment_shader->gl_FragColor.x); +} + 
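Most of the blend cases above funnel through muldiv255, so its accuracy matters. A small standalone check (an illustrative sketch, not swgl code) confirms that the (x*y + x) >> 8 form never undershoots floor(x*y/255) and overshoots it by at most 1 for 8-bit operands:

#include <cassert>
#include <cstdio>

// Same form as the helper above: (x*y + x) >> 8 == x*(y + 1) / 256.
static inline int muldiv255(int x, int y) { return (x * y + x) >> 8; }

int main() {
  int worst = 0;
  // Exhaustively compare against the exact floored product over all
  // 8-bit operand pairs.
  for (int x = 0; x <= 255; x++) {
    for (int y = 0; y <= 255; y++) {
      int diff = muldiv255(x, y) - (x * y) / 255;
      assert(diff >= 0 && diff <= 1);
      if (diff > worst) worst = diff;
    }
  }
  printf("max deviation from floor(x*y/255): %d\n", worst);  // prints 1
  return 0;
}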
+template <typename C> +static inline PackedR8 pack_span(uint8_t*, C c) { + return pack(pack_pixels_R8(c)); +} + +static inline PackedR8 pack_span(uint8_t*) { return pack(pack_pixels_R8()); } + +static inline WideR8 blend_pixels_R8(WideR8 dst, WideR8 src) { + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf, WideR8 mask) { + WideR8 dst = unpack(unaligned_load<PackedR8>(buf)); + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(dst, r); + if (DISCARD) mask |= packR8(fragment_shader->isPixelDiscarded); + unaligned_store(buf, pack((mask & dst) | (~mask & r))); +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint8_t* buf) { + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), r); + unaligned_store(buf, pack(r)); +} + +static inline WideR8 span_mask_R8(int span) { + return bit_cast<WideR8>(WideR8(span) < WideR8{1, 2, 3, 4}); +} + +static inline WideR8 span_mask(uint8_t*, int span) { + return span_mask_R8(span); +} + +template <bool DISCARD, bool W, typename P, typename M> +static inline void commit_output(P* buf, M mask) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf, mask); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf, int span) { + commit_output<DISCARD, W>(buf, span_mask(buf, span)); +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf) { + ZMask4 zmask; + if (check_depth4<true, DISCARD>(z, zbuf, zmask)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } else { + fragment_shader->skip<W>(); + } +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf, int span) { + ZMask4 zmask; + if (check_depth4<false, DISCARD>(z, zbuf, zmask, span)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } +} + +static inline void commit_span(uint32_t* buf, PackedRGBA8 r) { + if (blend_key) + r = pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), unpack(r))); + unaligned_store(buf, r); +} + +UNUSED static inline void commit_solid_span(uint32_t* buf, PackedRGBA8 r, + int len) { + if (blend_key) { + auto src = unpack(r); + for (uint32_t* end = &buf[len]; buf < end; buf += 4) { + unaligned_store( + buf, pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), src))); + } + } else { + fill_n(buf, len, bit_cast<U32>(r).x); + } +} + +UNUSED static inline void commit_texture_span(uint32_t* buf, uint32_t* src, + int len) { + if (blend_key) { + for (uint32_t* end = &buf[len]; buf < end; buf += 4, src += 4) { + PackedRGBA8 r = unaligned_load<PackedRGBA8>(src); + unaligned_store(buf, pack(blend_pixels_RGBA8( + unaligned_load<PackedRGBA8>(buf), unpack(r)))); + } + } else { + memcpy(buf, src, len * sizeof(uint32_t)); + } +} + +static inline void 
commit_span(uint8_t* buf, PackedR8 r) {
+  if (blend_key)
+    r = pack(blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), unpack(r)));
+  unaligned_store(buf, r);
+}
+
+UNUSED static inline void commit_solid_span(uint8_t* buf, PackedR8 r, int len) {
+  if (blend_key) {
+    auto src = unpack(r);
+    for (uint8_t* end = &buf[len]; buf < end; buf += 4) {
+      unaligned_store(buf, pack(blend_pixels_R8(
+                               unpack(unaligned_load<PackedR8>(buf)), src)));
+    }
+  } else {
+    fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(r));
+  }
+}
+
+#define DISPATCH_DRAW_SPAN(self, buf, len) do { \
+  int drawn = self->draw_span(buf, len); \
+  if (drawn) self->step_interp_inputs(drawn >> 2); \
+  for (buf += drawn; drawn < len; drawn += 4, buf += 4) { \
+    run(self); \
+    commit_span(buf, pack_span(buf)); \
+  } \
+} while (0)
+
+#include "texture.h"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wuninitialized"
@@ -2627,14 +2627,942 @@ void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset,
 #pragma GCC diagnostic ignored "-Wunused-variable"
 #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
 #ifdef __clang__
-# pragma GCC diagnostic ignored "-Wunused-private-field"
+#pragma GCC diagnostic ignored "-Wunused-private-field"
 #else
-# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
 #endif
 #include "load_shader.h"
 #pragma GCC diagnostic pop
-#include "rasterize.h"
+typedef vec2_scalar Point2D;
+typedef vec4_scalar Point3D;
+
+struct ClipRect {
+  float x0;
+  float y0;
+  float x1;
+  float y1;
+
+  ClipRect(const IntRect& i) : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
+  ClipRect(Texture& t) : ClipRect(ctx->apply_scissor(t.bounds())) {}
+
+  template <typename P>
+  bool overlaps(int nump, const P* p) const {
+    // Generate a mask of which side of the clip rect all of a polygon's points
+    // fall inside of. This is a cheap conservative estimate of whether the
+    // bounding box of the polygon might overlap the clip rect, rather than an
+    // exact test that would require multiple slower line intersections.
+    int sides = 0;
+    for (int i = 0; i < nump; i++) {
+      sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
+      sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
+    }
+    return sides == 0xF;
+  }
+};
+
+// Helper function for drawing 8-pixel wide chunks of a span with depth buffer.
+// Using 8-pixel chunks maximizes use of 16-bit depth values in a 128-bit wide
+// SIMD register. However, since fragment shaders process only 4 pixels per
+// invocation, we need to run the fragment shader twice for every 8-pixel
+// batch of results we get from the depth test. Perspective is not supported.
+template <int FUNC, bool MASK, typename P>
+static inline void draw_depth_span(uint16_t z, P* buf, uint16_t* depth,
+                                   int span) {
+  int skip = 0;
+  // Check if the fragment shader has an optimized draw specialization.
+  if (fragment_shader->has_draw_span(buf)) {
+    // The loop tries to accumulate runs of pixels that passed (len) and
+    // runs of pixels that failed (skip). This allows it to pass the largest
+    // possible span in between changes in depth pass or fail status to the
+    // fragment shader's draw specialization.
+    int len = 0;
+    do {
+      ZMask8 zmask;
+      // Process depth in 8-pixel chunks.
+      switch (check_depth8<FUNC, MASK>(z, depth, zmask)) {
+        case 0:  // All pixels failed the depth test.
+          if (len) {
+            // Flush out passed pixels.
+            fragment_shader->draw_span(buf - len, len);
+            len = 0;
+          }
+          // Accumulate 2 skipped chunks.
+          skip += 2;
+          break;
+        case -1:  // All pixels passed the depth test.
+          if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Accumulate 8 passed pixels.
+          len += 8;
+          break;
+        default:  // Mixture of pass and fail results.
+          if (len) {
+            // Flush out any passed pixels.
+            fragment_shader->draw_span(buf - len, len);
+            len = 0;
+          } else if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Run fragment shader on first 4 depth results.
+          commit_output<false, false>(buf, unpack(lowHalf(zmask), buf));
+          // Run fragment shader on next 4 depth results.
+          commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf));
+          break;
+      }
+      // Advance to next 8 pixels...
+      buf += 8;
+      depth += 8;
+      span -= 8;
+    } while (span >= 8);
+    // Flush out any remaining passed pixels.
+    if (len) {
+      fragment_shader->draw_span(buf - len, len);
+    }
+  } else {
+    // No draw specialization, so we can use a simpler loop here that just
+    // accumulates depth failures, but otherwise invokes the fragment shader
+    // immediately on depth pass.
+    do {
+      ZMask8 zmask;
+      // Process depth in 8-pixel chunks.
+      switch (check_depth8<FUNC, MASK>(z, depth, zmask)) {
+        case 0:  // All pixels failed the depth test.
+          // Accumulate 2 skipped chunks.
+          skip += 2;
+          break;
+        case -1:  // All pixels passed the depth test.
+          if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Run the fragment shader for two 4-pixel chunks.
+          commit_output<false, false>(buf);
+          commit_output<false, false>(buf + 4);
+          break;
+        default:  // Mixture of pass and fail results.
+          if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Run fragment shader on first 4 depth results.
+          commit_output<false, false>(buf, unpack(lowHalf(zmask), buf));
+          // Run fragment shader on next 4 depth results.
+          commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf));
+          break;
+      }
+      // Advance to next 8 pixels...
+      buf += 8;
+      depth += 8;
+      span -= 8;
+    } while (span >= 8);
+  }
+  // Flush out any remaining skipped chunks.
+  if (skip) {
+    fragment_shader->skip(skip);
+  }
+}
+
+// Draw a simple span in 4-pixel wide chunks, optionally using depth.
+template <bool DISCARD, bool W, typename P, typename Z>
+static ALWAYS_INLINE void draw_span(P* buf, uint16_t* depth, int span, Z z) {
+  if (depth) {
+    // Depth testing is enabled. If perspective is used, Z values will vary
+    // across the span, so we use packDepth to generate 16-bit Z values
+    // suitable for depth testing based on current values from gl_FragCoord.z.
+    // Otherwise, for the no-perspective case, we just use the provided Z.
+    // Process 4-pixel chunks first.
+    for (; span >= 4; span -= 4, buf += 4, depth += 4) {
+      commit_output<DISCARD, W>(buf, z(), depth);
+    }
+    // If there are any remaining pixels, do a partial chunk.
+    if (span > 0) {
+      commit_output<DISCARD, W>(buf, z(), depth, span);
+    }
+  } else {
+    // Process 4-pixel chunks first.
+    for (; span >= 4; span -= 4, buf += 4) {
+      commit_output<DISCARD, W>(buf);
+    }
+    // If there are any remaining pixels, do a partial chunk.
+    if (span > 0) {
+      commit_output<DISCARD, W>(buf, span);
+    }
+  }
+}
+
+// Draw spans for each row of a given quad (or triangle) with a constant Z
+// value. The quad is assumed convex. It is clipped to fall within the given
+// clip rect.
In short, this function rasterizes a quad by first finding a +// top most starting point and then from there tracing down the left and right +// sides of this quad until it hits the bottom, outputting a span between the +// current left and right positions at each row along the way. Points are +// assumed to be ordered in either CW or CCW to support this, but currently +// both orders (CW and CCW) are supported and equivalent. +template <typename P> +static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex, + const ClipRect& clipRect) { + // Only triangles and convex quads supported. + assert(nump == 3 || nump == 4); + Point2D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most (smallest Y) point from which + // rasterization can start. + int top = nump > 3 && p[3].y < p[2].y + ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) + : (p[1].y < p[3].y ? 1 : 3)) + : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) + : (p[1].y < p[2].y ? 1 : 2)); + // Helper to find next index in the points array, walking forward. +#define NEXT_POINT(idx) \ + ({ \ + int cur = (idx) + 1; \ + cur < nump ? cur : 0; \ + }) + // Helper to find the previous index in the points array, walking backward. +#define PREV_POINT(idx) \ + ({ \ + int cur = (idx)-1; \ + cur >= 0 ? cur : nump - 1; \ + }) + // Start looking for "left"-side and "right"-side descending edges starting + // from the determined top point. + int next = NEXT_POINT(top); + int prev = PREV_POINT(top); + if (p[top].y == p[next].y) { + // If the next point is on the same row as the top, then advance one more + // time to the next point and use that as the "left" descending edge. + l0i = next; + l1i = NEXT_POINT(next); + // Assume top and prev form a descending "right" edge, as otherwise this + // will be a collapsed polygon and harmlessly bail out down below. + r0i = top; + r1i = prev; + } else if (p[top].y == p[prev].y) { + // If the prev point is on the same row as the top, then advance to the + // prev again and use that as the "right" descending edge. + // Assume top and next form a non-empty descending "left" edge. + l0i = top; + l1i = next; + r0i = prev; + r1i = PREV_POINT(prev); + } else { + // Both next and prev are on distinct rows from top, so both "left" and + // "right" edges are non-empty/descending. + l0i = r0i = top; + l1i = next; + r1i = prev; + } + // Load the points from the indices. + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // End of left edge + l1 = p[l1i]; // Start of right edge + r1 = p[r1i]; // End of right edge + // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: + // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, + // r1.x, r1.y); + } + + struct Edge + { + float yScale; + float xSlope; + float x; + Interpolants interpSlope; + Interpolants interp; + + Edge(float y, const Point2D& p0, const Point2D& p1, + const Interpolants& i0, const Interpolants& i1) : + // Inverse Y scale for slope calculations. Avoid divide on 0-length edge. + // Later checks below ensure that Y <= p1.y, or otherwise we don't use + // this edge. We just need to guard against Y == p1.y == p0.y. In that + // case, Y - p0.y == 0 and will cancel out the slopes below, except if + // yScale is Inf for some reason (or worse, NaN), which 1/(p1.y-p0.y) + // might produce if we don't bound it. 
+      yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+      // Calculate dX/dY slope
+      xSlope((p1.x - p0.x) * yScale),
+      // Initialize current X based on Y and slope
+      x(p0.x + (y - p0.y) * xSlope),
+      // Calculate change in interpolants per change in Y
+      interpSlope((i1 - i0) * yScale),
+      // Initialize current interpolants based on Y and slope
+      interp(i0 + (y - p0.y) * interpSlope)
+    {}
+
+    void nextRow() {
+      // step current X and interpolants to next row from slope
+      x += xSlope;
+      interp += interpSlope;
+    }
+  };
+
+  // Vertex selection above should result in equal left and right start rows
+  assert(l0.y == r0.y);
+  // Find the start y, clip to within the clip rect, and round to row center.
+  float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+  // Initialize left and right edges from end points and start Y
+  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+  // Get pointer to color buffer and depth buffer at current Y
+  P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P));
+  uint16_t* fdepth =
+      (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t));
+  // Loop along advancing Ys, rasterizing spans at each row
+  float checkY = min(min(l1.y, r1.y), clipRect.y1);
+  for (;;) {
+    // Check if we maybe passed edge ends or outside clip rect...
+    if (y > checkY) {
+      // If we're outside the clip rect, we're done.
+      if (y > clipRect.y1) break;
+      // Helper to find the next non-duplicate vertex that doesn't loop back.
+#define STEP_EDGE(e0i, e0, e1i, e1, STEP_POINT, end) \
+  for (;;) { \
+    /* Set new start of edge to be end of old edge */ \
+    e0i = e1i; \
+    e0 = e1; \
+    /* Set new end of edge to next point */ \
+    e1i = STEP_POINT(e1i); \
+    e1 = p[e1i]; \
+    /* If the edge is descending, use it. */ \
+    if (e1.y > e0.y) break; \
+    /* If the edge is ascending or crossed the end, we're done. */ \
+    if (e1.y < e0.y || e0i == end) return; \
+    /* Otherwise, it's a duplicate, so keep searching. */ \
+  }
+      // Check if Y advanced past the end of the left edge
+      if (y > l1.y) {
+        // Step to next left edge past Y and reset edge interpolants.
+        do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y);
+        left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+      }
+      // Check if Y advanced past the end of the right edge
+      if (y > r1.y) {
+        // Step to next right edge past Y and reset edge interpolants.
+        do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y);
+        right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+      }
+      // Reset check condition for next time around.
+      checkY = min(min(l1.y, r1.y), clipRect.y1);
+    }
+    // lx..rx form the bounds of the span. WR does not use backface culling,
+    // so we need to use min/max to support the span in either orientation.
+    // Clip the span to fall within the clip rect and then round to nearest
+    // column.
+    int startx = int(max(min(left.x, right.x), clipRect.x0) + 0.5f);
+    int endx = int(min(max(left.x, right.x), clipRect.x1) + 0.5f);
+    // Check if span is non-empty.
+    int span = endx - startx;
+    if (span > 0) {
+      ctx->shaded_rows++;
+      ctx->shaded_pixels += span;
+      // Advance color/depth buffer pointers to the start of the span.
+      P* buf = fbuf + startx;
+      // Check if we will need to use the depth buffer or discard on this span.
+      uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr;
+      bool use_discard = fragment_shader->use_discard();
+      if (depthtex.delay_clear) {
+        // Delayed clear is enabled for the depth buffer. Check if this row
+        // needs to be cleared.
+        int yi = int(y);
+        uint32_t& mask = depthtex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          // The depth buffer is uninitialized on this row, but we know it
+          // would otherwise be cleared entirely to the clear value. This lets
+          // us quickly check the constant Z value of the quad against the
+          // clear Z to know if the entire span passes or fails the depth test
+          // all at once.
+          switch (ctx->depthfunc) {
+            case GL_LESS:
+              if (int16_t(z) < int16_t(depthtex.clear_val))
+                break;
+              else
+                goto next_span;
+            case GL_LEQUAL:
+              if (int16_t(z) <= int16_t(depthtex.clear_val))
+                break;
+              else
+                goto next_span;
+          }
+          // If we got here, we passed the depth test.
+          if (ctx->depthmask) {
+            // Depth writes are enabled, so we need to initialize depth.
+            mask |= 1 << (yi & 31);
+            depthtex.delay_clear--;
+            if (use_discard) {
+              // If discard is enabled, we don't know what pixels may be
+              // written to, so we have to clear the entire row.
+              force_clear_row<uint16_t>(depthtex, yi);
+            } else {
+              // Otherwise, we only need to clear the pixels that fall outside
+              // the current span on this row.
+              if (startx > 0 || endx < depthtex.width) {
+                force_clear_row<uint16_t>(depthtex, yi, startx, endx);
+              }
+              // Fill in the span's Z values with constant Z.
+              clear_buffer<uint16_t>(depthtex, z, 0,
+                                     IntRect{startx, yi, endx, yi + 1});
+              // We already passed the depth test, so no need to test depth
+              // any more.
+              depth = nullptr;
+            }
+          } else {
+            // No depth writes, so don't clear anything, and no need to test.
+            depth = nullptr;
+          }
+        }
+      }
+      if (colortex.delay_clear) {
+        // Delayed clear is enabled for the color buffer. Check if needs clear.
+        int yi = int(y);
+        uint32_t& mask = colortex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          mask |= 1 << (yi & 31);
+          colortex.delay_clear--;
+          if (depth || blend_key || use_discard) {
+            // If depth test, blending, or discard is used, old color values
+            // might be sampled, so we need to clear the entire row to fill it.
+            force_clear_row<P>(colortex, yi);
+          } else if (startx > 0 || endx < colortex.width) {
+            // Otherwise, we only need to clear the row outside of the span.
+            // The fragment shader will fill the row within the span itself.
+            force_clear_row<P>(colortex, yi, startx, endx);
+          }
+        }
+      }
+      // Initialize fragment shader interpolants to current span position.
+      fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+      fragment_shader->gl_FragCoord.y = y;
+      {
+        // Change in interpolants is difference between current right and left
+        // edges per the change in right and left X.
+        Interpolants step =
+            (right.interp - left.interp) * (1.0f / (right.x - left.x));
+        // Advance current interpolants to X at start of span.
+        Interpolants o = left.interp + step * (startx + 0.5f - left.x);
+        fragment_shader->init_span(&o, &step, 4.0f);
+      }
+      if (!use_discard) {
+        // Fast paths for the case where fragment discard is not used.
+        if (depth) {
+          // If depth is used, we want to process spans in 8-pixel chunks to
+          // maximize sampling and testing 16-bit depth values within the
+          // 128-bit width of a SIMD register.
+          if (span >= 8) {
+            // Specializations for supported depth functions depending on
+            // whether depth writes are enabled.
+            if (ctx->depthfunc == GL_LEQUAL) {
+              if (ctx->depthmask)
+                draw_depth_span<GL_LEQUAL, true>(z, buf, depth, span);
+              else
+                draw_depth_span<GL_LEQUAL, false>(z, buf, depth, span);
+            } else {
+              if (ctx->depthmask)
+                draw_depth_span<GL_LESS, true>(z, buf, depth, span);
+              else
+                draw_depth_span<GL_LESS, false>(z, buf, depth, span);
+            }
+            // Advance buffers past processed chunks.
+            buf += span & ~7;
+            depth += span & ~7;
+            span &= 7;
+          }
+        } else {
+          // Check if the fragment shader has an optimized draw specialization.
+          if (span >= 4 && fragment_shader->has_draw_span(buf)) {
+            // Draw specialization expects 4-pixel chunks.
+            int len = span & ~3;
+            fragment_shader->draw_span(buf, len);
+            buf += len;
+            span &= 3;
+          }
+        }
+        draw_span<false, false>(buf, depth, span, [=]{ return z; });
+      } else {
+        // If discard is used, then use slower fallbacks. This should be rare.
+        // Just needs to work, doesn't need to be too fast yet...
+        draw_span<true, false>(buf, depth, span, [=]{ return z; });
+      }
+    }
+  next_span:
+    // Advance Y and edge interpolants to next row.
+    y++;
+    left.nextRow();
+    right.nextRow();
+    // Advance buffers to next row.
+    fbuf += colortex.stride(sizeof(P)) / sizeof(P);
+    fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t);
+  }
+}
+
+// Draw perspective-correct spans for a convex quad that has been clipped to
+// the near and far Z planes, possibly producing a clipped convex polygon with
+// more than 4 sides. This assumes the Z value will vary across the spans and
+// requires interpolants to factor in W values. This tends to be slower than
+// the simpler 2D draw_quad_spans above, especially since we can't optimize
+// the depth test easily when Z values vary, and should be used only rarely
+// if possible.
+template <typename P>
+static inline void draw_perspective_spans(int nump, Point3D* p,
+                                          Interpolants* interp_outs,
+                                          Texture& colortex, int layer,
+                                          Texture& depthtex,
+                                          const ClipRect& clipRect) {
+  Point3D l0, r0, l1, r1;
+  int l0i, r0i, l1i, r1i;
+  {
+    // Find the index of the top-most point (smallest Y) from which
+    // rasterization can start.
+    int top = 0;
+    for (int i = 1; i < nump; i++) {
+      if (p[i].y < p[top].y) {
+        top = i;
+      }
+    }
+    // Find left-most top point, the start of the left descending edge.
+    // Advance forward in the points array, searching at most nump points
+    // in case the polygon is flat.
+    l0i = top;
+    for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
+      l0i = i;
+    }
+    if (l0i == nump - 1) {
+      for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
+        l0i = i;
+      }
+    }
+    // Find right-most top point, the start of the right descending edge.
+    // Advance backward in the points array, searching at most nump points.
+    r0i = top;
+    for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
+      r0i = i;
+    }
+    if (r0i == 0) {
+      for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
+        r0i = i;
+      }
+    }
+    // End of left edge is the next point after the left edge start.
+    l1i = NEXT_POINT(l0i);
+    // End of right edge is the previous point from the right edge start.
+    r1i = PREV_POINT(r0i);
+    l0 = p[l0i];  // Start of left edge
+    r0 = p[r0i];  // Start of right edge
+    l1 = p[l1i];  // End of left edge
+    r1 = p[r1i];  // End of right edge
+  }
+
+  struct Edge
+  {
+    float yScale;
+    // Current coordinates for edge. Whereas in the 2D case of draw_quad_spans
+    // it is enough to just track the X coordinate as we advance along the
+    // rows, for the perspective case we also need to keep track of Z and W.
+    // For simplicity, we just use the full 3D point to track all these
+    // coordinates.
+    Point3D pSlope;
+    Point3D p;
+    Interpolants interpSlope;
+    Interpolants interp;
+
+    Edge(float y, const Point3D& p0, const Point3D& p1,
+         const Interpolants& i0, const Interpolants& i1) :
+      // Inverse Y scale for slope calculations. Avoid divide on 0-length edge.
+      yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+      // Calculate dX/dY slope
+      pSlope((p1 - p0) * yScale),
+      // Initialize current coords based on Y and slope
+      p(p0 + (y - p0.y) * pSlope),
+      // Crucially, these interpolants must be scaled by the point's 1/w value,
+      // which allows linear interpolation in a perspective-correct manner.
+      // This will be canceled out inside the fragment shader later.
+      // Calculate change in interpolants per change in Y
+      interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
+      // Initialize current interpolants based on Y and slope
+      interp(i0 * p0.w + (y - p0.y) * interpSlope)
+    {}
+
+    float x() const { return p.x; }
+    vec2_scalar zw() const { return {p.z, p.w}; }
+
+    void nextRow() {
+      // step current coords and interpolants to next row from slope
+      p += pSlope;
+      interp += interpSlope;
+    }
+  };
+
+  // Vertex selection above should result in equal left and right start rows
+  assert(l0.y == r0.y);
+  // Find the start y, clip to within the clip rect, and round to row center.
+  float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+  // Initialize left and right edges from end points and start Y
+  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+  // Get pointer to color buffer and depth buffer at current Y
+  P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P));
+  uint16_t* fdepth =
+      (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t));
+  // Loop along advancing Ys, rasterizing spans at each row
+  float checkY = min(min(l1.y, r1.y), clipRect.y1);
+  for (;;) {
+    // Check if we maybe passed edge ends or outside clip rect...
+    if (y > checkY) {
+      // If we're outside the clip rect, we're done.
+      if (y > clipRect.y1) break;
+      // Check if Y advanced past the end of the left edge
+      if (y > l1.y) {
+        // Step to next left edge past Y and reset edge interpolants.
+        do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y);
+        left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+      }
+      // Check if Y advanced past the end of the right edge
+      if (y > r1.y) {
+        // Step to next right edge past Y and reset edge interpolants.
+        do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y);
+        right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+      }
+      // Reset check condition for next time around.
+      checkY = min(min(l1.y, r1.y), clipRect.y1);
+    }
+    // lx..rx form the bounds of the span. WR does not use backface culling,
+    // so we need to use min/max to support the span in either orientation.
+    // Clip the span to fall within the clip rect and then round to nearest
+    // column.
+    int startx = int(max(min(left.x(), right.x()), clipRect.x0) + 0.5f);
+    int endx = int(min(max(left.x(), right.x()), clipRect.x1) + 0.5f);
+    // Check if span is non-empty.
+    int span = endx - startx;
+    if (span > 0) {
+      ctx->shaded_rows++;
+      ctx->shaded_pixels += span;
+      // Advance color/depth buffer pointers to the start of the span.
+      P* buf = fbuf + startx;
+      // Check if we will need to use the depth buffer or discard on this span.
+      uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr;
+      bool use_discard = fragment_shader->use_discard();
+      if (depthtex.delay_clear) {
+        // Delayed clear is enabled for the depth buffer. Check if this row
+        // needs to be cleared.
+        int yi = int(y);
+        uint32_t& mask = depthtex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          mask |= 1 << (yi & 31);
+          depthtex.delay_clear--;
+          // Since Z varies across the span, it's easier to just clear the
+          // row and rely on later depth testing. If necessary, this could be
+          // optimized to test against the start and end Z values of the span
+          // here.
+          force_clear_row<uint16_t>(depthtex, yi);
+        }
+      }
+      if (colortex.delay_clear) {
+        // Delayed clear is enabled for the color buffer. Check if needs clear.
+        int yi = int(y);
+        uint32_t& mask = colortex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          mask |= 1 << (yi & 31);
+          colortex.delay_clear--;
+          if (depth || blend_key || use_discard) {
+            // If depth test, blending, or discard is used, old color values
+            // might be sampled, so we need to clear the entire row to fill it.
+            force_clear_row<P>(colortex, yi);
+          } else if (startx > 0 || endx < colortex.width) {
+            // Otherwise, we only need to clear the row outside of the span.
+            // The fragment shader will fill the row within the span itself.
+            force_clear_row<P>(colortex, yi, startx, endx);
+          }
+        }
+      }
+      // Initialize fragment shader interpolants to current span position.
+      fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+      fragment_shader->gl_FragCoord.y = y;
+      {
+        // Calculate the fragment Z and W change per change in fragment X step.
+        vec2_scalar stepZW =
+            (right.zw() - left.zw()) * (1.0f / (right.x() - left.x()));
+        // Calculate initial Z and W values for span start.
+        vec2_scalar zw = left.zw() + stepZW * (startx + 0.5f - left.x());
+        // Set fragment shader's Z and W values so that it can use them to
+        // cancel out the 1/w baked into the interpolants.
+        fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
+        fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
+        fragment_shader->stepZW = stepZW * 4.0f;
+        // Change in interpolants is difference between current right and left
+        // edges per the change in right and left X. The left and right
+        // interpolant values were previously multiplied by 1/w, so the step
+        // and initial span values take this into account.
+        Interpolants step =
+            (right.interp - left.interp) * (1.0f / (right.x() - left.x()));
+        // Advance current interpolants to X at start of span.
+        Interpolants o = left.interp + step * (startx + 0.5f - left.x());
+        fragment_shader->init_span<true>(&o, &step, 4.0f);
+      }
+      if (!use_discard) {
+        // No discard is used. Common case.
+        draw_span<false, true>(buf, depth, span, packDepth);
+      } else {
+        // Discard is used. Rare.
+        draw_span<true, true>(buf, depth, span, packDepth);
+      }
+    }
+    // Advance Y and edge interpolants to next row.
+    y++;
+    left.nextRow();
+    right.nextRow();
+    // Advance buffers to next row.
+    fbuf += colortex.stride(sizeof(P)) / sizeof(P);
+    fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t);
+  }
+}
+
+// Clip a primitive against both sides of a view-frustum axis, producing
+// intermediate vertexes with interpolated attributes that will no longer
+// intersect the selected axis planes. This assumes the primitive is convex
+// and should produce at most N+2 vertexes for each invocation (only in the
+// worst case where one point falls outside on each of the opposite sides
+// with the rest of the points inside).
+template <XYZW AXIS>
+static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
+                     Interpolants* outInterp) {
+  int numClip = 0;
+  Point3D prev = p[nump - 1];
+  Interpolants prevInterp = interp[nump - 1];
+  float prevCoord = prev.select(AXIS);
+  // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
+  // if so, remember which side it is outside of.
+  int prevSide = prevCoord < -prev.w ? -1 : (prevCoord > prev.w ? 1 : 0);
+  // Loop through points, finding edges that cross the planes by evaluating
+  // the side at each point.
+  for (int i = 0; i < nump; i++) {
+    Point3D cur = p[i];
+    Interpolants curInterp = interp[i];
+    float curCoord = cur.select(AXIS);
+    int curSide = curCoord < -cur.w ? -1 : (curCoord > cur.w ? 1 : 0);
+    // Check if the previous and current end points are on different sides.
+    if (curSide != prevSide) {
+      // One of the edge's end points is outside the plane with the other
+      // inside the plane. Find the offset where it crosses the plane and
+      // adjust the point and interpolants to there.
+      if (prevSide) {
+        // Edge that was previously outside crosses inside.
+        // Evaluate plane equation for previous and current end-point
+        // based on previous side and calculate relative offset.
+        assert(numClip < nump + 2);
+        float prevDist = prevCoord - prevSide * prev.w;
+        float curDist = curCoord - prevSide * cur.w;
+        float k = prevDist / (prevDist - curDist);
+        outP[numClip] = prev + (cur - prev) * k;
+        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+        numClip++;
+      }
+      if (curSide) {
+        // Edge that was previously inside crosses outside.
+        // Evaluate plane equation for previous and current end-point
+        // based on current side and calculate relative offset.
+        assert(numClip < nump + 2);
+        float prevDist = prevCoord - curSide * prev.w;
+        float curDist = curCoord - curSide * cur.w;
+        float k = prevDist / (prevDist - curDist);
+        outP[numClip] = prev + (cur - prev) * k;
+        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+        numClip++;
+      }
+    }
+    if (!curSide) {
+      // The current end point is inside the plane, so output point unmodified.
+      assert(numClip < nump + 2);
+      outP[numClip] = cur;
+      outInterp[numClip] = curInterp;
+      numClip++;
+    }
+    prev = cur;
+    prevInterp = curInterp;
+    prevCoord = curCoord;
+    prevSide = curSide;
+  }
+  return numClip;
+}
+
+// Helper function to dispatch to perspective span drawing with points that
+// have already been transformed and clipped.
+static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
+                                            Interpolants* interp_clip,
+                                            Texture& colortex, int layer,
+                                            Texture& depthtex) {
+  // If the polygon is outside the clip rect, nothing to draw.
+  ClipRect clipRect(colortex);
+  if (!clipRect.overlaps(nump, p_clip)) {
+    return;
+  }
+
+  // Finally draw perspective-correct spans for the polygon.
+  if (colortex.internal_format == GL_RGBA8) {
+    draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex,
+                                     layer, depthtex, clipRect);
+  } else if (colortex.internal_format == GL_R8) {
+    draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex,
+                                    layer, depthtex, clipRect);
+  } else {
+    assert(false);
+  }
+}
+
+// Draws a perspective-correct 3D primitive with varying Z value, as opposed
+// to a simple 2D planar primitive with a constant Z value that could be
+// trivially Z rejected. This requires clipping the primitive against the near
+// and far planes to ensure it stays within the valid Z-buffer range. The Z
The Z +// and W of each fragment of the primitives are interpolated across the +// generated spans and then depth-tested as appropriate. +// Additionally, vertex attributes must be interpolated with perspective- +// correction by dividing by W before interpolation, and then later multiplied +// by W again to produce the final correct attribute value for each fragment. +// This process is expensive and should be avoided if possible for primitive +// batches that are known ahead of time to not need perspective-correction. +static void draw_perspective(int nump, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex) { + // Convert output of vertex shader to screen space. + vec4 pos = vertex_shader->gl_Position; + vec3_scalar scale = + vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; + vec3_scalar offset = + vec3_scalar(ctx->viewport.x0, ctx->viewport.y0, 0.0f) + scale; + if (test_none(pos.z <= -pos.w || pos.z >= pos.w)) { + // No points cross the near or far planes, so no clipping required. + // Just divide coords by W and convert to viewport. + Float w = 1.0f / pos.w; + vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; + Point3D p[4] = { + {screen.x.x, screen.y.x, screen.z.x, w.x}, + {screen.x.y, screen.y.y, screen.z.y, w.y}, + {screen.x.z, screen.y.z, screen.z.z, w.z}, + {screen.x.w, screen.y.w, screen.z.w, w.w} + }; + draw_perspective_clipped(nump, p, interp_outs, colortex, layer, depthtex); + } else { + // Points cross the near or far planes, so we need to clip. + // Start with the original 3 or 4 points... + Point3D p[4] = { + {pos.x.x, pos.y.x, pos.z.x, pos.w.x}, + {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, + {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, + {pos.x.w, pos.y.w, pos.z.w, pos.w.w} + }; + // Clipping can expand the points by 1 for each of 6 view frustum planes. + Point3D p_clip[4 + 6]; + Interpolants interp_clip[4 + 6]; + // Clip against near and far Z planes. + nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip); + // If no points are left inside the view frustum, there's nothing to draw. + if (nump < 3) { + return; + } + // After clipping against only the near and far planes, we might still + // produce points where W = 0, exactly at the camera plane. OpenGL specifies + // that for clip coordinates, points must satisfy: + // -W <= X <= W + // -W <= Y <= W + // -W <= Z <= W + // When Z = W = 0, this is trivially satisfied, but when we transform and + // divide by W below it will produce a divide by 0. Usually we want to only + // clip Z to avoid the extra work of clipping X and Y. We can still project + // points that fall outside the view frustum X and Y so long as Z is valid. + // The span drawing code will then ensure X and Y are clamped to viewport + // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, + // will push W further inside the view frustum so that it is no longer 0, + // allowing us to finally proceed to projecting the points to the screen. + for (int i = 0; i < nump; i++) { + // Found an invalid W, so need to clip against X and Y... + if (p_clip[i].w <= 0.0f) { + // Ping-pong p_clip -> p_tmp -> p_clip. + Point3D p_tmp[4 + 6]; + Interpolants interp_tmp[4 + 6]; + nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp); + if (nump < 3) return; + nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip); + if (nump < 3) return; + // After clipping against X and Y planes, there's still points left + // to draw, so proceed to trying projection now... 
+static void draw_quad(int nump, Texture& colortex, int layer,
+                      Texture& depthtex) {
+  // Run vertex shader once for the primitive's vertices.
+  // Reserve space for 6 sets of interpolants, in case we need to clip against
+  // near and far planes in the perspective case.
+  Interpolants interp_outs[4];
+  vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants));
+  vec4 pos = vertex_shader->gl_Position;
+  // Check if any vertex W is different from another. If so, use perspective.
+  if (test_any(pos.w != pos.w.x)) {
+    draw_perspective(nump, interp_outs, colortex, layer, depthtex);
+    return;
+  }
+
+  // Convert output of vertex shader to screen space.
+  // Divide coords by W and convert to viewport.
+  float w = 1.0f / pos.w.x;
+  vec2 screen =
+      (pos.sel(X, Y) * w + 1) * 0.5f *
+          vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
+      vec2_scalar(ctx->viewport.x0, ctx->viewport.y0);
+  Point2D p[4] = {{screen.x.x, screen.y.x},
+                  {screen.x.y, screen.y.y},
+                  {screen.x.z, screen.y.z},
+                  {screen.x.w, screen.y.w}};
+
+  // If the quad is outside the clip rect, there is nothing to draw.
+  ClipRect clipRect(colortex);
+  if (!clipRect.overlaps(nump, p)) {
+    return;
+  }
+
+  // Since the quad is assumed 2D, Z is constant across the quad.
+  float screenZ = (pos.z.x * w + 1) * 0.5f;
+  if (screenZ < 0 || screenZ > 1) {
+    // Z values would cross the near or far plane, so just bail.
+    return;
+  }
+  // Since Z doesn't need to be interpolated, just set the fragment shader's
+  // Z and W values here, once and for all fragment shader invocations.
+  // SSE2 does not support unsigned comparison, so bias Z to be negative.
+  uint16_t z = uint16_t(0xFFFF * screenZ) - 0x8000;
+  fragment_shader->gl_FragCoord.z = screenZ;
+  fragment_shader->gl_FragCoord.w = w;
+
+  // Finally draw 2D spans for the quad. Currently only supports drawing to
+  // RGBA8 and R8 color buffers.
+  if (colortex.internal_format == GL_RGBA8) {
+    draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, layer,
+                              depthtex, clipRect);
+  } else if (colortex.internal_format == GL_R8) {
+    draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, layer,
+                             depthtex, clipRect);
+  } else {
+    assert(false);
+  }
+}
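The signed-bias depth trick used by draw_quad is worth spelling out. A minimal sketch (hypothetical helper, not SWGL API):

    #include <cstdint>

    // Map [0, 1] depth onto the full signed 16-bit range so that SSE2's
    // signed 16-bit compares order depths correctly despite the lack of an
    // unsigned 16-bit compare.
    int16_t bias_depth(float screenZ) {
      return int16_t(uint16_t(0xFFFF * screenZ) - 0x8000);
    }

bias_depth(0.0f) is -32768 and bias_depth(1.0f) is 32767; the mapping is monotonic, so signed comparisons of biased values agree with comparisons of the original depth values.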
 
 void VertexArray::validate() {
   int last_enabled = -1;
@@ -2653,32 +3581,78 @@ void VertexArray::validate() {
   max_attrib = last_enabled;
 }
 
+template <typename INDEX>
+static inline void draw_elements(GLsizei count, GLsizei instancecount,
+                                 Buffer& indices_buf, size_t offset,
+                                 VertexArray& v, Texture& colortex, int layer,
+                                 Texture& depthtex) {
+  assert((offset & (sizeof(INDEX) - 1)) == 0);
+  INDEX* indices = (INDEX*)(indices_buf.buf + offset);
+  count = min(count,
+              (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
+  // Triangles must be indexed at offsets 0, 1, 2.
+  // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
+  if (count == 6 && indices[1] == indices[0] + 1 &&
+      indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
+    assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
+    // Fast path - since there is only a single quad, we only load per-vertex
+    // attribs once for all instances, as they won't change across instances
+    // or within an instance.
+    vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
+    draw_quad(4, colortex, layer, depthtex);
+    for (GLsizei instance = 1; instance < instancecount; instance++) {
+      vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
+      draw_quad(4, colortex, layer, depthtex);
+    }
+  } else {
+    for (GLsizei instance = 0; instance < instancecount; instance++) {
+      for (GLsizei i = 0; i + 3 <= count; i += 3) {
+        if (indices[i + 1] != indices[i] + 1 ||
+            indices[i + 2] != indices[i] + 2) {
+          continue;
+        }
+        int nump = 3;
+        if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
+          assert(indices[i + 3] == indices[i] + 2 &&
                 indices[i + 4] == indices[i] + 1);
+          nump = 4;
+          i += 3;
+        }
+        vertex_shader->load_attribs(v.attribs, indices[i], instance, nump);
+        draw_quad(nump, colortex, layer, depthtex);
+      }
+    }
+  }
+}
+
 extern "C" {
 
 void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
-                           GLintptr offset, GLsizei instancecount) {
-  if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader ||
-      !fragment_shader) {
+                           void* indicesptr, GLsizei instancecount) {
+  assert(mode == GL_TRIANGLES);
+  assert(type == GL_UNSIGNED_SHORT || type == GL_UNSIGNED_INT);
+  if (count <= 0 || instancecount <= 0) {
     return;
   }
-  Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true);
-  if (!fb.color_attachment) {
-    return;
-  }
+  Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER);
   Texture& colortex = ctx->textures[fb.color_attachment];
   if (!colortex.buf) {
     return;
   }
-  assert(!colortex.locked);
   assert(colortex.internal_format == GL_RGBA8 ||
          colortex.internal_format == GL_R8);
   Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0];
   if (depthtex.buf) {
-    assert(depthtex.internal_format == GL_DEPTH_COMPONENT24);
+    assert(depthtex.internal_format == GL_DEPTH_COMPONENT16);
     assert(colortex.width == depthtex.width &&
            colortex.height == depthtex.height);
-    assert(colortex.offset == depthtex.offset);
+  }
+
+  Buffer& indices_buf = ctx->buffers[ctx->element_array_buffer_binding];
+  size_t offset = (size_t)indicesptr;
+  if (!indices_buf.buf || offset >= indices_buf.size) {
+    return;
   }
 
   // debugf("current_vertex_array %d\n", ctx->current_vertex_array);
@@ -2689,8 +3663,8 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
     v.validate();
   }
 
-#ifdef PRINT_TIMINGS
-  uint64_t start = get_time_value();
+#ifndef NDEBUG
+  // uint64_t start = get_time_value();
 #endif
 
   ctx->shaded_rows = 0;
@@ -2698,43 +3672,14 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
 
   vertex_shader->init_batch();
 
-  switch (type) {
-    case GL_UNSIGNED_SHORT:
-      assert(mode == GL_TRIANGLES);
-      draw_elements<uint16_t>(count, instancecount, offset, v, colortex,
-                              depthtex);
-      break;
-    case GL_UNSIGNED_INT:
-      assert(mode == GL_TRIANGLES);
-      draw_elements<uint32_t>(count, instancecount, offset, v, colortex,
-                              depthtex);
-      break;
-    case GL_NONE:
-      // Non-standard GL extension - if element type is GL_NONE, then we don't
-      // use any element buffer and behave as if DrawArrays was called instead.
-      for (GLsizei instance = 0; instance < instancecount; instance++) {
-        switch (mode) {
-          case GL_LINES:
-            for (GLsizei i = 0; i + 2 <= count; i += 2) {
-              vertex_shader->load_attribs(v.attribs, offset + i, instance, 2);
-              draw_quad(2, colortex, depthtex);
-            }
-            break;
-          case GL_TRIANGLES:
-            for (GLsizei i = 0; i + 3 <= count; i += 3) {
-              vertex_shader->load_attribs(v.attribs, offset + i, instance, 3);
-              draw_quad(3, colortex, depthtex);
-            }
-            break;
-          default:
-            assert(false);
-            break;
-        }
-      }
-      break;
-    default:
-      assert(false);
-      break;
+  if (type == GL_UNSIGNED_SHORT) {
+    draw_elements<uint16_t>(count, instancecount, indices_buf, offset, v,
+                            colortex, fb.layer, depthtex);
+  } else if (type == GL_UNSIGNED_INT) {
+    draw_elements<uint32_t>(count, instancecount, indices_buf, offset, v,
+                            colortex, fb.layer, depthtex);
+  } else {
+    assert(false);
   }
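The index layouts that draw_elements accepts can be captured in a small predicate. A sketch (hypothetical helper, not SWGL API) of the quad recognition in its fast path:

    // Two triangles (i, i+1, i+2) and (i+2, i+1, i+3) over four consecutive
    // vertices form the 0, 1, 2, 2, 1, 3 pattern that draw_elements treats
    // as a single quad.
    template <typename INDEX>
    bool is_quad_pattern(const INDEX* idx) {
      return idx[1] == idx[0] + 1 && idx[2] == idx[0] + 2 &&
             idx[3] == idx[0] + 2 && idx[4] == idx[0] + 1 &&
             idx[5] == idx[0] + 3;
    }

Triangles whose indices are not consecutive are skipped outright, which presumably suffices because SWGL only needs to service the index patterns WebRender itself emits.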
 
   if (ctx->samples_passed_query) {
@@ -2742,66 +3687,329 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
     q.value += ctx->shaded_pixels;
   }
 
-#ifdef PRINT_TIMINGS
-  uint64_t end = get_time_value();
-  printf(
-      "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, "
-      "%fns/pixel)\n",
-      double(end - start) / (1000. * 1000.),
-      ctx->programs[ctx->current_program].impl->get_name(), instancecount,
-      ctx->shaded_pixels, ctx->shaded_rows,
-      double(ctx->shaded_pixels) / ctx->shaded_rows,
-      double(end - start) / max(ctx->shaded_pixels, 1));
+#ifndef NDEBUG
+  // uint64_t end = get_time_value();
+  // debugf("draw(%d): %fms for %d pixels in %d rows (avg %f pixels/row, %f
+  // ns/pixel)\n", instancecount, double(end - start)/(1000.*1000.),
+  // ctx->shaded_pixels, ctx->shaded_rows,
+  // double(ctx->shaded_pixels)/ctx->shaded_rows, double(end -
+  // start)/max(ctx->shaded_pixels, 1));
 #endif
 }
 
-void Finish() {
-#ifdef PRINT_TIMINGS
-  printf("Finish\n");
-#endif
+}  // extern "C"
+
+template <typename P>
+static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
+                             int span) {
+  int frac = 0;
+  for (P* end = dst + span; dst < end; dst++) {
+    *dst = *src;
+    // Step source according to width ratio.
+    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
+      src++;
+    }
+  }
 }
 
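The nested frac loop in scale_row is an integer DDA: each destination pixel advances the accumulator by srcWidth, and the source pointer steps once for every dstWidth accumulated, spreading source steps evenly with no per-pixel divide. A standalone sketch with assumed example widths:

    #include <cstdio>

    int main() {
      // Same accumulator scheme as scale_row, printed for srcWidth = 5 and
      // dstWidth = 8; visits source indices 0, 0, 1, 1, 2, 3, 3, 4.
      int srcWidth = 5, dstWidth = 8, frac = 0, srcIndex = 0;
      for (int dst = 0; dst < dstWidth; dst++) {
        printf("dest %d <- src %d\n", dst, srcIndex);
        for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) srcIndex++;
      }
    }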
-void MakeCurrent(Context* c) {
-  if (ctx == c) {
+static void scale_blit(Texture& srctex, const IntRect& srcReq, int srcZ,
+                       Texture& dsttex, const IntRect& dstReq, int dstZ,
+                       bool invertY) {
+  // Cache scaling ratios
+  int srcWidth = srcReq.width();
+  int srcHeight = srcReq.height();
+  int dstWidth = dstReq.width();
+  int dstHeight = dstReq.height();
+  // Compute valid dest bounds
+  IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+  // Compute valid source bounds
+  // Scale source to dest, rounding inward to avoid sampling outside source
+  IntRect srcBounds =
+      srctex.sample_bounds(srcReq)
+          .scale(srcWidth, srcHeight, dstWidth, dstHeight, true);
+  // Limit dest sampling bounds to overlap source bounds
+  dstBounds.intersect(srcBounds);
+  // Check if sampling bounds are empty
+  if (dstBounds.is_empty()) {
     return;
   }
-  ctx = c;
-  setup_program(ctx ? ctx->current_program : 0);
+  // Compute final source bounds from clamped dest sampling bounds
+  srcBounds = IntRect(dstBounds)
+                  .scale(dstWidth, dstHeight, srcWidth, srcHeight);
+  // Calculate source and dest pointers from clamped offsets
+  int bpp = srctex.bpp();
+  int srcStride = srctex.stride(bpp);
+  int destStride = dsttex.stride(bpp);
+  char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY);
+  char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ);
+  // Inverted Y must step downward along dest rows
+  if (invertY) {
+    destStride = -destStride;
+  }
+  int span = dstBounds.width();
+  int frac = 0;
+  for (int rows = dstBounds.height(); rows > 0; rows--) {
+    if (srcWidth == dstWidth) {
+      // No scaling, so just do a fast copy.
+      memcpy(dest, src, span * bpp);
+    } else {
+      // Do scaling with different source and dest widths.
+      switch (bpp) {
+        case 1:
+          scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span);
+          break;
+        case 2:
+          scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span);
+          break;
+        case 4:
+          scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span);
+          break;
+        default:
+          assert(false);
+          break;
+      }
+    }
+    dest += destStride;
+    // Step source according to height ratio.
+    for (frac += srcHeight; frac >= dstHeight; frac -= dstHeight) {
+      src += srcStride;
+    }
+  }
+}
+
+static void linear_row(uint32_t* dest, int span, const vec2_scalar& srcUV,
+                       float srcDU, int srcZOffset, sampler2DArray sampler) {
+  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+  for (; span >= 4; span -= 4) {
+    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+    unaligned_store(dest, srcpx);
+    dest += 4;
+    uv.x += 4 * srcDU;
+  }
+  if (span > 0) {
+    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+    auto mask = span_mask_RGBA8(span);
+    auto dstpx = unaligned_load<PackedRGBA8>(dest);
+    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+  }
 }
 
-Context* CreateContext() { return new Context; }
+static void linear_row(uint8_t* dest, int span, const vec2_scalar& srcUV,
+                       float srcDU, int srcZOffset, sampler2DArray sampler) {
+  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+  for (; span >= 4; span -= 4) {
+    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+    unaligned_store(dest, pack(srcpx));
+    dest += 4;
+    uv.x += 4 * srcDU;
+  }
+  if (span > 0) {
+    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+    auto mask = span_mask_R8(span);
+    auto dstpx = unpack(unaligned_load<PackedR8>(dest));
+    unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx)));
+  }
+}
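Both linear_row overloads finish a span with a masked chunk: a tail of fewer than four pixels still samples four, then selects between new and existing destination values lane by lane. A scalar sketch of that select, assuming span_mask_* marks the lanes at or beyond the span:

    #include <cstdint>

    // Mirrors (mask & dstpx) | (~mask & srcpx): lanes below `span` take the
    // freshly sampled pixel, the remaining lanes are rewritten with their
    // existing contents, leaving them unchanged.
    void masked_tail_store(uint32_t* dest, const uint32_t src[4], int span) {
      for (int i = 0; i < 4; i++) {
        if (i < span) dest[i] = src[i];
      }
    }

The SIMD version still stores four pixels, but the masked lanes carry their prior destination values, so nothing beyond the span is altered.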
 
-void ReferenceContext(Context* c) {
-  if (!c) {
+static void linear_blit(Texture& srctex, const IntRect& srcReq, int srcZ,
+                        Texture& dsttex, const IntRect& dstReq, int dstZ,
+                        bool invertY) {
+  assert(srctex.internal_format == GL_RGBA8 ||
+         srctex.internal_format == GL_R8);
+  // Compute valid dest bounds
+  IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+  // Check if sampling bounds are empty
+  if (dstBounds.is_empty()) {
     return;
   }
-  ++c->references;
+  // Initialize sampler for source texture
+  sampler2DArray_impl sampler;
+  init_sampler(&sampler, srctex);
+  init_depth(&sampler, srctex);
+  sampler.filter = TextureFilter::LINEAR;
+  // Compute source UVs
+  int srcZOffset = srcZ * sampler.height_stride;
+  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
+  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
+                     float(srcReq.height()) / dstReq.height());
+  // Skip to clamped source start
+  srcUV += srcDUV * vec2_scalar(dstBounds.x0, dstBounds.y0);
+  // Offset source UVs to texel centers and scale by lerp precision
+  srcUV = linearQuantize(srcUV + 0.5f, 128);
+  srcDUV *= 128.0f;
+  // Calculate dest pointer from clamped offsets
+  int bpp = dsttex.bpp();
+  int destStride = dsttex.stride(bpp);
+  char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY);
+  // Inverted Y must step downward along dest rows
+  if (invertY) {
+    destStride = -destStride;
+  }
+  int span = dstBounds.width();
+  for (int rows = dstBounds.height(); rows > 0; rows--) {
+    switch (bpp) {
+      case 1:
+        linear_row((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+                   &sampler);
+        break;
+      case 4:
+        linear_row((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+                   &sampler);
+        break;
+      default:
+        assert(false);
+        break;
+    }
+    dest += destStride;
+    srcUV.y += srcDUV.y;
+  }
 }
 
-void DestroyContext(Context* c) {
-  if (!c) {
+extern "C" {
+
+void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                     GLbitfield mask, GLenum filter) {
+  assert(mask == GL_COLOR_BUFFER_BIT);
+  Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
+  if (!srcfb || srcfb->layer < 0) return;
+  Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
+  if (!dstfb || dstfb->layer < 0) return;
+  Texture& srctex = ctx->textures[srcfb->color_attachment];
+  if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return;
+  Texture& dsttex = ctx->textures[dstfb->color_attachment];
+  if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return;
+  if (srctex.internal_format != dsttex.internal_format) {
+    assert(false);
     return;
   }
-  assert(c->references > 0);
-  --c->references;
-  if (c->references > 0) {
+  // Force flipped Y onto dest coordinates
+  if (srcY1 < srcY0) {
+    swap(srcY0, srcY1);
+    swap(dstY0, dstY1);
+  }
+  bool invertY = dstY1 < dstY0;
+  if (invertY) {
+    swap(dstY0, dstY1);
+  }
+  IntRect srcReq = {srcX0, srcY0, srcX1, srcY1};
+  IntRect dstReq = {dstX0, dstY0, dstX1, dstY1};
+  if (srcReq.is_empty() || dstReq.is_empty()) {
     return;
   }
-  if (ctx == c) {
-    MakeCurrent(nullptr);
+  prepare_texture(srctex);
+  prepare_texture(dsttex, &dstReq);
+  if (!srcReq.same_size(dstReq) && filter == GL_LINEAR &&
+      (srctex.internal_format == GL_RGBA8 ||
+       srctex.internal_format == GL_R8)) {
+    linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+                invertY);
+  } else {
+    scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+               invertY);
   }
-  delete c;
 }
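BlitFramebuffer normalizes flips before blitting: any inversion on the source range is mirrored onto the dest range, so only the dest can end up inverted, and a single invertY flag plus a negated row stride handles it. A sketch of that normalization (hypothetical standalone helper):

    #include <utility>

    // Returns true if dest rows must be walked bottom-up.
    bool normalize_blit_y(int& srcY0, int& srcY1, int& dstY0, int& dstY1) {
      if (srcY1 < srcY0) {  // force any source flip onto the dest range
        std::swap(srcY0, srcY1);
        std::swap(dstY0, dstY1);
      }
      bool invertY = dstY1 < dstY0;
      if (invertY) std::swap(dstY0, dstY1);
      return invertY;
    }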
 
-size_t ReportMemory(size_t (*size_of_op)(void*)) {
-  size_t size = 0;
+void Finish() {}
+
+void MakeCurrent(void* ctx_ptr) {
+  ctx = (Context*)ctx_ptr;
   if (ctx) {
-    for (auto& t : ctx->textures) {
-      if (t && t->should_free()) {
-        size += size_of_op(t->buf);
+    setup_program(ctx->current_program);
+    blend_key = ctx->blend ? ctx->blend_key : BLEND_KEY_NONE;
+  } else {
+    setup_program(0);
+    blend_key = BLEND_KEY_NONE;
+  }
+}
+
+void* CreateContext() { return new Context; }
+
+void DestroyContext(void* ctx_ptr) {
+  if (!ctx_ptr) {
+    return;
+  }
+  if (ctx == ctx_ptr) {
+    MakeCurrent(nullptr);
+  }
+  delete (Context*)ctx_ptr;
+}
+
+void Composite(GLuint srcId, GLint srcX, GLint srcY, GLsizei srcWidth,
+               GLsizei srcHeight, GLint dstX, GLint dstY, GLboolean opaque,
+               GLboolean flip) {
+  Framebuffer& fb = ctx->framebuffers[0];
+  if (!fb.color_attachment) {
+    return;
+  }
+  Texture& srctex = ctx->textures[srcId];
+  if (!srctex.buf) return;
+  prepare_texture(srctex);
+  Texture& dsttex = ctx->textures[fb.color_attachment];
+  if (!dsttex.buf) return;
+  assert(srctex.bpp() == 4);
+  const int bpp = 4;
+  size_t src_stride = srctex.stride(bpp);
+  size_t dest_stride = dsttex.stride(bpp);
+  if (srcY < 0) {
+    dstY -= srcY;
+    srcHeight += srcY;
+    srcY = 0;
+  }
+  if (dstY < 0) {
+    srcY -= dstY;
+    srcHeight += dstY;
+    dstY = 0;
+  }
+  if (srcY + srcHeight > srctex.height) {
+    srcHeight = srctex.height - srcY;
+  }
+  if (dstY + srcHeight > dsttex.height) {
+    srcHeight = dsttex.height - dstY;
+  }
+  IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight};
+  prepare_texture(dsttex, &skip);
+  char* dest = dsttex.sample_ptr(dstX, flip ? dsttex.height - 1 - dstY : dstY,
+                                 fb.layer, bpp, dest_stride);
+  char* src = srctex.sample_ptr(srcX, srcY, 0, bpp, src_stride);
+  if (flip) {
+    dest_stride = -dest_stride;
+  }
+  if (opaque) {
+    for (int y = 0; y < srcHeight; y++) {
+      memcpy(dest, src, srcWidth * bpp);
+      dest += dest_stride;
+      src += src_stride;
+    }
+  } else {
+    for (int y = 0; y < srcHeight; y++) {
+      char* end = src + srcWidth * bpp;
+      while (src + 4 * bpp <= end) {
+        WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+        WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+        PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+        unaligned_store(dest, r);
+        src += 4 * bpp;
+        dest += 4 * bpp;
+      }
+      if (src < end) {
+        WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+        WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+        U32 r = bit_cast<U32>(
+            pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))));
+        unaligned_store(dest, r.x);
+        if (src + bpp < end) {
+          unaligned_store(dest + bpp, r.y);
+          if (src + 2 * bpp < end) {
+            unaligned_store(dest + 2 * bpp, r.z);
+          }
+        }
+        dest += end - src;
+        src = end;
+      }
+      dest += dest_stride - srcWidth * bpp;
+      src += src_stride - srcWidth * bpp;
+    }
+  }
-  }
-  return size;
 }
+
 }  // extern "C"
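The non-opaque path of Composite is a source-over blend for premultiplied alpha: dst' = src + dst - dst * src.a / 255. A scalar sketch (the rounding of SWGL's SIMD muldiv255 may differ in detail; this uses one common exact-rounding variant):

    #include <cstdint>

    // Exactly rounded (x * y) / 255 for bytes.
    uint8_t muldiv255_scalar(uint8_t x, uint8_t y) {
      int t = x * y + 128;
      return uint8_t((t + (t >> 8)) >> 8);
    }

    // Source-over for one premultiplied RGBA pixel, matching
    // srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)) above.
    void blend_over(const uint8_t src[4], uint8_t dst[4]) {
      uint8_t a = src[3];
      for (int i = 0; i < 4; i++) {
        dst[i] = uint8_t(src[i] + dst[i] - muldiv255_scalar(dst[i], a));
      }
    }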