author     Mukilan Thiyagarajan <mukilan@igalia.com>  2023-09-14 15:00:42 +0530
committer  Mukilan Thiyagarajan <mukilan@igalia.com>  2023-09-14 15:00:42 +0530
commit     c385b3c9737c17d59cb02e520c3b68b232cb6497
tree       ad598ffbbdfbcecd6a4cf458abe2afc702d92c27 /third_party/webrender/swgl/src/gl.cc
parent     988e05a68b48c9e744bf49459faf41a1bd9b81d7

Revert "Upgrade WebRender to e491e1ae637b2eed1e7195855d88357e5eb3ddf9 (#30323)"revert-webrender
This reverts commit a9d37cb85ac2c55fc630fccffe1ba60ff00f555b.
Diffstat (limited to 'third_party/webrender/swgl/src/gl.cc')
-rw-r--r--  third_party/webrender/swgl/src/gl.cc  |  3164
1 file changed, 2186 insertions(+), 978 deletions(-)
diff --git a/third_party/webrender/swgl/src/gl.cc b/third_party/webrender/swgl/src/gl.cc index 6e214547421..f4a69752dde 100644 --- a/third_party/webrender/swgl/src/gl.cc +++ b/third_party/webrender/swgl/src/gl.cc @@ -22,65 +22,15 @@ # define debugf(...) printf(__VA_ARGS__) #endif -// #define PRINT_TIMINGS - #ifdef _WIN32 # define ALWAYS_INLINE __forceinline -# define NO_INLINE __declspec(noinline) - -// Including Windows.h brings a huge amount of namespace polution so just -// define a couple of things manually -typedef int BOOL; -# define WINAPI __stdcall -# define DECLSPEC_IMPORT __declspec(dllimport) -# define WINBASEAPI DECLSPEC_IMPORT -typedef unsigned long DWORD; -typedef long LONG; -typedef __int64 LONGLONG; -# define DUMMYSTRUCTNAME - -typedef union _LARGE_INTEGER { - struct { - DWORD LowPart; - LONG HighPart; - } DUMMYSTRUCTNAME; - struct { - DWORD LowPart; - LONG HighPart; - } u; - LONGLONG QuadPart; -} LARGE_INTEGER; -extern "C" { -WINBASEAPI BOOL WINAPI -QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount); - -WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); -} - #else -// GCC is slower when dealing with always_inline, especially in debug builds. -// When using Clang, use always_inline more aggressively. -# if defined(__clang__) || defined(NDEBUG) -# define ALWAYS_INLINE __attribute__((always_inline)) inline -# else -# define ALWAYS_INLINE inline -# endif -# define NO_INLINE __attribute__((noinline)) -#endif - -// Some functions may cause excessive binary bloat if inlined in debug or with -// GCC builds, so use PREFER_INLINE on these instead of ALWAYS_INLINE. -#if defined(__clang__) && defined(NDEBUG) -# define PREFER_INLINE ALWAYS_INLINE -#else -# define PREFER_INLINE inline +# define ALWAYS_INLINE __attribute__((always_inline)) inline #endif #define UNREACHABLE __builtin_unreachable() -#define UNUSED [[maybe_unused]] - -#define FALLTHROUGH [[fallthrough]] +#define UNUSED __attribute__((unused)) #ifdef MOZILLA_CLIENT # define IMPLICIT __attribute__((annotate("moz_implicit"))) @@ -91,32 +41,19 @@ WINBASEAPI BOOL WINAPI QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency); #include "gl_defs.h" #include "glsl.h" #include "program.h" -#include "texture.h" using namespace glsl; -typedef ivec2_scalar IntPoint; - struct IntRect { int x0; int y0; int x1; int y1; - IntRect() : x0(0), y0(0), x1(0), y1(0) {} - IntRect(int x0, int y0, int x1, int y1) : x0(x0), y0(y0), x1(x1), y1(y1) {} - IntRect(IntPoint origin, IntPoint size) - : x0(origin.x), - y0(origin.y), - x1(origin.x + size.x), - y1(origin.y + size.y) {} - int width() const { return x1 - x0; } int height() const { return y1 - y0; } bool is_empty() const { return width() <= 0 || height() <= 0; } - IntPoint origin() const { return IntPoint(x0, y0); } - bool same_size(const IntRect& o) const { return width() == o.width() && height() == o.height(); } @@ -133,12 +70,6 @@ struct IntRect { return *this; } - IntRect intersection(const IntRect& o) { - IntRect result = *this; - result.intersect(o); - return result; - } - // Scale from source-space to dest-space, optionally rounding inward IntRect& scale(int srcWidth, int srcHeight, int dstWidth, int dstHeight, bool roundIn = false) { @@ -156,60 +87,15 @@ struct IntRect { swap(y0, y1); } - IntRect& offset(const IntPoint& o) { - x0 += o.x; - y0 += o.y; - x1 += o.x; - y1 += o.y; + IntRect& offset(int dx, int dy) { + x0 += dx; + y0 += dy; + x1 += dx; + y1 += dy; return *this; } - - IntRect operator+(const IntPoint& o) const { - return IntRect(*this).offset(o); 
- } - IntRect operator-(const IntPoint& o) const { - return IntRect(*this).offset(-o); - } }; -typedef vec2_scalar Point2D; -typedef vec4_scalar Point3D; - -struct IntRange { - int start; - int end; - - int len() const { return end - start; } - - IntRange intersect(IntRange r) const { - return {max(start, r.start), min(end, r.end)}; - } -}; - -struct FloatRange { - float start; - float end; - - float clip(float x) const { return clamp(x, start, end); } - - FloatRange clip(FloatRange r) const { return {clip(r.start), clip(r.end)}; } - - FloatRange merge(FloatRange r) const { - return {min(start, r.start), max(end, r.end)}; - } - - IntRange round() const { - return {int(floor(start + 0.5f)), int(floor(end + 0.5f))}; - } - - IntRange round_out() const { return {int(floor(start)), int(ceil(end))}; } -}; - -template <typename P> -static inline FloatRange x_range(P p0, P p1) { - return {min(p0.x, p1.x), max(p0.x, p1.x)}; -} - struct VertexAttrib { size_t size = 0; // in bytes GLenum type = 0; @@ -237,18 +123,12 @@ static int bytes_for_internal_format(GLenum internal_format) { case GL_R8: case GL_RED: return 1; - case GL_RG8: - case GL_RG: - return 2; case GL_DEPTH_COMPONENT: case GL_DEPTH_COMPONENT16: + return 2; case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: return 4; - case GL_RGB_RAW_422_APPLE: - return 2; - case GL_R16: - return 2; default: debugf("internal format: %x\n", internal_format); assert(0); @@ -268,12 +148,6 @@ static TextureFormat gl_format_to_texture_format(int type) { return TextureFormat::RGBA8; case GL_R8: return TextureFormat::R8; - case GL_RG8: - return TextureFormat::RG8; - case GL_R16: - return TextureFormat::R16; - case GL_RGB_RAW_422_APPLE: - return TextureFormat::YUV422; default: assert(0); return TextureFormat::RGBA8; @@ -287,34 +161,19 @@ struct Query { struct Buffer { char* buf = nullptr; size_t size = 0; - size_t capacity = 0; bool allocate(size_t new_size) { - // If the size remains unchanged, don't allocate anything. - if (new_size == size) { - return false; - } - // If the new size is within the existing capacity of the buffer, just - // reuse the existing buffer. - if (new_size <= capacity) { - size = new_size; - return true; - } - // Otherwise we need to reallocate the buffer to hold up to the requested - // larger size. - char* new_buf = (char*)realloc(buf, new_size); - assert(new_buf); - if (!new_buf) { - // If we fail, null out the buffer rather than leave around the old - // allocation state. + if (new_size != size) { + char* new_buf = (char*)realloc(buf, new_size); + assert(new_buf); + if (new_buf) { + buf = new_buf; + size = new_size; + return true; + } cleanup(); - return false; } - // The reallocation succeeded, so install the buffer. - buf = new_buf; - size = new_size; - capacity = new_size; - return true; + return false; } void cleanup() { @@ -322,7 +181,6 @@ struct Buffer { free(buf); buf = nullptr; size = 0; - capacity = 0; } } @@ -331,6 +189,7 @@ struct Buffer { struct Framebuffer { GLuint color_attachment = 0; + GLint layer = 0; GLuint depth_attachment = 0; }; @@ -364,32 +223,17 @@ struct Texture { GLenum internal_format = 0; int width = 0; int height = 0; + int depth = 0; char* buf = nullptr; size_t buf_size = 0; - uint32_t buf_stride = 0; - uint8_t buf_bpp = 0; GLenum min_filter = GL_NEAREST; GLenum mag_filter = GL_LINEAR; - // The number of active locks on this texture. 
If this texture has any active - // locks, we need to disallow modifying or destroying the texture as it may - // be accessed by other threads where modifications could lead to races. - int32_t locked = 0; - // When used as an attachment of a framebuffer, rendering to the texture - // behaves as if it is located at the given offset such that the offset is - // subtracted from all transformed vertexes after the viewport is applied. - IntPoint offset; enum FLAGS { - // If the buffer is internally-allocated by SWGL SHOULD_FREE = 1 << 1, - // If the buffer has been cleared to initialize it. Currently this is only - // utilized by depth buffers which need to know when depth runs have reset - // to a valid row state. When unset, the depth runs may contain garbage. - CLEARED = 1 << 2, }; int flags = SHOULD_FREE; bool should_free() const { return bool(flags & SHOULD_FREE); } - bool cleared() const { return bool(flags & CLEARED); } void set_flag(int flag, bool val) { if (val) { @@ -398,14 +242,7 @@ struct Texture { flags &= ~flag; } } - void set_should_free(bool val) { - // buf must be null before SHOULD_FREE can be safely toggled. Otherwise, we - // might accidentally mistakenly realloc an externally allocated buffer as - // if it were an internally allocated one. - assert(!buf); - set_flag(SHOULD_FREE, val); - } - void set_cleared(bool val) { set_flag(CLEARED, val); } + void set_should_free(bool val) { set_flag(SHOULD_FREE, val); } // Delayed-clearing state. When a clear of an FB is requested, we don't // immediately clear each row, as the rows may be subsequently overwritten @@ -418,9 +255,6 @@ struct Texture { uint32_t clear_val = 0; uint32_t* cleared_rows = nullptr; - void init_depth_runs(uint32_t z); - void fill_depth_runs(uint32_t z, const IntRect& scissor); - void enable_delayed_clear(uint32_t val) { delay_clear = height; clear_val = val; @@ -441,88 +275,40 @@ struct Texture { } } - int bpp() const { return buf_bpp; } - void set_bpp() { buf_bpp = bytes_for_internal_format(internal_format); } + int bpp() const { return bytes_for_internal_format(internal_format); } - size_t stride() const { return buf_stride; } - void set_stride() { buf_stride = aligned_stride(buf_bpp * width); } - - // Set an external backing buffer of this texture. - void set_buffer(void* new_buf, size_t new_stride) { - assert(!should_free()); - // Ensure that the supplied stride is at least as big as the row data and - // is aligned to the smaller of either the BPP or word-size. We need to at - // least be able to sample data from within a row and sample whole pixels - // of smaller formats without risking unaligned access. - set_bpp(); - set_stride(); - assert(new_stride >= size_t(bpp() * width) && - new_stride % min(bpp(), sizeof(uint32_t)) == 0); + size_t stride(int b = 0, int min_width = 0) const { + return aligned_stride((b ? b : bpp()) * max(width, min_width)); + } - buf = (char*)new_buf; - buf_size = 0; - buf_stride = new_stride; + size_t layer_stride(int b = 0, int min_width = 0, int min_height = 0) const { + return stride(b ? b : bpp(), min_width) * max(height, min_height); } bool allocate(bool force = false, int min_width = 0, int min_height = 0) { - assert(!locked); // Locked textures shouldn't be reallocated - // If we get here, some GL API call that invalidates the texture was used. - // Mark the buffer as not-cleared to signal this. - set_cleared(false); - // Check if there is either no buffer currently or if we forced validation - // of the buffer size because some dimension might have changed. 
if ((!buf || force) && should_free()) { - // Initialize the buffer's BPP and stride, since they may have changed. - set_bpp(); - set_stride(); - // Compute new size based on the maximum potential stride, rather than - // the current stride, to hopefully avoid reallocations when size would - // otherwise change too much... - size_t max_stride = max(buf_stride, aligned_stride(buf_bpp * min_width)); - size_t size = max_stride * max(height, min_height); - if ((!buf && size > 0) || size > buf_size) { + size_t size = layer_stride(bpp(), min_width, min_height) * max(depth, 1); + if (!buf || size > buf_size) { // Allocate with a SIMD register-sized tail of padding at the end so we // can safely read or write past the end of the texture with SIMD ops. - // Currently only the flat Z-buffer texture needs this padding due to - // full-register loads and stores in check_depth and discard_depth. In - // case some code in the future accidentally uses a linear filter on a - // texture with less than 2 pixels per row, we also add this padding - // just to be safe. All other texture types and use-cases should be - // safe to omit padding. - size_t padding = - internal_format == GL_DEPTH_COMPONENT24 || max(width, min_width) < 2 - ? sizeof(Float) - : 0; - char* new_buf = (char*)realloc(buf, size + padding); + char* new_buf = (char*)realloc(buf, size + sizeof(Float)); assert(new_buf); if (new_buf) { - // Successfully reallocated the buffer, so go ahead and set it. buf = new_buf; buf_size = size; return true; } - // Allocation failed, so ensure we don't leave stale buffer state. cleanup(); } } - // Nothing changed... return false; } void cleanup() { - assert(!locked); // Locked textures shouldn't be destroyed - if (buf) { - // If we need to toggle SHOULD_FREE state, ensure that buf is nulled out, - // regardless of whether we internally allocated it. This will prevent us - // from wrongly treating buf as having been internally allocated for when - // we go to realloc if it actually was externally allocted. - if (should_free()) { - free(buf); - } + if (buf && should_free()) { + free(buf); buf = nullptr; buf_size = 0; - buf_bpp = 0; - buf_stride = 0; } disable_delayed_clear(); } @@ -530,41 +316,44 @@ struct Texture { ~Texture() { cleanup(); } IntRect bounds() const { return IntRect{0, 0, width, height}; } - IntRect offset_bounds() const { return bounds() + offset; } // Find the valid sampling bounds relative to the requested region IntRect sample_bounds(const IntRect& req, bool invertY = false) const { - IntRect bb = bounds().intersect(req) - req.origin(); + IntRect bb = bounds().intersect(req).offset(-req.x0, -req.y0); if (invertY) bb.invert_y(req.height()); return bb; } // Get a pointer for sampling at the given offset - char* sample_ptr(int x, int y) const { - return buf + y * stride() + x * bpp(); + char* sample_ptr(int x, int y, int z, int bpp, size_t stride) const { + return buf + (height * z + y) * stride + x * bpp; + } + + char* sample_ptr(int x, int y, int z, int bpp) const { + return sample_ptr(x, y, z, bpp, stride(bpp)); + } + + char* sample_ptr(int x, int y, int z) const { + return sample_ptr(x, y, z, bpp()); } // Get a pointer for sampling the requested region and limit to the provided // sampling bounds - char* sample_ptr(const IntRect& req, const IntRect& bounds, + char* sample_ptr(const IntRect& req, const IntRect& bounds, int z, bool invertY = false) const { // Offset the sample pointer by the clamped bounds int x = req.x0 + bounds.x0; // Invert the Y offset if necessary int y = invertY ? 
req.y1 - 1 - bounds.y0 : req.y0 + bounds.y0; - return sample_ptr(x, y); + return sample_ptr(x, y, z); } }; -// The last vertex attribute is reserved as a null attribute in case a vertex -// attribute is used without being set. -#define MAX_ATTRIBS 17 -#define NULL_ATTRIB 16 +#define MAX_ATTRIBS 16 +#define NULL_ATTRIB 15 struct VertexArray { VertexAttrib attribs[MAX_ATTRIBS]; int max_attrib = -1; - // The GL spec defines element array buffer binding to be part of VAO state. - GLuint element_array_buffer_binding = 0; void validate(); }; @@ -580,67 +369,33 @@ struct Program { FragmentShaderImpl* frag_impl = nullptr; bool deleted = false; - ~Program() { delete impl; } + ~Program() { + delete impl; + } }; -// clang-format off -// Fully-expand GL defines while ignoring more than 4 suffixes +// for GL defines to fully expand #define CONCAT_KEY(prefix, x, y, z, w, ...) prefix##x##y##z##w -// Generate a blend key enum symbol -#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0, 0) -#define MASK_BLEND_KEY(...) CONCAT_KEY(MASK_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_BLEND_KEY(...) CONCAT_KEY(AA_BLEND_, __VA_ARGS__, 0, 0, 0) -#define AA_MASK_BLEND_KEY(...) CONCAT_KEY(AA_MASK_BLEND_, __VA_ARGS__, 0, 0, 0) - -// Utility macro to easily generate similar code for all implemented blend modes +#define BLEND_KEY(...) CONCAT_KEY(BLEND_, __VA_ARGS__, 0, 0) #define FOR_EACH_BLEND_KEY(macro) \ - macro(GL_ONE, GL_ZERO, 0, 0) \ - macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ - macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ - macro(GL_ZERO, GL_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE, 0, 0) \ - macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ - macro(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ - macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ - macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) \ - macro(GL_MIN, 0, 0, 0) \ - macro(GL_MAX, 0, 0, 0) \ - macro(GL_MULTIPLY_KHR, 0, 0, 0) \ - macro(GL_SCREEN_KHR, 0, 0, 0) \ - macro(GL_OVERLAY_KHR, 0, 0, 0) \ - macro(GL_DARKEN_KHR, 0, 0, 0) \ - macro(GL_LIGHTEN_KHR, 0, 0, 0) \ - macro(GL_COLORDODGE_KHR, 0, 0, 0) \ - macro(GL_COLORBURN_KHR, 0, 0, 0) \ - macro(GL_HARDLIGHT_KHR, 0, 0, 0) \ - macro(GL_SOFTLIGHT_KHR, 0, 0, 0) \ - macro(GL_DIFFERENCE_KHR, 0, 0, 0) \ - macro(GL_EXCLUSION_KHR, 0, 0, 0) \ - macro(GL_HSL_HUE_KHR, 0, 0, 0) \ - macro(GL_HSL_SATURATION_KHR, 0, 0, 0) \ - macro(GL_HSL_COLOR_KHR, 0, 0, 0) \ - macro(GL_HSL_LUMINOSITY_KHR, 0, 0, 0) \ - macro(SWGL_BLEND_DROP_SHADOW, 0, 0, 0) \ - macro(SWGL_BLEND_SUBPIXEL_TEXT, 0, 0, 0) + macro(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE) \ + macro(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, 0, 0) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE) \ + macro(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA, 0, 0) macro( \ + GL_ZERO, GL_SRC_COLOR, 0, 0) macro(GL_ONE, GL_ONE, 0, 0) \ + macro(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA) \ + macro(GL_ONE, GL_ZERO, 0, 0) macro( \ + GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE) \ + macro(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR, \ + 0, 0) \ + macro(GL_ONE, GL_ONE_MINUS_SRC1_COLOR, 0, 0) #define DEFINE_BLEND_KEY(...) BLEND_KEY(__VA_ARGS__), -#define DEFINE_MASK_BLEND_KEY(...) MASK_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_BLEND_KEY(...) AA_BLEND_KEY(__VA_ARGS__), -#define DEFINE_AA_MASK_BLEND_KEY(...) 
AA_MASK_BLEND_KEY(__VA_ARGS__), enum BlendKey : uint8_t { + BLEND_KEY_NONE = 0, FOR_EACH_BLEND_KEY(DEFINE_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_MASK_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_BLEND_KEY) - FOR_EACH_BLEND_KEY(DEFINE_AA_MASK_BLEND_KEY) - BLEND_KEY_NONE = BLEND_KEY(GL_ONE, GL_ZERO), - MASK_BLEND_KEY_NONE = MASK_BLEND_KEY(GL_ONE, GL_ZERO), - AA_BLEND_KEY_NONE = AA_BLEND_KEY(GL_ONE, GL_ZERO), - AA_MASK_BLEND_KEY_NONE = AA_MASK_BLEND_KEY(GL_ONE, GL_ZERO), }; -// clang-format on const size_t MAX_TEXTURE_UNITS = 16; @@ -704,10 +459,8 @@ struct ObjectStore { O* find(size_t i) const { return i < size ? objects[i] : nullptr; } - template <typename T> - void on_erase(T*, ...) {} - template <typename T> - void on_erase(T* o, decltype(&T::on_erase)) { + template <typename T> void on_erase(T*, ...) {} + template <typename T> void on_erase(T* o, decltype(&T::on_erase)) { o->on_erase(); } @@ -727,8 +480,6 @@ struct ObjectStore { }; struct Context { - int32_t references = 1; - ObjectStore<Query> queries; ObjectStore<Buffer> buffers; ObjectStore<Texture> textures; @@ -756,7 +507,7 @@ struct Context { bool scissortest = false; IntRect scissor = {0, 0, 0, 0}; - GLfloat clearcolor[4] = {0, 0, 0, 0}; + uint32_t clearcolor = 0; GLdouble cleardepth = 1; int unpack_row_length = 0; @@ -766,10 +517,14 @@ struct Context { struct TextureUnit { GLuint texture_2d_binding = 0; + GLuint texture_3d_binding = 0; + GLuint texture_2d_array_binding = 0; GLuint texture_rectangle_binding = 0; void unlink(GLuint n) { ::unlink(texture_2d_binding, n); + ::unlink(texture_3d_binding, n); + ::unlink(texture_2d_array_binding, n); ::unlink(texture_rectangle_binding, n); } }; @@ -784,6 +539,7 @@ struct Context { GLuint pixel_pack_buffer_binding = 0; GLuint pixel_unpack_buffer_binding = 0; GLuint array_buffer_binding = 0; + GLuint element_array_buffer_binding = 0; GLuint time_elapsed_query = 0; GLuint samples_passed_query = 0; GLuint renderbuffer_binding = 0; @@ -800,9 +556,13 @@ struct Context { case GL_ARRAY_BUFFER: return array_buffer_binding; case GL_ELEMENT_ARRAY_BUFFER: - return vertex_arrays[current_vertex_array].element_array_buffer_binding; + return element_array_buffer_binding; case GL_TEXTURE_2D: return texture_units[active_texture_unit].texture_2d_binding; + case GL_TEXTURE_2D_ARRAY: + return texture_units[active_texture_unit].texture_2d_array_binding; + case GL_TEXTURE_3D: + return texture_units[active_texture_unit].texture_3d_binding; case GL_TEXTURE_RECTANGLE: return texture_units[active_texture_unit].texture_rectangle_binding; case GL_TIME_ELAPSED: @@ -830,17 +590,16 @@ struct Context { return textures[texture_units[unit].texture_2d_binding]; } - Texture& get_texture(sampler2DRect, int unit) { - return textures[texture_units[unit].texture_rectangle_binding]; + Texture& get_texture(sampler2DArray, int unit) { + return textures[texture_units[unit].texture_2d_array_binding]; } - IntRect apply_scissor(IntRect bb, - const IntPoint& origin = IntPoint(0, 0)) const { - return scissortest ? bb.intersect(scissor - origin) : bb; + Texture& get_texture(sampler2DRect, int unit) { + return textures[texture_units[unit].texture_rectangle_binding]; } - IntRect apply_scissor(const Texture& t) const { - return apply_scissor(t.bounds(), t.offset); + IntRect apply_scissor(IntRect bb) const { + return scissortest ? 
bb.intersect(scissor) : bb; } }; static Context* ctx = nullptr; @@ -851,12 +610,14 @@ static BlendKey blend_key = BLEND_KEY_NONE; static void prepare_texture(Texture& t, const IntRect* skip = nullptr); template <typename S> +static inline void init_depth(S* s, Texture& t) { + s->depth = max(t.depth, 1); + s->height_stride = s->stride * t.height; +} + +template <typename S> static inline void init_filter(S* s, Texture& t) { - // If the width is not at least 2 pixels, then we can't safely sample the end - // of the row with a linear filter. In that case, just punt to using nearest - // filtering instead. - s->filter = t.width >= 2 ? gl_filter_to_texture_filter(t.mag_filter) - : TextureFilter::NEAREST; + s->filter = gl_filter_to_texture_filter(t.mag_filter); } template <typename S> @@ -864,44 +625,20 @@ static inline void init_sampler(S* s, Texture& t) { prepare_texture(t); s->width = t.width; s->height = t.height; - s->stride = t.stride(); int bpp = t.bpp(); - if (bpp >= 4) - s->stride /= 4; - else if (bpp == 2) - s->stride /= 2; - else - assert(bpp == 1); - // Use uint32_t* for easier sampling, but need to cast to uint8_t* or - // uint16_t* for formats with bpp < 4. + s->stride = t.stride(bpp); + if (bpp >= 4) s->stride /= 4; + // Use uint32_t* for easier sampling, but need to cast to uint8_t* for formats + // with bpp < 4. s->buf = (uint32_t*)t.buf; s->format = gl_format_to_texture_format(t.internal_format); } template <typename S> -static inline void null_sampler(S* s) { - // For null texture data, just make the sampler provide a 1x1 buffer that is - // transparent black. Ensure buffer holds at least a SIMD vector of zero data - // for SIMD padding of unaligned loads. - static const uint32_t zeroBuf[sizeof(Float) / sizeof(uint32_t)] = {0}; - s->width = 1; - s->height = 1; - s->stride = s->width; - s->buf = (uint32_t*)zeroBuf; - s->format = TextureFormat::RGBA8; -} - -template <typename S> -static inline void null_filter(S* s) { - s->filter = TextureFilter::NEAREST; -} - -template <typename S> S* lookup_sampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); - null_filter(s); + *s = S(); } else { init_sampler(s, t); init_filter(s, t); @@ -913,13 +650,26 @@ template <typename S> S* lookup_isampler(S* s, int texture) { Texture& t = ctx->get_texture(s, texture); if (!t.buf) { - null_sampler(s); + *s = S(); } else { init_sampler(s, t); } return s; } +template <typename S> +S* lookup_sampler_array(S* s, int texture) { + Texture& t = ctx->get_texture(s, texture); + if (!t.buf) { + *s = S(); + } else { + init_sampler(s, t); + init_depth(s, t); + init_filter(s, t); + } + return s; +} + int bytes_per_type(GLenum type) { switch (type) { case GL_INT: @@ -983,40 +733,21 @@ void load_attrib(T& attrib, VertexAttrib& va, uint32_t start, int instance, attrib = T(load_attrib_scalar<scalar_type>(va, src)); } else { // Specialized for WR's primitive vertex order/winding. + // Triangles must be indexed at offsets 0, 1, 2. + // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3. + // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. + // Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so that the + // points form a convex path that can be traversed by the rasterizer. if (!count) return; - assert(count >= 2 && count <= 4); + assert(count == 3 || count == 4); char* src = (char*)va.buf + va.stride * start + va.offset; - switch (count) { - case 2: { - // Lines must be indexed at offsets 0, 1. 
- // Line vertexes fill vertex shader SIMD lanes as 0, 1, 1, 0. - scalar_type lanes[2] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride)}; - attrib = (T){lanes[0], lanes[1], lanes[1], lanes[0]}; - break; - } - case 3: { - // Triangles must be indexed at offsets 0, 1, 2. - // Triangle vertexes fill vertex shader SIMD lanes as 0, 1, 2, 2. - scalar_type lanes[3] = { - load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - attrib = (T){lanes[0], lanes[1], lanes[2], lanes[2]}; - break; - } - default: - // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, - // 1, 3. Quad vertexes fill vertex shader SIMD lanes as 0, 1, 3, 2, so - // that the points form a convex path that can be traversed by the - // rasterizer. - attrib = (T){load_attrib_scalar<scalar_type>(va, src), - load_attrib_scalar<scalar_type>(va, src + va.stride), - load_attrib_scalar<scalar_type>(va, src + va.stride * 3), - load_attrib_scalar<scalar_type>(va, src + va.stride * 2)}; - break; - } + attrib = (T){ + load_attrib_scalar<scalar_type>(va, src), + load_attrib_scalar<scalar_type>(va, src + va.stride), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2 + + (count > 3 ? va.stride : 0)), + load_attrib_scalar<scalar_type>(va, src + va.stride * 2) + }; } } @@ -1076,6 +807,7 @@ void Enable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = true; + blend_key = ctx->blend_key; break; case GL_DEPTH_TEST: ctx->depthtest = true; @@ -1090,6 +822,7 @@ void Disable(GLenum cap) { switch (cap) { case GL_BLEND: ctx->blend = false; + blend_key = BLEND_KEY_NONE; break; case GL_DEPTH_TEST: ctx->depthtest = false; @@ -1103,18 +836,10 @@ void Disable(GLenum cap) { GLenum GetError() { return GL_NO_ERROR; } static const char* const extensions[] = { - "GL_ARB_blend_func_extended", - "GL_ARB_clear_texture", - "GL_ARB_copy_image", - "GL_ARB_draw_instanced", - "GL_ARB_explicit_attrib_location", - "GL_ARB_instanced_arrays", - "GL_ARB_invalidate_subdata", - "GL_ARB_texture_storage", - "GL_EXT_timer_query", - "GL_KHR_blend_equation_advanced", - "GL_KHR_blend_equation_advanced_coherent", - "GL_APPLE_rgb_422", + "GL_ARB_blend_func_extended", "GL_ARB_copy_image", + "GL_ARB_draw_instanced", "GL_ARB_explicit_attrib_location", + "GL_ARB_instanced_arrays", "GL_ARB_invalidate_subdata", + "GL_ARB_texture_storage", "GL_EXT_timer_query", }; void GetIntegerv(GLenum pname, GLint* params) { @@ -1128,7 +853,7 @@ void GetIntegerv(GLenum pname, GLint* params) { params[0] = 1 << 15; break; case GL_MAX_ARRAY_TEXTURE_LAYERS: - params[0] = 0; + params[0] = 1 << 15; break; case GL_READ_FRAMEBUFFER_BINDING: params[0] = ctx->read_framebuffer_binding; @@ -1145,12 +870,6 @@ void GetIntegerv(GLenum pname, GLint* params) { case GL_NUM_EXTENSIONS: params[0] = sizeof(extensions) / sizeof(extensions[0]); break; - case GL_MAJOR_VERSION: - params[0] = 3; - break; - case GL_MINOR_VERSION: - params[0] = 2; - break; default: debugf("unhandled glGetIntegerv parameter %x\n", pname); assert(false); @@ -1177,8 +896,6 @@ const char* GetString(GLenum name) { return "Software WebRender"; case GL_VERSION: return "3.2"; - case GL_SHADING_LANGUAGE_VERSION: - return "1.50"; default: debugf("unhandled glGetString parameter %x\n", name); assert(false); @@ -1254,23 +971,17 @@ GLenum remap_blendfunc(GLenum rgb, GLenum a) { return a; } -// Generate a hashed blend key based on blend func and equation state. 
This -// allows all the blend state to be processed down to a blend key that can be -// dealt with inside a single switch statement. -static void hash_blend_key() { - GLenum srgb = ctx->blendfunc_srgb; - GLenum drgb = ctx->blendfunc_drgb; - GLenum sa = ctx->blendfunc_sa; - GLenum da = ctx->blendfunc_da; - GLenum equation = ctx->blend_equation; +void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { + ctx->blendfunc_srgb = srgb; + ctx->blendfunc_drgb = drgb; + sa = remap_blendfunc(srgb, sa); + da = remap_blendfunc(drgb, da); + ctx->blendfunc_sa = sa; + ctx->blendfunc_da = da; + #define HASH_BLEND_KEY(x, y, z, w) ((x << 4) | (y) | (z << 24) | (w << 20)) - // Basic non-separate blend funcs used the two argument form int hash = HASH_BLEND_KEY(srgb, drgb, 0, 0); - // Separate alpha blend funcs use the 4 argument hash if (srgb != sa || drgb != da) hash |= HASH_BLEND_KEY(0, 0, sa, da); - // Any other blend equation than the default func_add ignores the func and - // instead generates a one-argument hash based on the equation - if (equation != GL_FUNC_ADD) hash = HASH_BLEND_KEY(equation, 0, 0, 0); switch (hash) { #define MAP_BLEND_KEY(...) \ case HASH_BLEND_KEY(__VA_ARGS__): \ @@ -1278,22 +989,14 @@ static void hash_blend_key() { break; FOR_EACH_BLEND_KEY(MAP_BLEND_KEY) default: - debugf("blendfunc: %x, %x, separate: %x, %x, equation: %x\n", srgb, drgb, - sa, da, equation); + debugf("blendfunc: %x, %x, separate: %x, %x\n", srgb, drgb, sa, da); assert(false); break; } -} -void BlendFunc(GLenum srgb, GLenum drgb, GLenum sa, GLenum da) { - ctx->blendfunc_srgb = srgb; - ctx->blendfunc_drgb = drgb; - sa = remap_blendfunc(srgb, sa); - da = remap_blendfunc(drgb, da); - ctx->blendfunc_sa = sa; - ctx->blendfunc_da = da; - - hash_blend_key(); + if (ctx->blend) { + blend_key = ctx->blend_key; + } } void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { @@ -1302,12 +1005,8 @@ void BlendColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { } void BlendEquation(GLenum mode) { - assert(mode == GL_FUNC_ADD || mode == GL_MIN || mode == GL_MAX || - (mode >= GL_MULTIPLY_KHR && mode <= GL_HSL_LUMINOSITY_KHR)); - if (mode != ctx->blend_equation) { - ctx->blend_equation = mode; - hash_blend_key(); - } + assert(mode == GL_FUNC_ADD); + ctx->blend_equation = mode; } void DepthMask(GLboolean flag) { ctx->depthmask = flag; } @@ -1328,10 +1027,8 @@ void SetScissor(GLint x, GLint y, GLsizei width, GLsizei height) { } void ClearColor(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { - ctx->clearcolor[0] = r; - ctx->clearcolor[1] = g; - ctx->clearcolor[2] = b; - ctx->clearcolor[3] = a; + I32 c = round_pixel((Float){b, g, r, a}); + ctx->clearcolor = bit_cast<uint32_t>(CONVERT(c, U8)); } void ClearDepth(GLdouble depth) { ctx->cleardepth = depth; } @@ -1369,6 +1066,7 @@ void DeleteBuffer(GLuint n) { unlink(ctx->pixel_pack_buffer_binding, n); unlink(ctx->pixel_unpack_buffer_binding, n); unlink(ctx->array_buffer_binding, n); + unlink(ctx->element_array_buffer_binding, n); } } @@ -1434,45 +1132,26 @@ void DeleteProgram(GLuint n) { void LinkProgram(GLuint program) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } assert(p.impl->interpolants_size() <= sizeof(Interpolants)); if (!p.vert_impl) p.vert_impl = p.impl->get_vertex_shader(); if (!p.frag_impl) p.frag_impl = p.impl->get_fragment_shader(); } -GLint GetLinkStatus(GLuint program) { - if (auto* p = ctx->programs.find(program)) { - return p->impl ? 
1 : 0; - } - return 0; -} - void BindAttribLocation(GLuint program, GLuint index, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return; - } p.impl->bind_attrib(name, index); } GLint GetAttribLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } return p.impl->get_attrib(name); } GLint GetUniformLocation(GLuint program, char* name) { Program& p = ctx->programs[program]; assert(p.impl); - if (!p.impl) { - return -1; - } GLint loc = p.impl->get_uniform(name); // debugf("location: %d\n", loc); return loc; @@ -1482,15 +1161,7 @@ static uint64_t get_time_value() { #ifdef __MACH__ return mach_absolute_time(); #elif defined(_WIN32) - LARGE_INTEGER time; - static bool have_frequency = false; - static LARGE_INTEGER frequency; - if (!have_frequency) { - QueryPerformanceFrequency(&frequency); - have_frequency = true; - } - QueryPerformanceCounter(&time); - return time.QuadPart * 1000000000ULL / frequency.QuadPart; + return uint64_t(clock()) * (1000000000ULL / CLOCKS_PER_SEC); #else return ({ struct timespec tp; @@ -1583,113 +1254,60 @@ void PixelStorei(GLenum name, GLint param) { static GLenum remap_internal_format(GLenum format) { switch (format) { case GL_DEPTH_COMPONENT: - return GL_DEPTH_COMPONENT24; + return GL_DEPTH_COMPONENT16; case GL_RGBA: return GL_RGBA8; case GL_RED: return GL_R8; - case GL_RG: - return GL_RG8; - case GL_RGB_422_APPLE: - return GL_RGB_RAW_422_APPLE; default: return format; } } -} // extern "C" - -static bool format_requires_conversion(GLenum external_format, - GLenum internal_format) { - switch (external_format) { - case GL_RGBA: - return internal_format == GL_RGBA8; - default: - return false; - } -} - -static inline void copy_bgra8_to_rgba8(uint32_t* dest, const uint32_t* src, - int width) { - for (; width >= 4; width -= 4, dest += 4, src += 4) { - U32 p = unaligned_load<U32>(src); - U32 rb = p & 0x00FF00FF; - unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); - } - for (; width > 0; width--, dest++, src++) { - uint32_t p = *src; - uint32_t rb = p & 0x00FF00FF; - *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); - } -} - -static void convert_copy(GLenum external_format, GLenum internal_format, - uint8_t* dst_buf, size_t dst_stride, - const uint8_t* src_buf, size_t src_stride, - size_t width, size_t height) { - switch (external_format) { - case GL_RGBA: - if (internal_format == GL_RGBA8) { - for (; height; height--) { - copy_bgra8_to_rgba8((uint32_t*)dst_buf, (const uint32_t*)src_buf, - width); - dst_buf += dst_stride; - src_buf += src_stride; - } - return; - } - break; - default: - break; - } - size_t row_bytes = width * bytes_for_internal_format(internal_format); - for (; height; height--) { - memcpy(dst_buf, src_buf, row_bytes); - dst_buf += dst_stride; - src_buf += src_stride; +void TexStorage3D(GLenum target, GLint levels, GLenum internal_format, + GLsizei width, GLsizei height, GLsizei depth) { + assert(levels == 1); + Texture& t = ctx->textures[ctx->get_binding(target)]; + internal_format = remap_internal_format(internal_format); + bool changed = false; + if (t.width != width || t.height != height || t.depth != depth || + t.internal_format != internal_format) { + changed = true; + t.internal_format = internal_format; + t.width = width; + t.height = height; + t.depth = depth; } + t.disable_delayed_clear(); + t.allocate(changed); } -static void set_tex_storage(Texture& t, GLenum external_format, GLsizei width, - GLsizei height, void* buf = 
nullptr, - GLsizei stride = 0, GLsizei min_width = 0, - GLsizei min_height = 0) { - GLenum internal_format = remap_internal_format(external_format); +static void set_tex_storage(Texture& t, GLenum internal_format, + GLsizei width, GLsizei height, + bool should_free = true, void* buf = nullptr, + GLsizei min_width = 0, GLsizei min_height = 0) { + internal_format = remap_internal_format(internal_format); bool changed = false; - if (t.width != width || t.height != height || + if (t.width != width || t.height != height || t.depth != 0 || t.internal_format != internal_format) { changed = true; t.internal_format = internal_format; t.width = width; t.height = height; + t.depth = 0; } - // If we are changed from an internally managed buffer to an externally - // supplied one or vice versa, ensure that we clean up old buffer state. - // However, if we have to convert the data from a non-native format, then - // always treat it as internally managed since we will need to copy to an - // internally managed native format buffer. - bool should_free = buf == nullptr || format_requires_conversion( - external_format, internal_format); - if (t.should_free() != should_free) { - changed = true; - t.cleanup(); + if (t.should_free() != should_free || buf != nullptr) { + if (t.should_free()) { + t.cleanup(); + } t.set_should_free(should_free); - } - // If now an external buffer, explicitly set it... - if (!should_free) { - t.set_buffer(buf, stride); + t.buf = (char*)buf; + t.buf_size = 0; } t.disable_delayed_clear(); t.allocate(changed, min_width, min_height); - // If we have a buffer that needs format conversion, then do that now. - if (buf && should_free) { - convert_copy(external_format, internal_format, (uint8_t*)t.buf, t.stride(), - (const uint8_t*)buf, stride, width, height); - } } -extern "C" { - void TexStorage2D(GLenum target, GLint levels, GLenum internal_format, GLsizei width, GLsizei height) { assert(levels == 1); @@ -1701,19 +1319,12 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { if (format == GL_RED && ty == GL_UNSIGNED_BYTE) { return GL_R8; } else if ((format == GL_RGBA || format == GL_BGRA) && - (ty == GL_UNSIGNED_BYTE || ty == GL_UNSIGNED_INT_8_8_8_8_REV)) { + ty == GL_UNSIGNED_BYTE) { return GL_RGBA8; } else if (format == GL_RGBA && ty == GL_FLOAT) { return GL_RGBA32F; } else if (format == GL_RGBA_INTEGER && ty == GL_INT) { return GL_RGBA32I; - } else if (format == GL_RG && ty == GL_UNSIGNED_BYTE) { - return GL_RG8; - } else if (format == GL_RGB_422_APPLE && - ty == GL_UNSIGNED_SHORT_8_8_REV_APPLE) { - return GL_RGB_RAW_422_APPLE; - } else if (format == GL_RED && ty == GL_UNSIGNED_SHORT) { - return GL_R16; } else { debugf("unknown internal format for format %x, type %x\n", format, ty); assert(false); @@ -1721,6 +1332,20 @@ GLenum internal_format_for_data(GLenum format, GLenum ty) { } } +static inline void copy_bgra8_to_rgba8(uint32_t* dest, uint32_t* src, + int width) { + for (; width >= 4; width -= 4, dest += 4, src += 4) { + U32 p = unaligned_load<U32>(src); + U32 rb = p & 0x00FF00FF; + unaligned_store(dest, (p & 0xFF00FF00) | (rb << 16) | (rb >> 16)); + } + for (; width > 0; width--, dest++, src++) { + uint32_t p = *src; + uint32_t rb = p & 0x00FF00FF; + *dest = (p & 0xFF00FF00) | (rb << 16) | (rb >> 16); + } +} + static Buffer* get_pixel_pack_buffer() { return ctx->pixel_pack_buffer_binding ? 
&ctx->buffers[ctx->pixel_pack_buffer_binding] @@ -1750,10 +1375,7 @@ static void* get_pixel_unpack_buffer_data(void* data) { void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } data = get_pixel_unpack_buffer_data(data); if (!data) return; Texture& t = ctx->textures[ctx->get_binding(target)]; @@ -1765,33 +1387,84 @@ void TexSubImage2D(GLenum target, GLint level, GLint xoffset, GLint yoffset, GLsizei row_length = ctx->unpack_row_length != 0 ? ctx->unpack_row_length : width; assert(t.internal_format == internal_format_for_data(format, ty)); - int src_bpp = format_requires_conversion(format, t.internal_format) - ? bytes_for_internal_format(format) - : t.bpp(); - if (!src_bpp || !t.buf) return; - convert_copy(format, t.internal_format, - (uint8_t*)t.sample_ptr(xoffset, yoffset), t.stride(), - (const uint8_t*)data, row_length * src_bpp, width, height); + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + size_t dest_stride = t.stride(bpp); + char* dest = t.sample_ptr(xoffset, yoffset, 0, bpp, dest_stride); + char* src = (char*)data; + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } } void TexImage2D(GLenum target, GLint level, GLint internal_format, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum ty, void* data) { - if (level != 0) { - assert(false); - return; - } + if (level != 0) { assert(false); return; } assert(border == 0); TexStorage2D(target, 1, internal_format, width, height); TexSubImage2D(target, 0, 0, 0, width, height, format, ty, data); } +void TexSubImage3D(GLenum target, GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + data = get_pixel_unpack_buffer_data(data); + if (!data) return; + Texture& t = ctx->textures[ctx->get_binding(target)]; + prepare_texture(t); + assert(ctx->unpack_row_length == 0 || ctx->unpack_row_length >= width); + GLsizei row_length = + ctx->unpack_row_length != 0 ? 
ctx->unpack_row_length : width; + if (format == GL_BGRA) { + assert(ty == GL_UNSIGNED_BYTE); + assert(t.internal_format == GL_RGBA8); + } else { + assert(t.internal_format == internal_format_for_data(format, ty)); + } + int bpp = t.bpp(); + if (!bpp || !t.buf) return; + char* src = (char*)data; + assert(xoffset + width <= t.width); + assert(yoffset + height <= t.height); + assert(zoffset + depth <= t.depth); + size_t dest_stride = t.stride(bpp); + for (int z = 0; z < depth; z++) { + char* dest = t.sample_ptr(xoffset, yoffset, zoffset + z, bpp, dest_stride); + for (int y = 0; y < height; y++) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += dest_stride; + src += row_length * bpp; + } + } +} + +void TexImage3D(GLenum target, GLint level, GLint internal_format, + GLsizei width, GLsizei height, GLsizei depth, GLint border, + GLenum format, GLenum ty, void* data) { + if (level != 0) { assert(false); return; } + assert(border == 0); + TexStorage3D(target, 1, internal_format, width, height, depth); + TexSubImage3D(target, 0, 0, 0, 0, width, height, depth, format, ty, data); +} + void GenerateMipmap(UNUSED GLenum target) { // TODO: support mipmaps } -void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { - Texture& t = ctx->textures[texid]; +void TexParameteri(GLenum target, GLenum pname, GLint param) { + Texture& t = ctx->textures[ctx->get_binding(target)]; switch (pname) { case GL_TEXTURE_WRAP_S: assert(param == GL_CLAMP_TO_EDGE); @@ -1810,10 +1483,6 @@ void SetTextureParameter(GLuint texid, GLenum pname, GLint param) { } } -void TexParameteri(GLenum target, GLenum pname, GLint param) { - SetTextureParameter(ctx->get_binding(target), pname, param); -} - void GenTextures(int n, GLuint* result) { for (int i = 0; i < n; i++) { Texture t; @@ -1839,7 +1508,9 @@ void GenRenderbuffers(int n, GLuint* result) { void Renderbuffer::on_erase() { for (auto* fb : ctx->framebuffers) { if (fb) { - unlink(fb->color_attachment, texture); + if (unlink(fb->color_attachment, texture)) { + fb->layer = 0; + } unlink(fb->depth_attachment, texture); } } @@ -1875,11 +1546,10 @@ void RenderbufferStorage(GLenum target, GLenum internal_format, GLsizei width, } switch (internal_format) { case GL_DEPTH_COMPONENT: - case GL_DEPTH_COMPONENT16: case GL_DEPTH_COMPONENT24: case GL_DEPTH_COMPONENT32: - // Force depth format to 24 bits... - internal_format = GL_DEPTH_COMPONENT24; + // Force depth format to 16 bits... 
+ internal_format = GL_DEPTH_COMPONENT16; break; } set_tex_storage(ctx->textures[r.texture], internal_format, width, height); @@ -1963,8 +1633,7 @@ void VertexAttribDivisor(GLuint index, GLuint divisor) { va.divisor = divisor; } -void BufferData(GLenum target, GLsizeiptr size, void* data, - UNUSED GLenum usage) { +void BufferData(GLenum target, GLsizeiptr size, void* data, UNUSED GLenum usage) { Buffer& b = ctx->buffers[ctx->get_binding(target)]; if (b.allocate(size)) { ctx->validate_vertex_array = true; @@ -2004,23 +1673,17 @@ GLboolean UnmapBuffer(GLenum target) { void Uniform1i(GLint location, GLint V0) { // debugf("tex: %d\n", (int)ctx->textures.size); - if (vertex_shader) { - vertex_shader->set_uniform_1i(location, V0); - } + vertex_shader->set_uniform_1i(location, V0); } void Uniform4fv(GLint location, GLsizei count, const GLfloat* v) { assert(count == 1); - if (vertex_shader) { - vertex_shader->set_uniform_4fv(location, v); - } + vertex_shader->set_uniform_4fv(location, v); } void UniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, const GLfloat* value) { assert(count == 1); assert(!transpose); - if (vertex_shader) { - vertex_shader->set_uniform_matrix4fv(location, value); - } + vertex_shader->set_uniform_matrix4fv(location, value); } void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, @@ -2031,7 +1694,24 @@ void FramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = texture; + fb.layer = 0; + } else if (attachment == GL_DEPTH_ATTACHMENT) { + fb.depth_attachment = texture; + } else { + assert(0); + } +} + +void FramebufferTextureLayer(GLenum target, GLenum attachment, GLuint texture, + GLint level, GLint layer) { + assert(target == GL_READ_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER); + assert(level == 0); + Framebuffer& fb = ctx->framebuffers[ctx->get_binding(target)]; + if (attachment == GL_COLOR_ATTACHMENT0) { + fb.color_attachment = texture; + fb.layer = layer; } else if (attachment == GL_DEPTH_ATTACHMENT) { + assert(layer == 0); fb.depth_attachment = texture; } else { assert(0); @@ -2046,6 +1726,7 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, Renderbuffer& rb = ctx->renderbuffers[renderbuffer]; if (attachment == GL_COLOR_ATTACHMENT0) { fb.color_attachment = rb.texture; + fb.layer = 0; } else if (attachment == GL_DEPTH_ATTACHMENT) { fb.depth_attachment = rb.texture; } else { @@ -2055,18 +1736,11 @@ void FramebufferRenderbuffer(GLenum target, GLenum attachment, } // extern "C" -static inline Framebuffer* get_framebuffer(GLenum target, - bool fallback = false) { +static inline Framebuffer* get_framebuffer(GLenum target) { if (target == GL_FRAMEBUFFER) { target = GL_DRAW_FRAMEBUFFER; } - Framebuffer* fb = ctx->framebuffers.find(ctx->get_binding(target)); - if (fallback && !fb) { - // If the specified framebuffer isn't found and a fallback is requested, - // use the default framebuffer. 
- fb = &ctx->framebuffers[0]; - } - return fb; + return ctx->framebuffers.find(ctx->get_binding(target)); } template <typename T> @@ -2092,7 +1766,9 @@ static inline uint32_t clear_chunk(uint16_t value) { return uint32_t(value) | (uint32_t(value) << 16); } -static inline uint32_t clear_chunk(uint32_t value) { return value; } +static inline uint32_t clear_chunk(uint32_t value) { + return value; +} template <typename T> static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { @@ -2115,22 +1791,20 @@ static inline void clear_row(T* buf, size_t len, T value, uint32_t chunk) { } template <typename T> -static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, - int skip_end = 0) { +static void clear_buffer(Texture& t, T value, int layer, IntRect bb, + int skip_start = 0, int skip_end = 0) { if (!t.buf) return; skip_start = max(skip_start, bb.x0); skip_end = max(skip_end, skip_start); assert(sizeof(T) == t.bpp()); - size_t stride = t.stride(); - // When clearing multiple full-width rows, collapse them into a single large - // "row" to avoid redundant setup from clearing each row individually. Note - // that we can only safely do this if the stride is tightly packed. - if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end && - (t.should_free() || stride == t.width * sizeof(T))) { + size_t stride = t.stride(sizeof(T)); + // When clearing multiple full-width rows, collapse them into a single + // large "row" to avoid redundant setup from clearing each row individually. + if (bb.width() == t.width && bb.height() > 1 && skip_start >= skip_end) { bb.x1 += (stride / sizeof(T)) * (bb.height() - 1); bb.y1 = bb.y0 + 1; } - T* buf = (T*)t.sample_ptr(bb.x0, bb.y0); + T* buf = (T*)t.sample_ptr(bb.x0, bb.y0, layer, sizeof(T), stride); uint32_t chunk = clear_chunk(value); for (int rows = bb.height(); rows > 0; rows--) { if (bb.x0 < skip_start) { @@ -2144,12 +1818,20 @@ static void clear_buffer(Texture& t, T value, IntRect bb, int skip_start = 0, } template <typename T> +static inline void clear_buffer(Texture& t, T value, int layer = 0) { + IntRect bb = ctx->apply_scissor(t.bounds()); + if (bb.width() > 0) { + clear_buffer<T>(t, value, layer, bb); + } +} + +template <typename T> static inline void force_clear_row(Texture& t, int y, int skip_start = 0, int skip_end = 0) { assert(t.buf != nullptr); assert(sizeof(T) == t.bpp()); assert(skip_start <= skip_end); - T* buf = (T*)t.sample_ptr(0, y); + T* buf = (T*)t.sample_ptr(0, y, 0, sizeof(T)); uint32_t chunk = clear_chunk((T)t.clear_val); if (skip_start > 0) { clear_row<T>(buf, skip_start, t.clear_val, chunk); @@ -2188,9 +1870,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { while (mask) { int count = __builtin_ctz(mask); if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; start += count; mask >>= count; @@ -2201,9 +1883,9 @@ static void force_clear(Texture& t, const IntRect* skip = nullptr) { } int count = (i + 1) * 32 - start; if (count > 0) { - clear_buffer<T>(t, t.clear_val, - IntRect{0, start, t.width, start + count}, skip_start, - skip_end); + clear_buffer<T>(t, t.clear_val, 0, + IntRect{0, start, t.width, start + count}, + skip_start, skip_end); t.delay_clear -= count; } } @@ -2220,7 +1902,7 @@ static void prepare_texture(Texture& t, const IntRect* skip) { case GL_R8: 
force_clear<uint8_t>(t, skip); break; - case GL_RG8: + case GL_DEPTH_COMPONENT16: force_clear<uint16_t>(t, skip); break; default: @@ -2230,53 +1912,31 @@ static void prepare_texture(Texture& t, const IntRect* skip) { } } -// Setup a clear on a texture. This may either force an immediate clear or -// potentially punt to a delayed clear, if applicable. -template <typename T> -static void request_clear(Texture& t, T value, const IntRect& scissor) { - // If the clear would require a scissor, force clear anything outside - // the scissor, and then immediately clear anything inside the scissor. - if (!scissor.contains(t.offset_bounds())) { - IntRect skip = scissor - t.offset; - force_clear<T>(t, &skip); - clear_buffer<T>(t, value, skip.intersection(t.bounds())); - } else { - // Do delayed clear for 2D texture without scissor. - t.enable_delayed_clear(value); - } -} - -template <typename T> -static inline void request_clear(Texture& t, T value) { - // If scissoring is enabled, use the scissor rect. Otherwise, just scissor to - // the entire texture bounds. - request_clear(t, value, ctx->scissortest ? ctx->scissor : t.offset_bounds()); -} - extern "C" { -void InitDefaultFramebuffer(int x, int y, int width, int height, int stride, - void* buf) { +void InitDefaultFramebuffer(int width, int height) { Framebuffer& fb = ctx->framebuffers[0]; if (!fb.color_attachment) { GenTextures(1, &fb.color_attachment); + fb.layer = 0; } - // If the dimensions or buffer properties changed, we need to reallocate - // the underlying storage for the color buffer texture. Texture& colortex = ctx->textures[fb.color_attachment]; - set_tex_storage(colortex, GL_RGBA8, width, height, buf, stride); - colortex.offset = IntPoint(x, y); + if (colortex.width != width || colortex.height != height) { + colortex.cleanup(); + set_tex_storage(colortex, GL_RGBA8, width, height); + } if (!fb.depth_attachment) { GenTextures(1, &fb.depth_attachment); } - // Ensure dimensions of the depth buffer match the color buffer. Texture& depthtex = ctx->textures[fb.depth_attachment]; - set_tex_storage(depthtex, GL_DEPTH_COMPONENT24, width, height); - depthtex.offset = IntPoint(x, y); + if (depthtex.width != width || depthtex.height != height) { + depthtex.cleanup(); + set_tex_storage(depthtex, GL_DEPTH_COMPONENT16, width, height); + } } void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, - int32_t* height, int32_t* stride) { + int32_t* height) { Framebuffer* fb = ctx->framebuffers.find(fbo); if (!fb || !fb->color_attachment) { return nullptr; @@ -2285,33 +1945,16 @@ void* GetColorBuffer(GLuint fbo, GLboolean flush, int32_t* width, if (flush) { prepare_texture(colortex); } - assert(colortex.offset == IntPoint(0, 0)); - if (width) { - *width = colortex.width; - } - if (height) { - *height = colortex.height; - } - if (stride) { - *stride = colortex.stride(); - } - return colortex.buf ? colortex.sample_ptr(0, 0) : nullptr; -} - -void ResolveFramebuffer(GLuint fbo) { - Framebuffer* fb = ctx->framebuffers.find(fbo); - if (!fb || !fb->color_attachment) { - return; - } - Texture& colortex = ctx->textures[fb->color_attachment]; - prepare_texture(colortex); + *width = colortex.width; + *height = colortex.height; + return colortex.buf ? 
colortex.sample_ptr(0, 0, fb->layer) : nullptr; } void SetTextureBuffer(GLuint texid, GLenum internal_format, GLsizei width, - GLsizei height, GLsizei stride, void* buf, - GLsizei min_width, GLsizei min_height) { + GLsizei height, void* buf, GLsizei min_width, + GLsizei min_height) { Texture& t = ctx->textures[texid]; - set_tex_storage(t, internal_format, width, height, buf, stride, min_width, + set_tex_storage(t, internal_format, width, height, !buf, buf, min_width, min_height); } @@ -2323,170 +1966,57 @@ GLenum CheckFramebufferStatus(GLenum target) { return GL_FRAMEBUFFER_COMPLETE; } -void ClearTexSubImage(GLuint texture, GLint level, GLint xoffset, GLint yoffset, - GLint zoffset, GLsizei width, GLsizei height, - GLsizei depth, GLenum format, GLenum type, - const void* data) { - if (level != 0) { - assert(false); - return; - } - Texture& t = ctx->textures[texture]; - assert(!t.locked); - if (width <= 0 || height <= 0 || depth <= 0) { - return; - } - assert(zoffset == 0 && depth == 1); - IntRect scissor = {xoffset, yoffset, xoffset + width, yoffset + height}; - if (t.internal_format == GL_DEPTH_COMPONENT24) { - uint32_t value = 0xFFFFFF; - switch (format) { - case GL_DEPTH_COMPONENT: - switch (type) { - case GL_DOUBLE: - value = uint32_t(*(const GLdouble*)data * 0xFFFFFF); - break; - case GL_FLOAT: - value = uint32_t(*(const GLfloat*)data * 0xFFFFFF); - break; - default: - assert(false); - break; - } - break; - default: - assert(false); - break; - } - if (t.cleared() && !scissor.contains(t.offset_bounds())) { - // If we need to scissor the clear and the depth buffer was already - // initialized, then just fill runs for that scissor area. - t.fill_depth_runs(value, scissor); - } else { - // Otherwise, the buffer is either uninitialized or the clear would - // encompass the entire buffer. If uninitialized, we can safely fill - // the entire buffer with any value and thus ignore any scissoring. - t.init_depth_runs(value); - } - return; - } - - uint32_t color = 0xFF000000; - switch (type) { - case GL_FLOAT: { - const GLfloat* f = (const GLfloat*)data; - Float v = {0.0f, 0.0f, 0.0f, 1.0f}; - switch (format) { - case GL_RGBA: - v.w = f[3]; // alpha - FALLTHROUGH; - case GL_RGB: - v.z = f[2]; // blue - FALLTHROUGH; - case GL_RG: - v.y = f[1]; // green - FALLTHROUGH; - case GL_RED: - v.x = f[0]; // red - break; - default: - assert(false); - break; - } - color = bit_cast<uint32_t>(CONVERT(round_pixel(v), U8)); - break; - } - case GL_UNSIGNED_BYTE: { - const GLubyte* b = (const GLubyte*)data; - switch (format) { - case GL_RGBA: - color = (color & ~0xFF000000) | (uint32_t(b[3]) << 24); // alpha - FALLTHROUGH; - case GL_RGB: - color = (color & ~0x00FF0000) | (uint32_t(b[2]) << 16); // blue - FALLTHROUGH; - case GL_RG: - color = (color & ~0x0000FF00) | (uint32_t(b[1]) << 8); // green - FALLTHROUGH; - case GL_RED: - color = (color & ~0x000000FF) | uint32_t(b[0]); // red - break; - default: - assert(false); - break; - } - break; - } - default: - assert(false); - break; - } - - switch (t.internal_format) { - case GL_RGBA8: - // Clear color needs to swizzle to BGRA. 
- request_clear<uint32_t>(t, - (color & 0xFF00FF00) | - ((color << 16) & 0xFF0000) | - ((color >> 16) & 0xFF), - scissor); - break; - case GL_R8: - request_clear<uint8_t>(t, uint8_t(color & 0xFF), scissor); - break; - case GL_RG8: - request_clear<uint16_t>(t, uint16_t(color & 0xFFFF), scissor); - break; - default: - assert(false); - break; - } -} - -void ClearTexImage(GLuint texture, GLint level, GLenum format, GLenum type, - const void* data) { - Texture& t = ctx->textures[texture]; - IntRect scissor = t.offset_bounds(); - ClearTexSubImage(texture, level, scissor.x0, scissor.y0, 0, scissor.width(), - scissor.height(), 1, format, type, data); +static inline bool clear_requires_scissor(Texture& t) { + return ctx->scissortest && !ctx->scissor.contains(t.bounds()); } void Clear(GLbitfield mask) { - Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true); + Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER); if ((mask & GL_COLOR_BUFFER_BIT) && fb.color_attachment) { Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = ctx->scissortest - ? ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - ctx->clearcolor); + if (t.internal_format == GL_RGBA8) { + uint32_t color = ctx->clearcolor; + // If the clear would require a scissor, force clear anything outside + // the scissor, and then immediately clear anything inside the scissor. + if (clear_requires_scissor(t)) { + force_clear<uint32_t>(t, &ctx->scissor); + clear_buffer<uint32_t>(t, color, fb.layer); + } else if (t.depth > 1) { + // Delayed clear is not supported on texture arrays. + t.disable_delayed_clear(); + clear_buffer<uint32_t>(t, color, fb.layer); + } else { + // Do delayed clear for 2D texture without scissor. + t.enable_delayed_clear(color); + } + } else if (t.internal_format == GL_R8) { + uint8_t color = uint8_t((ctx->clearcolor >> 16) & 0xFF); + if (clear_requires_scissor(t)) { + force_clear<uint8_t>(t, &ctx->scissor); + clear_buffer<uint8_t>(t, color, fb.layer); + } else if (t.depth > 1) { + t.disable_delayed_clear(); + clear_buffer<uint8_t>(t, color, fb.layer); + } else { + t.enable_delayed_clear(color); + } + } else { + assert(false); + } } if ((mask & GL_DEPTH_BUFFER_BIT) && fb.depth_attachment) { Texture& t = ctx->textures[fb.depth_attachment]; - IntRect scissor = ctx->scissortest - ? 
ctx->scissor.intersection(t.offset_bounds()) - : t.offset_bounds(); - ClearTexSubImage(fb.depth_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_DEPTH_COMPONENT, - GL_DOUBLE, &ctx->cleardepth); + assert(t.internal_format == GL_DEPTH_COMPONENT16); + uint16_t depth = uint16_t(0xFFFF * ctx->cleardepth) - 0x8000; + if (clear_requires_scissor(t)) { + force_clear<uint16_t>(t, &ctx->scissor); + clear_buffer<uint16_t>(t, depth); + } else { + t.enable_delayed_clear(depth); + } } } -void ClearColorRect(GLuint fbo, GLint xoffset, GLint yoffset, GLsizei width, - GLsizei height, GLfloat r, GLfloat g, GLfloat b, - GLfloat a) { - GLfloat color[] = {r, g, b, a}; - Framebuffer& fb = ctx->framebuffers[fbo]; - Texture& t = ctx->textures[fb.color_attachment]; - IntRect scissor = - IntRect{xoffset, yoffset, xoffset + width, yoffset + height}.intersection( - t.offset_bounds()); - ClearTexSubImage(fb.color_attachment, 0, scissor.x0, scissor.y0, 0, - scissor.width(), scissor.height(), 1, GL_RGBA, GL_FLOAT, - color); -} - void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, const GLenum* attachments) { Framebuffer* fb = get_framebuffer(target); @@ -2497,7 +2027,7 @@ void InvalidateFramebuffer(GLenum target, GLsizei num_attachments, switch (attachments[i]) { case GL_DEPTH_ATTACHMENT: { Texture& t = ctx->textures[fb->depth_attachment]; - t.set_cleared(false); + t.disable_delayed_clear(); break; } case GL_COLOR_ATTACHMENT0: { @@ -2516,58 +2046,40 @@ void ReadPixels(GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; assert(format == GL_RED || format == GL_RGBA || format == GL_RGBA_INTEGER || - format == GL_BGRA || format == GL_RG); + format == GL_BGRA); Texture& t = ctx->textures[fb->color_attachment]; if (!t.buf) return; prepare_texture(t); // debugf("read pixels %d, %d, %d, %d from fb %d with format %x\n", x, y, // width, height, ctx->read_framebuffer_binding, t.internal_format); - x -= t.offset.x; - y -= t.offset.y; - assert(x >= 0 && y >= 0); assert(x + width <= t.width); assert(y + height <= t.height); if (internal_format_for_data(format, type) != t.internal_format) { debugf("mismatched format for read pixels: %x vs %x\n", t.internal_format, internal_format_for_data(format, type)); assert(false); - return; - } - // Only support readback conversions that are reversible - assert(!format_requires_conversion(format, t.internal_format) || - bytes_for_internal_format(format) == t.bpp()); - uint8_t* dest = (uint8_t*)data; - size_t destStride = width * t.bpp(); - if (y < 0) { - dest += -y * destStride; - height += y; - y = 0; - } - if (y + height > t.height) { - height = t.height - y; - } - if (x < 0) { - dest += -x * t.bpp(); - width += x; - x = 0; } - if (x + width > t.width) { - width = t.width - x; - } - if (width <= 0 || height <= 0) { - return; + int bpp = t.bpp(); + char* dest = (char*)data; + size_t src_stride = t.stride(bpp); + char* src = t.sample_ptr(x, y, fb->layer, bpp, src_stride); + for (; height > 0; height--) { + if (t.internal_format == GL_RGBA8 && format != GL_BGRA) { + copy_bgra8_to_rgba8((uint32_t*)dest, (uint32_t*)src, width); + } else { + memcpy(dest, src, width * bpp); + } + dest += width * bpp; + src += src_stride; } - convert_copy(format, t.internal_format, dest, destStride, - (const uint8_t*)t.sample_ptr(x, y), t.stride(), width, height); } void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint 
dstName, - GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, - GLint dstY, GLint dstZ, GLsizei srcWidth, - GLsizei srcHeight, GLsizei srcDepth) { + GLenum dstTarget, UNUSED GLint dstLevel, GLint dstX, GLint dstY, + GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, + GLsizei srcDepth) { assert(srcLevel == 0 && dstLevel == 0); - assert(srcZ == 0 && srcDepth == 1 && dstZ == 0); if (srcTarget == GL_RENDERBUFFER) { Renderbuffer& rb = ctx->renderbuffers[srcName]; srcName = rb.texture; @@ -2581,44 +2093,532 @@ void CopyImageSubData(GLuint srcName, GLenum srcTarget, UNUSED GLint srcLevel, prepare_texture(srctex); Texture& dsttex = ctx->textures[dstName]; if (!dsttex.buf) return; - assert(!dsttex.locked); IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight}; prepare_texture(dsttex, &skip); assert(srctex.internal_format == dsttex.internal_format); assert(srcWidth >= 0); assert(srcHeight >= 0); + assert(srcDepth >= 0); assert(srcX + srcWidth <= srctex.width); assert(srcY + srcHeight <= srctex.height); + assert(srcZ + srcDepth <= max(srctex.depth, 1)); assert(dstX + srcWidth <= dsttex.width); assert(dstY + srcHeight <= dsttex.height); + assert(dstZ + srcDepth <= max(dsttex.depth, 1)); int bpp = srctex.bpp(); - int src_stride = srctex.stride(); - int dest_stride = dsttex.stride(); - char* dest = dsttex.sample_ptr(dstX, dstY); - char* src = srctex.sample_ptr(srcX, srcY); - for (int y = 0; y < srcHeight; y++) { - memcpy(dest, src, srcWidth * bpp); - dest += dest_stride; - src += src_stride; + int src_stride = srctex.stride(bpp); + int dest_stride = dsttex.stride(bpp); + for (int z = 0; z < srcDepth; z++) { + char* dest = dsttex.sample_ptr(dstX, dstY, dstZ + z, bpp, dest_stride); + char* src = srctex.sample_ptr(srcX, srcY, srcZ + z, bpp, src_stride); + for (int y = 0; y < srcHeight; y++) { + memcpy(dest, src, srcWidth * bpp); + dest += dest_stride; + src += src_stride; + } } } -void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, - GLint yoffset, GLint x, GLint y, GLsizei width, +void CopyTexSubImage3D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint zoffset, GLint x, GLint y, GLsizei width, GLsizei height) { assert(level == 0); Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); if (!fb) return; - CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D, 0, x, y, 0, - ctx->get_binding(target), GL_TEXTURE_2D, 0, xoffset, yoffset, - 0, width, height, 1); + CopyImageSubData(fb->color_attachment, GL_TEXTURE_3D, 0, x, y, fb->layer, + ctx->get_binding(target), GL_TEXTURE_3D, 0, xoffset, yoffset, + zoffset, width, height, 1); +} + +void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset, GLint yoffset, + GLint x, GLint y, GLsizei width, GLsizei height) { + assert(level == 0); + Framebuffer* fb = get_framebuffer(GL_READ_FRAMEBUFFER); + if (!fb) return; + CopyImageSubData(fb->color_attachment, GL_TEXTURE_2D_ARRAY, 0, x, y, + fb->layer, ctx->get_binding(target), GL_TEXTURE_2D_ARRAY, 0, + xoffset, yoffset, 0, width, height, 1); } } // extern "C" -#include "blend.h" -#include "composite.h" -#include "swgl_ext.h" +using PackedRGBA8 = V16<uint8_t>; +using WideRGBA8 = V16<uint16_t>; +using HalfRGBA8 = V8<uint16_t>; + +static inline WideRGBA8 unpack(PackedRGBA8 p) { return CONVERT(p, WideRGBA8); } + +static inline PackedRGBA8 pack(WideRGBA8 p) { +#if USE_SSE2 + return _mm_packus_epi16(lowHalf(p), highHalf(p)); +#elif USE_NEON + return vcombine_u8(vqmovn_u16(lowHalf(p)), vqmovn_u16(highHalf(p))); +#else + return CONVERT(p, PackedRGBA8); +#endif +} + 
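The unpack/pack pair introduced here is the workhorse of the blending code that follows: unpack zero-extends each 8-bit color channel into a 16-bit lane so intermediate blend math cannot wrap, and pack saturates the widened lanes back down to 8 bits. A rough scalar sketch of that round-trip (illustrative only; widen/narrow are hypothetical names, not part of swgl):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar stand-in for unpack(): zero-extend a channel to 16 bits.
static inline uint16_t widen(uint8_t c) { return c; }

// Scalar stand-in for pack(): saturating narrow back to 8 bits, as
// _mm_packus_epi16 and vqmovn_u16 do per SIMD lane.
static inline uint8_t narrow(uint16_t w) {
  return uint8_t(std::min<int>(w, 255));
}

int main() {
  uint16_t sum = widen(200) + widen(180);  // 380 survives in 16 bits
  printf("%d -> %d\n", sum, narrow(sum));  // prints "380 -> 255"
  return 0;
}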
+static inline HalfRGBA8 packRGBA8(I32 a, I32 b) { +#if USE_SSE2 + return _mm_packs_epi32(a, b); +#elif USE_NEON + return vcombine_u16(vqmovun_s32(a), vqmovun_s32(b)); +#else + return CONVERT(combine(a, b), HalfRGBA8); +#endif +} + +using PackedR8 = V4<uint8_t>; +using WideR8 = V4<uint16_t>; + +static inline WideR8 unpack(PackedR8 p) { return CONVERT(p, WideR8); } + +static inline WideR8 packR8(I32 a) { +#if USE_SSE2 + return lowHalf(bit_cast<V8<uint16_t>>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovun_s32(a); +#else + return CONVERT(a, WideR8); +#endif +} + +static inline PackedR8 pack(WideR8 p) { +#if USE_SSE2 + auto m = expand(p); + auto r = bit_cast<V16<uint8_t>>(_mm_packus_epi16(m, m)); + return SHUFFLE(r, r, 0, 1, 2, 3); +#elif USE_NEON + return lowHalf(bit_cast<V8<uint8_t>>(vqmovn_u16(expand(p)))); +#else + return CONVERT(p, PackedR8); +#endif +} + +using ZMask4 = V4<int16_t>; +using ZMask8 = V8<int16_t>; + +static inline PackedRGBA8 unpack(ZMask4 mask, uint32_t*) { + return bit_cast<PackedRGBA8>(mask.xxyyzzww); +} + +static inline WideR8 unpack(ZMask4 mask, uint8_t*) { + return bit_cast<WideR8>(mask); +} + +#if USE_SSE2 +# define ZMASK_NONE_PASSED 0xFFFF +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask8 mask) { + return _mm_movemask_epi8(mask); +} +static inline uint32_t zmask_code(ZMask4 mask) { + return zmask_code(mask.xyzwxyzw); +} +#else +using ZMask4Code = V4<uint8_t>; +using ZMask8Code = V8<uint8_t>; +# define ZMASK_NONE_PASSED 0xFFFFFFFFU +# define ZMASK_ALL_PASSED 0 +static inline uint32_t zmask_code(ZMask4 mask) { + return bit_cast<uint32_t>(CONVERT(mask, ZMask4Code)); +} +static inline uint32_t zmask_code(ZMask8 mask) { + return zmask_code( + ZMask4((U16(lowHalf(mask)) >> 12) | (U16(highHalf(mask)) << 4))); +} +#endif + +template <int FUNC, bool MASK> +static ALWAYS_INLINE int check_depth8(uint16_t z, uint16_t* zbuf, + ZMask8& outmask) { + ZMask8 dest = unaligned_load<ZMask8>(zbuf); + ZMask8 src = int16_t(z); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask8 mask = FUNC == GL_LEQUAL ? + // GL_LEQUAL: Not(LessEqual) = Greater + ZMask8(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask8(src >= dest); + switch (zmask_code(mask)) { + case ZMASK_NONE_PASSED: + return 0; + case ZMASK_ALL_PASSED: + if (MASK) { + unaligned_store(zbuf, src); + } + return -1; + default: + if (MASK) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return 1; + } +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(ZMask4 src, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + // Invert the depth test to check which pixels failed and should be discarded. + ZMask4 mask = ctx->depthfunc == GL_LEQUAL + ? 
+ // GL_LEQUAL: Not(LessEqual) = Greater + ZMask4(src > dest) + : + // GL_LESS: Not(Less) = GreaterEqual + ZMask4(src >= dest); + if (!FULL_SPANS) { + mask |= ZMask4(span) < ZMask4{1, 2, 3, 4}; + } + if (zmask_code(mask) == ZMASK_NONE_PASSED) { + return false; + } + if (!DISCARD && ctx->depthmask) { + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } + outmask = mask; + return true; +} + +template <bool FULL_SPANS, bool DISCARD> +static ALWAYS_INLINE bool check_depth4(uint16_t z, uint16_t* zbuf, + ZMask4& outmask, int span = 0) { + return check_depth4<FULL_SPANS, DISCARD>(ZMask4(int16_t(z)), zbuf, outmask, + span); +} + +template <typename T> +static inline ZMask4 packZMask4(T a) { +#if USE_SSE2 + return lowHalf(bit_cast<ZMask8>(_mm_packs_epi32(a, a))); +#elif USE_NEON + return vqmovn_s32(a); +#else + return CONVERT(a, ZMask4); +#endif +} + +static ALWAYS_INLINE ZMask4 packDepth() { + return packZMask4(cast(fragment_shader->gl_FragCoord.z * 0xFFFF) - 0x8000); +} + +static ALWAYS_INLINE void discard_depth(ZMask4 src, uint16_t* zbuf, + ZMask4 mask) { + if (ctx->depthmask) { + ZMask4 dest = unaligned_load<ZMask4>(zbuf); + mask |= packZMask4(fragment_shader->isPixelDiscarded); + unaligned_store(zbuf, (mask & dest) | (~mask & src)); + } +} + +static ALWAYS_INLINE void discard_depth(uint16_t z, uint16_t* zbuf, + ZMask4 mask) { + discard_depth(ZMask4(int16_t(z)), zbuf, mask); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4& v) { + ivec4 i = round_pixel(v); + HalfRGBA8 xz = packRGBA8(i.z, i.x); + HalfRGBA8 yw = packRGBA8(i.y, i.w); + HalfRGBA8 xy = zipLow(xz, yw); + HalfRGBA8 zw = zipHigh(xz, yw); + HalfRGBA8 lo = zip2Low(xy, zw); + HalfRGBA8 hi = zip2High(xy, zw); + return combine(lo, hi); +} + +static inline WideRGBA8 pack_pixels_RGBA8(const vec4_scalar& v) { + I32 i = round_pixel((Float){v.z, v.y, v.x, v.w}); + HalfRGBA8 c = packRGBA8(i, i); + return combine(c, c); +} + +static inline WideRGBA8 pack_pixels_RGBA8() { + return pack_pixels_RGBA8(fragment_shader->gl_FragColor); +} + +template <typename V> +static inline PackedRGBA8 pack_span(uint32_t*, const V& v) { + return pack(pack_pixels_RGBA8(v)); +} + +static inline PackedRGBA8 pack_span(uint32_t*) { + return pack(pack_pixels_RGBA8()); +} + +// (x*y + x) >> 8, cheap approximation of (x*y) / 255 +template <typename T> +static inline T muldiv255(T x, T y) { + return (x * y + x) >> 8; +} + +// Byte-wise addition for when x or y is a signed 8-bit value stored in the +// low byte of a larger type T only with zeroed-out high bits, where T is +// greater than 8 bits, i.e. uint16_t. This can result when muldiv255 is used +// upon signed operands, using up all the precision in a 16 bit integer, and +// potentially losing the sign bit in the last >> 8 shift. Due to the +// properties of two's complement arithmetic, even though we've discarded the +// sign bit, we can still represent a negative number under addition (without +// requiring any extra sign bits), just that any negative number will behave +// like a large unsigned number under addition, generating a single carry bit +// on overflow that we need to discard. Thus, just doing a byte-wise add will +// overflow without the troublesome carry, giving us only the remaining 8 low +// bits we actually need while keeping the high bits at zero. 
+template <typename T> +static inline T addlow(T x, T y) { + typedef VectorType<uint8_t, sizeof(T)> bytes; + return bit_cast<T>(bit_cast<bytes>(x) + bit_cast<bytes>(y)); +} + +static inline WideRGBA8 alphas(WideRGBA8 c) { + return SHUFFLE(c, c, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); +} + +static inline WideRGBA8 blend_pixels_RGBA8(PackedRGBA8 pdst, WideRGBA8 src) { + WideRGBA8 dst = unpack(pdst); + const WideRGBA8 RGB_MASK = {0xFFFF, 0xFFFF, 0xFFFF, 0, 0xFFFF, 0xFFFF, + 0xFFFF, 0, 0xFFFF, 0xFFFF, 0xFFFF, 0, + 0xFFFF, 0xFFFF, 0xFFFF, 0}; + const WideRGBA8 ALPHA_MASK = {0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF, + 0, 0, 0, 0xFFFF, 0, 0, 0, 0xFFFF}; + const WideRGBA8 ALPHA_OPAQUE = {0, 0, 0, 255, 0, 0, 0, 255, + 0, 0, 0, 255, 0, 0, 0, 255}; + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE): + // dst + src.a*(src.rgb1 - dst.rgb0) + // use addlow for signed overflow + return addlow(dst, + muldiv255(alphas(src), (src | ALPHA_OPAQUE) - (dst & RGB_MASK))); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR): + return dst - muldiv255(dst, src); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, GL_ZERO, GL_ONE): + return dst - (muldiv255(dst, src) & RGB_MASK); + case BLEND_KEY(GL_ZERO, GL_ONE_MINUS_SRC_ALPHA): + return dst - muldiv255(dst, alphas(src)); + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ONE, GL_ONE, GL_ONE_MINUS_SRC_ALPHA): + return src + dst - (muldiv255(dst, src) & ALPHA_MASK); + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + case BLEND_KEY(GL_ONE_MINUS_DST_ALPHA, GL_ONE, GL_ZERO, GL_ONE): + // src*(1-dst.a) + dst*1 = src - src*dst.a + dst + return dst + ((src - muldiv255(src, alphas(dst))) & RGB_MASK); + case BLEND_KEY(GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_COLOR): + // src*k + (1-src)*dst = src*k + dst - src*dst = dst + src*(k - dst) + // use addlow for signed overflow + return addlow(dst, + muldiv255(src, combine(ctx->blendcolor, ctx->blendcolor) - dst)); + case BLEND_KEY(GL_ONE, GL_ONE_MINUS_SRC1_COLOR): { + WideRGBA8 secondary = + pack_pixels_RGBA8(fragment_shader->gl_SecondaryFragColor); + return src + dst - muldiv255(dst, secondary); + } + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf, PackedRGBA8 mask) { + PackedRGBA8 dst = unaligned_load<PackedRGBA8>(buf); + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(dst, r); + if (DISCARD) mask |= bit_cast<PackedRGBA8>(fragment_shader->isPixelDiscarded); + unaligned_store(buf, (mask & dst) | (~mask & pack(r))); +} + +template <bool DISCARD> +static inline void discard_output(uint32_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint32_t* buf) { + WideRGBA8 r = pack_pixels_RGBA8(); + if (blend_key) r = blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), r); + unaligned_store(buf, pack(r)); +} + +static inline PackedRGBA8 span_mask_RGBA8(int span) { + return bit_cast<PackedRGBA8>(I32(span) < I32{1, 2, 3, 4}); +} + +static inline PackedRGBA8 span_mask(uint32_t*, int span) { + return span_mask_RGBA8(span); +} + +static inline WideR8 pack_pixels_R8(Float c) { + return packR8(round_pixel(c)); +} + +static inline WideR8 pack_pixels_R8() { + return pack_pixels_R8(fragment_shader->gl_FragColor.x); +} + 
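Most of the blend cases above funnel through muldiv255, so its accuracy matters. A small standalone check (an illustrative sketch, not swgl code) confirms that the (x*y + x) >> 8 form never undershoots floor(x*y/255) and overshoots it by at most 1 for 8-bit operands:

#include <cassert>
#include <cstdio>

// Same form as the helper above: (x*y + x) >> 8 == x*(y + 1) / 256.
static inline int muldiv255(int x, int y) { return (x * y + x) >> 8; }

int main() {
  int worst = 0;
  // Exhaustively compare against the exact floored product over all
  // 8-bit operand pairs.
  for (int x = 0; x <= 255; x++) {
    for (int y = 0; y <= 255; y++) {
      int diff = muldiv255(x, y) - (x * y) / 255;
      assert(diff >= 0 && diff <= 1);
      if (diff > worst) worst = diff;
    }
  }
  printf("max deviation from floor(x*y/255): %d\n", worst);  // prints 1
  return 0;
}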
+template <typename C> +static inline PackedR8 pack_span(uint8_t*, C c) { + return pack(pack_pixels_R8(c)); +} + +static inline PackedR8 pack_span(uint8_t*) { return pack(pack_pixels_R8()); } + +static inline WideR8 blend_pixels_R8(WideR8 dst, WideR8 src) { + switch (blend_key) { + case BLEND_KEY_NONE: + return src; + case BLEND_KEY(GL_ZERO, GL_SRC_COLOR): + return muldiv255(src, dst); + case BLEND_KEY(GL_ONE, GL_ONE): + return src + dst; + case BLEND_KEY(GL_ONE, GL_ZERO): + return src; + default: + UNREACHABLE; + // return src; + } +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf, WideR8 mask) { + WideR8 dst = unpack(unaligned_load<PackedR8>(buf)); + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(dst, r); + if (DISCARD) mask |= packR8(fragment_shader->isPixelDiscarded); + unaligned_store(buf, pack((mask & dst) | (~mask & r))); +} + +template <bool DISCARD> +static inline void discard_output(uint8_t* buf) { + discard_output<DISCARD>(buf, 0); +} + +template <> +inline void discard_output<false>(uint8_t* buf) { + WideR8 r = pack_pixels_R8(); + if (blend_key) r = blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), r); + unaligned_store(buf, pack(r)); +} + +static inline WideR8 span_mask_R8(int span) { + return bit_cast<WideR8>(WideR8(span) < WideR8{1, 2, 3, 4}); +} + +static inline WideR8 span_mask(uint8_t*, int span) { + return span_mask_R8(span); +} + +template <bool DISCARD, bool W, typename P, typename M> +static inline void commit_output(P* buf, M mask) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf, mask); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf) { + fragment_shader->run<W>(); + discard_output<DISCARD>(buf); +} + +template <bool DISCARD, bool W, typename P> +static inline void commit_output(P* buf, int span) { + commit_output<DISCARD, W>(buf, span_mask(buf, span)); +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf) { + ZMask4 zmask; + if (check_depth4<true, DISCARD>(z, zbuf, zmask)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } else { + fragment_shader->skip<W>(); + } +} + +template <bool DISCARD, bool W, typename P, typename Z> +static inline void commit_output(P* buf, Z z, uint16_t* zbuf, int span) { + ZMask4 zmask; + if (check_depth4<false, DISCARD>(z, zbuf, zmask, span)) { + commit_output<DISCARD, W>(buf, unpack(zmask, buf)); + if (DISCARD) { + discard_depth(z, zbuf, zmask); + } + } +} + +static inline void commit_span(uint32_t* buf, PackedRGBA8 r) { + if (blend_key) + r = pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), unpack(r))); + unaligned_store(buf, r); +} + +UNUSED static inline void commit_solid_span(uint32_t* buf, PackedRGBA8 r, + int len) { + if (blend_key) { + auto src = unpack(r); + for (uint32_t* end = &buf[len]; buf < end; buf += 4) { + unaligned_store( + buf, pack(blend_pixels_RGBA8(unaligned_load<PackedRGBA8>(buf), src))); + } + } else { + fill_n(buf, len, bit_cast<U32>(r).x); + } +} + +UNUSED static inline void commit_texture_span(uint32_t* buf, uint32_t* src, + int len) { + if (blend_key) { + for (uint32_t* end = &buf[len]; buf < end; buf += 4, src += 4) { + PackedRGBA8 r = unaligned_load<PackedRGBA8>(src); + unaligned_store(buf, pack(blend_pixels_RGBA8( + unaligned_load<PackedRGBA8>(buf), unpack(r)))); + } + } else { + memcpy(buf, src, len * sizeof(uint32_t)); + } +} + +static inline void 
commit_span(uint8_t* buf, PackedR8 r) {
+  if (blend_key)
+    r = pack(blend_pixels_R8(unpack(unaligned_load<PackedR8>(buf)), unpack(r)));
+  unaligned_store(buf, r);
+}
+
+UNUSED static inline void commit_solid_span(uint8_t* buf, PackedR8 r, int len) {
+  if (blend_key) {
+    auto src = unpack(r);
+    for (uint8_t* end = &buf[len]; buf < end; buf += 4) {
+      unaligned_store(buf, pack(blend_pixels_R8(
+                               unpack(unaligned_load<PackedR8>(buf)), src)));
+    }
+  } else {
+    fill_n((uint32_t*)buf, len / 4, bit_cast<uint32_t>(r));
+  }
+}
+
+#define DISPATCH_DRAW_SPAN(self, buf, len) do { \
+  int drawn = self->draw_span(buf, len); \
+  if (drawn) self->step_interp_inputs(drawn >> 2); \
+  for (buf += drawn; drawn < len; drawn += 4, buf += 4) { \
+    run(self); \
+    commit_span(buf, pack_span(buf)); \
+  } \
+} while (0)
+
+#include "texture.h"
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wuninitialized"
@@ -2627,14 +2627,942 @@ void CopyTexSubImage2D(GLenum target, UNUSED GLint level, GLint xoffset,
 #pragma GCC diagnostic ignored "-Wunused-variable"
 #pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
 #ifdef __clang__
-# pragma GCC diagnostic ignored "-Wunused-private-field"
+#pragma GCC diagnostic ignored "-Wunused-private-field"
 #else
-# pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
 #endif
 #include "load_shader.h"
 #pragma GCC diagnostic pop
-#include "rasterize.h"
+typedef vec2_scalar Point2D;
+typedef vec4_scalar Point3D;
+
+struct ClipRect {
+  float x0;
+  float y0;
+  float x1;
+  float y1;
+
+  ClipRect(const IntRect& i) : x0(i.x0), y0(i.y0), x1(i.x1), y1(i.y1) {}
+  ClipRect(Texture& t) : ClipRect(ctx->apply_scissor(t.bounds())) {}
+
+  template <typename P>
+  bool overlaps(int nump, const P* p) const {
+    // Generate a mask of which side of the clip rect all of a polygon's points
+    // fall inside of. This is a cheap conservative estimate of whether the
+    // bounding box of the polygon might overlap the clip rect, rather than an
+    // exact test that would require multiple slower line intersections.
+    int sides = 0;
+    for (int i = 0; i < nump; i++) {
+      sides |= p[i].x < x1 ? (p[i].x > x0 ? 1 | 2 : 1) : 2;
+      sides |= p[i].y < y1 ? (p[i].y > y0 ? 4 | 8 : 4) : 8;
+    }
+    return sides == 0xF;
+  }
+};
+
+// Helper function for drawing 8-pixel wide chunks of a span with depth buffer.
+// Using 8-pixel chunks maximizes use of 16-bit depth values in a 128-bit wide
+// SIMD register. However, since fragment shaders process only 4 pixels per
+// invocation, we need to run the fragment shader twice for every 8-pixel
+// batch of results we get from the depth test. Perspective is not supported.
+template <int FUNC, bool MASK, typename P>
+static inline void draw_depth_span(uint16_t z, P* buf, uint16_t* depth,
+                                   int span) {
+  int skip = 0;
+  // Check if the fragment shader has an optimized draw specialization.
+  if (fragment_shader->has_draw_span(buf)) {
+    // The loop tries to accumulate runs of pixels that passed (len) and
+    // runs of pixels that failed (skip). This allows it to pass the largest
+    // possible span in between changes in depth pass or fail status to the
+    // fragment shader's draw specialization.
+    int len = 0;
+    do {
+      ZMask8 zmask;
+      // Process depth in 8-pixel chunks.
+      switch (check_depth8<FUNC, MASK>(z, depth, zmask)) {
+        case 0:  // All pixels failed the depth test.
+          if (len) {
+            // Flush out passed pixels.
+            fragment_shader->draw_span(buf - len, len);
+            len = 0;
+          }
+          // Accumulate 2 skipped chunks.
+          skip += 2;
+          break;
+        case -1:  // All pixels passed the depth test.
+          if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Accumulate 8 passed pixels.
+          len += 8;
+          break;
+        default:  // Mixture of pass and fail results.
+          if (len) {
+            // Flush out any passed pixels.
+            fragment_shader->draw_span(buf - len, len);
+            len = 0;
+          } else if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Run fragment shader on first 4 depth results.
+          commit_output<false, false>(buf, unpack(lowHalf(zmask), buf));
+          // Run fragment shader on next 4 depth results.
+          commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf));
+          break;
+      }
+      // Advance to next 8 pixels...
+      buf += 8;
+      depth += 8;
+      span -= 8;
+    } while (span >= 8);
+    // Flush out any remaining passed pixels.
+    if (len) {
+      fragment_shader->draw_span(buf - len, len);
+    }
+  } else {
+    // No draw specialization, so we can use a simpler loop here that just
+    // accumulates depth failures, but otherwise invokes the fragment shader
+    // immediately on depth pass.
+    do {
+      ZMask8 zmask;
+      // Process depth in 8-pixel chunks.
+      switch (check_depth8<FUNC, MASK>(z, depth, zmask)) {
+        case 0:  // All pixels failed the depth test.
+          // Accumulate 2 skipped chunks.
+          skip += 2;
+          break;
+        case -1:  // All pixels passed the depth test.
+          if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Run the fragment shader for two 4-pixel chunks.
+          commit_output<false, false>(buf);
+          commit_output<false, false>(buf + 4);
+          break;
+        default:  // Mixture of pass and fail results.
+          if (skip) {
+            // Flush out any skipped chunks.
+            fragment_shader->skip(skip);
+            skip = 0;
+          }
+          // Run fragment shader on first 4 depth results.
+          commit_output<false, false>(buf, unpack(lowHalf(zmask), buf));
+          // Run fragment shader on next 4 depth results.
+          commit_output<false, false>(buf + 4, unpack(highHalf(zmask), buf));
+          break;
+      }
+      // Advance to next 8 pixels...
+      buf += 8;
+      depth += 8;
+      span -= 8;
+    } while (span >= 8);
+  }
+  // Flush out any remaining skipped chunks.
+  if (skip) {
+    fragment_shader->skip(skip);
+  }
+}
+
+// Draw a simple span in 4-pixel wide chunks, optionally using depth.
+template <bool DISCARD, bool W, typename P, typename Z>
+static ALWAYS_INLINE void draw_span(P* buf, uint16_t* depth, int span, Z z) {
+  if (depth) {
+    // Depth testing is enabled. If perspective is used, Z values will vary
+    // across the span, so we use packDepth to generate 16-bit Z values
+    // suitable for depth testing based on current values from gl_FragCoord.z.
+    // Otherwise, for the no-perspective case, we just use the provided Z.
+    // Process 4-pixel chunks first.
+    for (; span >= 4; span -= 4, buf += 4, depth += 4) {
+      commit_output<DISCARD, W>(buf, z(), depth);
+    }
+    // If there are any remaining pixels, do a partial chunk.
+    if (span > 0) {
+      commit_output<DISCARD, W>(buf, z(), depth, span);
+    }
+  } else {
+    // Process 4-pixel chunks first.
+    for (; span >= 4; span -= 4, buf += 4) {
+      commit_output<DISCARD, W>(buf);
+    }
+    // If there are any remaining pixels, do a partial chunk.
+    if (span > 0) {
+      commit_output<DISCARD, W>(buf, span);
+    }
+  }
+}
+
+// Draw spans for each row of a given quad (or triangle) with a constant Z
+// value. The quad is assumed convex. It is clipped to fall within the given
+// clip rect.
In short, this function rasterizes a quad by first finding a +// top most starting point and then from there tracing down the left and right +// sides of this quad until it hits the bottom, outputting a span between the +// current left and right positions at each row along the way. Points are +// assumed to be ordered in either CW or CCW to support this, but currently +// both orders (CW and CCW) are supported and equivalent. +template <typename P> +static inline void draw_quad_spans(int nump, Point2D p[4], uint16_t z, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex, + const ClipRect& clipRect) { + // Only triangles and convex quads supported. + assert(nump == 3 || nump == 4); + Point2D l0, r0, l1, r1; + int l0i, r0i, l1i, r1i; + { + // Find the index of the top-most (smallest Y) point from which + // rasterization can start. + int top = nump > 3 && p[3].y < p[2].y + ? (p[0].y < p[1].y ? (p[0].y < p[3].y ? 0 : 3) + : (p[1].y < p[3].y ? 1 : 3)) + : (p[0].y < p[1].y ? (p[0].y < p[2].y ? 0 : 2) + : (p[1].y < p[2].y ? 1 : 2)); + // Helper to find next index in the points array, walking forward. +#define NEXT_POINT(idx) \ + ({ \ + int cur = (idx) + 1; \ + cur < nump ? cur : 0; \ + }) + // Helper to find the previous index in the points array, walking backward. +#define PREV_POINT(idx) \ + ({ \ + int cur = (idx)-1; \ + cur >= 0 ? cur : nump - 1; \ + }) + // Start looking for "left"-side and "right"-side descending edges starting + // from the determined top point. + int next = NEXT_POINT(top); + int prev = PREV_POINT(top); + if (p[top].y == p[next].y) { + // If the next point is on the same row as the top, then advance one more + // time to the next point and use that as the "left" descending edge. + l0i = next; + l1i = NEXT_POINT(next); + // Assume top and prev form a descending "right" edge, as otherwise this + // will be a collapsed polygon and harmlessly bail out down below. + r0i = top; + r1i = prev; + } else if (p[top].y == p[prev].y) { + // If the prev point is on the same row as the top, then advance to the + // prev again and use that as the "right" descending edge. + // Assume top and next form a non-empty descending "left" edge. + l0i = top; + l1i = next; + r0i = prev; + r1i = PREV_POINT(prev); + } else { + // Both next and prev are on distinct rows from top, so both "left" and + // "right" edges are non-empty/descending. + l0i = r0i = top; + l1i = next; + r1i = prev; + } + // Load the points from the indices. + l0 = p[l0i]; // Start of left edge + r0 = p[r0i]; // End of left edge + l1 = p[l1i]; // Start of right edge + r1 = p[r1i]; // End of right edge + // debugf("l0: %d(%f,%f), r0: %d(%f,%f) -> l1: %d(%f,%f), r1: + // %d(%f,%f)\n", l0i, l0.x, l0.y, r0i, r0.x, r0.y, l1i, l1.x, l1.y, r1i, + // r1.x, r1.y); + } + + struct Edge + { + float yScale; + float xSlope; + float x; + Interpolants interpSlope; + Interpolants interp; + + Edge(float y, const Point2D& p0, const Point2D& p1, + const Interpolants& i0, const Interpolants& i1) : + // Inverse Y scale for slope calculations. Avoid divide on 0-length edge. + // Later checks below ensure that Y <= p1.y, or otherwise we don't use + // this edge. We just need to guard against Y == p1.y == p0.y. In that + // case, Y - p0.y == 0 and will cancel out the slopes below, except if + // yScale is Inf for some reason (or worse, NaN), which 1/(p1.y-p0.y) + // might produce if we don't bound it. 
+      yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+      // Calculate dX/dY slope
+      xSlope((p1.x - p0.x) * yScale),
+      // Initialize current X based on Y and slope
+      x(p0.x + (y - p0.y) * xSlope),
+      // Calculate change in interpolants per change in Y
+      interpSlope((i1 - i0) * yScale),
+      // Initialize current interpolants based on Y and slope
+      interp(i0 + (y - p0.y) * interpSlope)
+    {}
+
+    void nextRow() {
+      // step current X and interpolants to next row from slope
+      x += xSlope;
+      interp += interpSlope;
+    }
+  };
+
+  // Vertex selection above should result in equal left and right start rows
+  assert(l0.y == r0.y);
+  // Find the start y, clip to within the clip rect, and round to row center.
+  float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+  // Initialize left and right edges from end points and start Y
+  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+  // Get pointer to color buffer and depth buffer at current Y
+  P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P));
+  uint16_t* fdepth =
+      (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t));
+  // Loop along advancing Ys, rasterizing spans at each row
+  float checkY = min(min(l1.y, r1.y), clipRect.y1);
+  for (;;) {
+    // Check if we maybe passed edge ends or outside clip rect...
+    if (y > checkY) {
+      // If we're outside the clip rect, we're done.
+      if (y > clipRect.y1) break;
+      // Helper to find the next non-duplicate vertex that doesn't loop back.
+#define STEP_EDGE(e0i, e0, e1i, e1, STEP_POINT, end) \
+  for (;;) { \
+    /* Set new start of edge to be end of old edge */ \
+    e0i = e1i; \
+    e0 = e1; \
+    /* Set new end of edge to next point */ \
+    e1i = STEP_POINT(e1i); \
+    e1 = p[e1i]; \
+    /* If the edge is descending, use it. */ \
+    if (e1.y > e0.y) break; \
+    /* If the edge is ascending or crossed the end, we're done. */ \
+    if (e1.y < e0.y || e0i == end) return; \
+    /* Otherwise, it's a duplicate, so keep searching. */ \
+  }
+      // Check if Y advanced past the end of the left edge
+      if (y > l1.y) {
+        // Step to next left edge past Y and reset edge interpolants.
+        do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y);
+        left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+      }
+      // Check if Y advanced past the end of the right edge
+      if (y > r1.y) {
+        // Step to next right edge past Y and reset edge interpolants.
+        do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y);
+        right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+      }
+      // Reset check condition for next time around.
+      checkY = min(min(l1.y, r1.y), clipRect.y1);
+    }
+    // lx..rx form the bounds of the span. WR does not use backface culling,
+    // so we need to use min/max to support the span in either orientation.
+    // Clip the span to fall within the clip rect and then round to nearest
+    // column.
+    int startx = int(max(min(left.x, right.x), clipRect.x0) + 0.5f);
+    int endx = int(min(max(left.x, right.x), clipRect.x1) + 0.5f);
+    // Check if span is non-empty.
+    int span = endx - startx;
+    if (span > 0) {
+      ctx->shaded_rows++;
+      ctx->shaded_pixels += span;
+      // Advance color/depth buffer pointers to the start of the span.
+      P* buf = fbuf + startx;
+      // Check if we will need to use the depth buffer or discard on this span.
+      uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr;
+      bool use_discard = fragment_shader->use_discard();
+      if (depthtex.delay_clear) {
+        // Delayed clear is enabled for the depth buffer. Check if this row
+        // needs to be cleared.
+        int yi = int(y);
+        uint32_t& mask = depthtex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          // The depth buffer is uninitialized on this row, but we know it
+          // would otherwise be cleared entirely to the clear value. This lets
+          // us quickly check the constant Z value of the quad against the
+          // clear Z to know if the entire span passes or fails the depth test
+          // all at once.
+          switch (ctx->depthfunc) {
+            case GL_LESS:
+              if (int16_t(z) < int16_t(depthtex.clear_val))
+                break;
+              else
+                goto next_span;
+            case GL_LEQUAL:
+              if (int16_t(z) <= int16_t(depthtex.clear_val))
+                break;
+              else
+                goto next_span;
+          }
+          // If we got here, we passed the depth test.
+          if (ctx->depthmask) {
+            // Depth writes are enabled, so we need to initialize depth.
+            mask |= 1 << (yi & 31);
+            depthtex.delay_clear--;
+            if (use_discard) {
+              // If discard is enabled, we don't know what pixels may be
+              // written to, so we have to clear the entire row.
+              force_clear_row<uint16_t>(depthtex, yi);
+            } else {
+              // Otherwise, we only need to clear the pixels that fall outside
+              // the current span on this row.
+              if (startx > 0 || endx < depthtex.width) {
+                force_clear_row<uint16_t>(depthtex, yi, startx, endx);
+              }
+              // Fill in the span's Z values with constant Z.
+              clear_buffer<uint16_t>(depthtex, z, 0,
+                                     IntRect{startx, yi, endx, yi + 1});
+              // We already passed the depth test, so no need to test depth
+              // any more.
+              depth = nullptr;
+            }
+          } else {
+            // No depth writes, so don't clear anything, and no need to test.
+            depth = nullptr;
+          }
+        }
+      }
+      if (colortex.delay_clear) {
+        // Delayed clear is enabled for the color buffer. Check if needs clear.
+        int yi = int(y);
+        uint32_t& mask = colortex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          mask |= 1 << (yi & 31);
+          colortex.delay_clear--;
+          if (depth || blend_key || use_discard) {
+            // If depth test, blending, or discard is used, old color values
+            // might be sampled, so we need to clear the entire row to fill it.
+            force_clear_row<P>(colortex, yi);
+          } else if (startx > 0 || endx < colortex.width) {
+            // Otherwise, we only need to clear the row outside of the span.
+            // The fragment shader will fill the row within the span itself.
+            force_clear_row<P>(colortex, yi, startx, endx);
+          }
+        }
+      }
+      // Initialize fragment shader interpolants to current span position.
+      fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+      fragment_shader->gl_FragCoord.y = y;
+      {
+        // Change in interpolants is difference between current right and left
+        // edges per the change in right and left X.
+        Interpolants step =
+            (right.interp - left.interp) * (1.0f / (right.x - left.x));
+        // Advance current interpolants to X at start of span.
+        Interpolants o = left.interp + step * (startx + 0.5f - left.x);
+        fragment_shader->init_span(&o, &step, 4.0f);
+      }
+      if (!use_discard) {
+        // Fast paths for the case where fragment discard is not used.
+        if (depth) {
+          // If depth is used, we want to process spans in 8-pixel chunks to
+          // maximize sampling and testing 16-bit depth values within the
+          // 128-bit width of a SIMD register.
+          if (span >= 8) {
+            // Specializations for supported depth functions depending on
+            // whether depth writes are enabled.
+            if (ctx->depthfunc == GL_LEQUAL) {
+              if (ctx->depthmask)
+                draw_depth_span<GL_LEQUAL, true>(z, buf, depth, span);
+              else
+                draw_depth_span<GL_LEQUAL, false>(z, buf, depth, span);
+            } else {
+              if (ctx->depthmask)
+                draw_depth_span<GL_LESS, true>(z, buf, depth, span);
+              else
+                draw_depth_span<GL_LESS, false>(z, buf, depth, span);
+            }
+            // Advance buffers past processed chunks.
+            buf += span & ~7;
+            depth += span & ~7;
+            span &= 7;
+          }
+        } else {
+          // Check if the fragment shader has an optimized draw specialization.
+          if (span >= 4 && fragment_shader->has_draw_span(buf)) {
+            // Draw specialization expects 4-pixel chunks.
+            int len = span & ~3;
+            fragment_shader->draw_span(buf, len);
+            buf += len;
+            span &= 3;
+          }
+        }
+        draw_span<false, false>(buf, depth, span, [=]{ return z; });
+      } else {
+        // If discard is used, then use slower fallbacks. This should be rare.
+        // Just needs to work, doesn't need to be too fast yet...
+        draw_span<true, false>(buf, depth, span, [=]{ return z; });
+      }
+    }
+  next_span:
+    // Advance Y and edge interpolants to next row.
+    y++;
+    left.nextRow();
+    right.nextRow();
+    // Advance buffers to next row.
+    fbuf += colortex.stride(sizeof(P)) / sizeof(P);
+    fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t);
+  }
+}
+
+// Draw perspective-correct spans for a convex quad that has been clipped to
+// the near and far Z planes, possibly producing a clipped convex polygon with
+// more than 4 sides. This assumes the Z value will vary across the spans and
+// requires interpolants to factor in W values. This tends to be slower than
+// the simpler 2D draw_quad_spans above, especially since we can't optimize
+// the depth test easily when Z values vary, and should be used only rarely
+// if possible.
+template <typename P>
+static inline void draw_perspective_spans(int nump, Point3D* p,
+                                          Interpolants* interp_outs,
+                                          Texture& colortex, int layer,
+                                          Texture& depthtex,
+                                          const ClipRect& clipRect) {
+  Point3D l0, r0, l1, r1;
+  int l0i, r0i, l1i, r1i;
+  {
+    // Find the index of the top-most point (smallest Y) from which
+    // rasterization can start.
+    int top = 0;
+    for (int i = 1; i < nump; i++) {
+      if (p[i].y < p[top].y) {
+        top = i;
+      }
+    }
+    // Find left-most top point, the start of the left descending edge.
+    // Advance forward in the points array, searching at most nump points
+    // in case the polygon is flat.
+    l0i = top;
+    for (int i = top + 1; i < nump && p[i].y == p[top].y; i++) {
+      l0i = i;
+    }
+    if (l0i == nump - 1) {
+      for (int i = 0; i <= top && p[i].y == p[top].y; i++) {
+        l0i = i;
+      }
+    }
+    // Find right-most top point, the start of the right descending edge.
+    // Advance backward in the points array, searching at most nump points.
+    r0i = top;
+    for (int i = top - 1; i >= 0 && p[i].y == p[top].y; i--) {
+      r0i = i;
+    }
+    if (r0i == 0) {
+      for (int i = nump - 1; i >= top && p[i].y == p[top].y; i--) {
+        r0i = i;
+      }
+    }
+    // End of left edge is the next point after the left edge start.
+    l1i = NEXT_POINT(l0i);
+    // End of right edge is the previous point from the right edge start.
+    r1i = PREV_POINT(r0i);
+    l0 = p[l0i];  // Start of left edge
+    r0 = p[r0i];  // Start of right edge
+    l1 = p[l1i];  // End of left edge
+    r1 = p[r1i];  // End of right edge
+  }
+
+  struct Edge
+  {
+    float yScale;
+    // Current coordinates for edge. Whereas in the 2D case of draw_quad_spans
+    // it is enough to just track the X coordinate as we advance along the
+    // rows, for the perspective case we also need to keep track of Z and W.
+    // For simplicity, we just use the full 3D point to track all these
+    // coordinates.
+    Point3D pSlope;
+    Point3D p;
+    Interpolants interpSlope;
+    Interpolants interp;
+
+    Edge(float y, const Point3D& p0, const Point3D& p1,
+         const Interpolants& i0, const Interpolants& i1) :
+      // Inverse Y scale for slope calculations. Avoid divide on 0-length edge.
+      yScale(1.0f / max(p1.y - p0.y, 1.0f / 256)),
+      // Calculate dX/dY slope
+      pSlope((p1 - p0) * yScale),
+      // Initialize current coords based on Y and slope
+      p(p0 + (y - p0.y) * pSlope),
+      // Crucially, these interpolants must be scaled by the point's 1/w value,
+      // which allows linear interpolation in a perspective-correct manner.
+      // This will be canceled out inside the fragment shader later.
+      // Calculate change in interpolants per change in Y
+      interpSlope((i1 * p1.w - i0 * p0.w) * yScale),
+      // Initialize current interpolants based on Y and slope
+      interp(i0 * p0.w + (y - p0.y) * interpSlope)
+    {}
+
+    float x() const { return p.x; }
+    vec2_scalar zw() const { return {p.z, p.w}; }
+
+    void nextRow() {
+      // step current coords and interpolants to next row from slope
+      p += pSlope;
+      interp += interpSlope;
+    }
+  };
+
+  // Vertex selection above should result in equal left and right start rows
+  assert(l0.y == r0.y);
+  // Find the start y, clip to within the clip rect, and round to row center.
+  float y = floor(max(l0.y, clipRect.y0) + 0.5f) + 0.5f;
+  // Initialize left and right edges from end points and start Y
+  Edge left(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+  Edge right(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+  // Get pointer to color buffer and depth buffer at current Y
+  P* fbuf = (P*)colortex.sample_ptr(0, int(y), layer, sizeof(P));
+  uint16_t* fdepth =
+      (uint16_t*)depthtex.sample_ptr(0, int(y), 0, sizeof(uint16_t));
+  // Loop along advancing Ys, rasterizing spans at each row
+  float checkY = min(min(l1.y, r1.y), clipRect.y1);
+  for (;;) {
+    // Check if we maybe passed edge ends or outside clip rect...
+    if (y > checkY) {
+      // If we're outside the clip rect, we're done.
+      if (y > clipRect.y1) break;
+      // Check if Y advanced past the end of the left edge
+      if (y > l1.y) {
+        // Step to next left edge past Y and reset edge interpolants.
+        do { STEP_EDGE(l0i, l0, l1i, l1, NEXT_POINT, r1i); } while (y > l1.y);
+        left = Edge(y, l0, l1, interp_outs[l0i], interp_outs[l1i]);
+      }
+      // Check if Y advanced past the end of the right edge
+      if (y > r1.y) {
+        // Step to next right edge past Y and reset edge interpolants.
+        do { STEP_EDGE(r0i, r0, r1i, r1, PREV_POINT, l1i); } while (y > r1.y);
+        right = Edge(y, r0, r1, interp_outs[r0i], interp_outs[r1i]);
+      }
+      // Reset check condition for next time around.
+      checkY = min(min(l1.y, r1.y), clipRect.y1);
+    }
+    // lx..rx form the bounds of the span. WR does not use backface culling,
+    // so we need to use min/max to support the span in either orientation.
+    // Clip the span to fall within the clip rect and then round to nearest
+    // column.
+    int startx = int(max(min(left.x(), right.x()), clipRect.x0) + 0.5f);
+    int endx = int(min(max(left.x(), right.x()), clipRect.x1) + 0.5f);
+    // Check if span is non-empty.
+    int span = endx - startx;
+    if (span > 0) {
+      ctx->shaded_rows++;
+      ctx->shaded_pixels += span;
+      // Advance color/depth buffer pointers to the start of the span.
+      P* buf = fbuf + startx;
+      // Check if we will need to use the depth buffer or discard on this span.
+      uint16_t* depth = depthtex.buf != nullptr ? fdepth + startx : nullptr;
+      bool use_discard = fragment_shader->use_discard();
+      if (depthtex.delay_clear) {
+        // Delayed clear is enabled for the depth buffer. Check if this row
+        // needs to be cleared.
+        int yi = int(y);
+        uint32_t& mask = depthtex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          mask |= 1 << (yi & 31);
+          depthtex.delay_clear--;
+          // Since Z varies across the span, it's easier to just clear the
+          // row and rely on later depth testing. If necessary, this could be
+          // optimized to test against the start and end Z values of the span
+          // here.
+          force_clear_row<uint16_t>(depthtex, yi);
+        }
+      }
+      if (colortex.delay_clear) {
+        // Delayed clear is enabled for the color buffer. Check if needs clear.
+        int yi = int(y);
+        uint32_t& mask = colortex.cleared_rows[yi / 32];
+        if ((mask & (1 << (yi & 31))) == 0) {
+          mask |= 1 << (yi & 31);
+          colortex.delay_clear--;
+          if (depth || blend_key || use_discard) {
+            // If depth test, blending, or discard is used, old color values
+            // might be sampled, so we need to clear the entire row to fill it.
+            force_clear_row<P>(colortex, yi);
+          } else if (startx > 0 || endx < colortex.width) {
+            // Otherwise, we only need to clear the row outside of the span.
+            // The fragment shader will fill the row within the span itself.
+            force_clear_row<P>(colortex, yi, startx, endx);
+          }
+        }
+      }
+      // Initialize fragment shader interpolants to current span position.
+      fragment_shader->gl_FragCoord.x = init_interp(startx + 0.5f, 1);
+      fragment_shader->gl_FragCoord.y = y;
+      {
+        // Calculate the fragment Z and W change per change in fragment X step.
+        vec2_scalar stepZW =
+            (right.zw() - left.zw()) * (1.0f / (right.x() - left.x()));
+        // Calculate initial Z and W values for span start.
+        vec2_scalar zw = left.zw() + stepZW * (startx + 0.5f - left.x());
+        // Set fragment shader's Z and W values so that it can use them to
+        // cancel out the 1/w baked into the interpolants.
+        fragment_shader->gl_FragCoord.z = init_interp(zw.x, stepZW.x);
+        fragment_shader->gl_FragCoord.w = init_interp(zw.y, stepZW.y);
+        fragment_shader->stepZW = stepZW * 4.0f;
+        // Change in interpolants is difference between current right and left
+        // edges per the change in right and left X. The left and right
+        // interpolant values were previously multiplied by 1/w, so the step
+        // and initial span values take this into account.
+        Interpolants step =
+            (right.interp - left.interp) * (1.0f / (right.x() - left.x()));
+        // Advance current interpolants to X at start of span.
+        Interpolants o = left.interp + step * (startx + 0.5f - left.x());
+        fragment_shader->init_span<true>(&o, &step, 4.0f);
+      }
+      if (!use_discard) {
+        // No discard is used. Common case.
+        draw_span<false, true>(buf, depth, span, packDepth);
+      } else {
+        // Discard is used. Rare.
+        draw_span<true, true>(buf, depth, span, packDepth);
+      }
+    }
+    // Advance Y and edge interpolants to next row.
+    y++;
+    left.nextRow();
+    right.nextRow();
+    // Advance buffers to next row.
+    fbuf += colortex.stride(sizeof(P)) / sizeof(P);
+    fdepth += depthtex.stride(sizeof(uint16_t)) / sizeof(uint16_t);
+  }
+}
+
+// Clip a primitive against both sides of a view-frustum axis, producing
+// intermediate vertexes with interpolated attributes that will no longer
+// intersect the selected axis planes. This assumes the primitive is convex
+// and should produce at most N+2 vertexes for each invocation (only in the
+// worst case where one point falls outside on each of the opposite sides
+// with the rest of the points inside).
+template <XYZW AXIS>
+static int clip_side(int nump, Point3D* p, Interpolants* interp, Point3D* outP,
+                     Interpolants* outInterp) {
+  int numClip = 0;
+  Point3D prev = p[nump - 1];
+  Interpolants prevInterp = interp[nump - 1];
+  float prevCoord = prev.select(AXIS);
+  // Coordinate must satisfy -W <= C <= W. Determine if it is outside, and
+  // if so, remember which side it is outside of.
+  int prevSide = prevCoord < -prev.w ? -1 : (prevCoord > prev.w ? 1 : 0);
+  // Loop through points, finding edges that cross the planes by evaluating
+  // the side at each point.
+  for (int i = 0; i < nump; i++) {
+    Point3D cur = p[i];
+    Interpolants curInterp = interp[i];
+    float curCoord = cur.select(AXIS);
+    int curSide = curCoord < -cur.w ? -1 : (curCoord > cur.w ? 1 : 0);
+    // Check if the previous and current end points are on different sides.
+    if (curSide != prevSide) {
+      // One of the edge's end points is outside the plane with the other
+      // inside the plane. Find the offset where it crosses the plane and
+      // adjust the point and interpolants to there.
+      if (prevSide) {
+        // Edge that was previously outside crosses inside.
+        // Evaluate plane equation for previous and current end-point
+        // based on previous side and calculate relative offset.
+        assert(numClip < nump + 2);
+        float prevDist = prevCoord - prevSide * prev.w;
+        float curDist = curCoord - prevSide * cur.w;
+        float k = prevDist / (prevDist - curDist);
+        outP[numClip] = prev + (cur - prev) * k;
+        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+        numClip++;
+      }
+      if (curSide) {
+        // Edge that was previously inside crosses outside.
+        // Evaluate plane equation for previous and current end-point
+        // based on current side and calculate relative offset.
+        assert(numClip < nump + 2);
+        float prevDist = prevCoord - curSide * prev.w;
+        float curDist = curCoord - curSide * cur.w;
+        float k = prevDist / (prevDist - curDist);
+        outP[numClip] = prev + (cur - prev) * k;
+        outInterp[numClip] = prevInterp + (curInterp - prevInterp) * k;
+        numClip++;
+      }
+    }
+    if (!curSide) {
+      // The current end point is inside the plane, so output point unmodified.
+      assert(numClip < nump + 2);
+      outP[numClip] = cur;
+      outInterp[numClip] = curInterp;
+      numClip++;
+    }
+    prev = cur;
+    prevInterp = curInterp;
+    prevCoord = curCoord;
+    prevSide = curSide;
+  }
+  return numClip;
+}
+
+// Helper function to dispatch to perspective span drawing with points that
+// have already been transformed and clipped.
+static inline void draw_perspective_clipped(int nump, Point3D* p_clip,
+                                            Interpolants* interp_clip,
+                                            Texture& colortex, int layer,
+                                            Texture& depthtex) {
+  // If the polygon is outside the clip rect, nothing to draw.
+  ClipRect clipRect(colortex);
+  if (!clipRect.overlaps(nump, p_clip)) {
+    return;
+  }
+
+  // Finally draw perspective-correct spans for the polygon.
+  if (colortex.internal_format == GL_RGBA8) {
+    draw_perspective_spans<uint32_t>(nump, p_clip, interp_clip, colortex,
+                                     layer, depthtex, clipRect);
+  } else if (colortex.internal_format == GL_R8) {
+    draw_perspective_spans<uint8_t>(nump, p_clip, interp_clip, colortex,
+                                    layer, depthtex, clipRect);
+  } else {
+    assert(false);
+  }
+}
+
+// Draws a perspective-correct 3D primitive with varying Z value, as opposed
+// to a simple 2D planar primitive with a constant Z value that could be
+// trivially Z rejected. This requires clipping the primitive against the near
+// and far planes to ensure it stays within the valid Z-buffer range. The Z
The Z +// and W of each fragment of the primitives are interpolated across the +// generated spans and then depth-tested as appropriate. +// Additionally, vertex attributes must be interpolated with perspective- +// correction by dividing by W before interpolation, and then later multiplied +// by W again to produce the final correct attribute value for each fragment. +// This process is expensive and should be avoided if possible for primitive +// batches that are known ahead of time to not need perspective-correction. +static void draw_perspective(int nump, + Interpolants interp_outs[4], + Texture& colortex, int layer, + Texture& depthtex) { + // Convert output of vertex shader to screen space. + vec4 pos = vertex_shader->gl_Position; + vec3_scalar scale = + vec3_scalar(ctx->viewport.width(), ctx->viewport.height(), 1) * 0.5f; + vec3_scalar offset = + vec3_scalar(ctx->viewport.x0, ctx->viewport.y0, 0.0f) + scale; + if (test_none(pos.z <= -pos.w || pos.z >= pos.w)) { + // No points cross the near or far planes, so no clipping required. + // Just divide coords by W and convert to viewport. + Float w = 1.0f / pos.w; + vec3 screen = pos.sel(X, Y, Z) * w * scale + offset; + Point3D p[4] = { + {screen.x.x, screen.y.x, screen.z.x, w.x}, + {screen.x.y, screen.y.y, screen.z.y, w.y}, + {screen.x.z, screen.y.z, screen.z.z, w.z}, + {screen.x.w, screen.y.w, screen.z.w, w.w} + }; + draw_perspective_clipped(nump, p, interp_outs, colortex, layer, depthtex); + } else { + // Points cross the near or far planes, so we need to clip. + // Start with the original 3 or 4 points... + Point3D p[4] = { + {pos.x.x, pos.y.x, pos.z.x, pos.w.x}, + {pos.x.y, pos.y.y, pos.z.y, pos.w.y}, + {pos.x.z, pos.y.z, pos.z.z, pos.w.z}, + {pos.x.w, pos.y.w, pos.z.w, pos.w.w} + }; + // Clipping can expand the points by 1 for each of 6 view frustum planes. + Point3D p_clip[4 + 6]; + Interpolants interp_clip[4 + 6]; + // Clip against near and far Z planes. + nump = clip_side<Z>(nump, p, interp_outs, p_clip, interp_clip); + // If no points are left inside the view frustum, there's nothing to draw. + if (nump < 3) { + return; + } + // After clipping against only the near and far planes, we might still + // produce points where W = 0, exactly at the camera plane. OpenGL specifies + // that for clip coordinates, points must satisfy: + // -W <= X <= W + // -W <= Y <= W + // -W <= Z <= W + // When Z = W = 0, this is trivially satisfied, but when we transform and + // divide by W below it will produce a divide by 0. Usually we want to only + // clip Z to avoid the extra work of clipping X and Y. We can still project + // points that fall outside the view frustum X and Y so long as Z is valid. + // The span drawing code will then ensure X and Y are clamped to viewport + // boundaries. However, in the Z = W = 0 case, sometimes clipping X and Y, + // will push W further inside the view frustum so that it is no longer 0, + // allowing us to finally proceed to projecting the points to the screen. + for (int i = 0; i < nump; i++) { + // Found an invalid W, so need to clip against X and Y... + if (p_clip[i].w <= 0.0f) { + // Ping-pong p_clip -> p_tmp -> p_clip. + Point3D p_tmp[4 + 6]; + Interpolants interp_tmp[4 + 6]; + nump = clip_side<X>(nump, p_clip, interp_clip, p_tmp, interp_tmp); + if (nump < 3) return; + nump = clip_side<Y>(nump, p_tmp, interp_tmp, p_clip, interp_clip); + if (nump < 3) return; + // After clipping against X and Y planes, there's still points left + // to draw, so proceed to trying projection now... 
+static void draw_quad(int nump, Texture& colortex, int layer,
+                      Texture& depthtex) {
+  // Run vertex shader once for the primitive's vertices.
+  // Reserve space for 6 sets of interpolants, in case we need to clip against
+  // near and far planes in the perspective case.
+  Interpolants interp_outs[4];
+  vertex_shader->run_primitive((char*)interp_outs, sizeof(Interpolants));
+  vec4 pos = vertex_shader->gl_Position;
+  // Check if any vertex W is different from another. If so, use perspective.
+  if (test_any(pos.w != pos.w.x)) {
+    draw_perspective(nump, interp_outs, colortex, layer, depthtex);
+    return;
+  }
+
+  // Convert output of vertex shader to screen space.
+  // Divide coords by W and convert to viewport.
+  float w = 1.0f / pos.w.x;
+  vec2 screen =
+      (pos.sel(X, Y) * w + 1) * 0.5f *
+          vec2_scalar(ctx->viewport.width(), ctx->viewport.height()) +
+      vec2_scalar(ctx->viewport.x0, ctx->viewport.y0);
+  Point2D p[4] = {{screen.x.x, screen.y.x},
+                  {screen.x.y, screen.y.y},
+                  {screen.x.z, screen.y.z},
+                  {screen.x.w, screen.y.w}};
+
+  // If the quad is outside the clip rect, there is nothing to draw.
+  ClipRect clipRect(colortex);
+  if (!clipRect.overlaps(nump, p)) {
+    return;
+  }
+
+  // Since the quad is assumed 2D, Z is constant across the quad.
+  float screenZ = (pos.z.x * w + 1) * 0.5f;
+  if (screenZ < 0 || screenZ > 1) {
+    // Z values would cross the near or far plane, so just bail.
+    return;
+  }
+  // Since Z doesn't need to be interpolated, just set the fragment shader's
+  // Z and W values here, once and for all fragment shader invocations.
+  // SSE2 does not support unsigned comparison, so bias Z to be negative.
+  uint16_t z = uint16_t(0xFFFF * screenZ) - 0x8000;
+  fragment_shader->gl_FragCoord.z = screenZ;
+  fragment_shader->gl_FragCoord.w = w;
+
+  // Finally draw 2D spans for the quad. Currently only supports drawing to
+  // RGBA8 and R8 color buffers.
+  if (colortex.internal_format == GL_RGBA8) {
+    draw_quad_spans<uint32_t>(nump, p, z, interp_outs, colortex, layer,
+                              depthtex, clipRect);
+  } else if (colortex.internal_format == GL_R8) {
+    draw_quad_spans<uint8_t>(nump, p, z, interp_outs, colortex, layer,
+                             depthtex, clipRect);
+  } else {
+    assert(false);
+  }
+}
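The signed-bias depth trick used by draw_quad is worth spelling out. A minimal sketch (hypothetical helper, not SWGL API):

    #include <cstdint>

    // Map [0, 1] depth onto the full signed 16-bit range so that SSE2's
    // signed 16-bit compares order depths correctly despite the lack of an
    // unsigned 16-bit compare.
    int16_t bias_depth(float screenZ) {
      return int16_t(uint16_t(0xFFFF * screenZ) - 0x8000);
    }

bias_depth(0.0f) is -32768 and bias_depth(1.0f) is 32767; the mapping is monotonic, so signed comparisons of biased values agree with comparisons of the original depth values.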
 
 void VertexArray::validate() {
   int last_enabled = -1;
@@ -2653,32 +3581,78 @@ void VertexArray::validate() {
   max_attrib = last_enabled;
 }
 
+template <typename INDEX>
+static inline void draw_elements(GLsizei count, GLsizei instancecount,
+                                 Buffer& indices_buf, size_t offset,
+                                 VertexArray& v, Texture& colortex, int layer,
+                                 Texture& depthtex) {
+  assert((offset & (sizeof(INDEX) - 1)) == 0);
+  INDEX* indices = (INDEX*)(indices_buf.buf + offset);
+  count = min(count,
+              (GLsizei)((indices_buf.size - offset) / sizeof(INDEX)));
+  // Triangles must be indexed at offsets 0, 1, 2.
+  // Quads must be successive triangles indexed at offsets 0, 1, 2, 2, 1, 3.
+  if (count == 6 && indices[1] == indices[0] + 1 &&
+      indices[2] == indices[0] + 2 && indices[5] == indices[0] + 3) {
+    assert(indices[3] == indices[0] + 2 && indices[4] == indices[0] + 1);
+    // Fast path - since there is only a single quad, we only load per-vertex
+    // attribs once for all instances, as they won't change across instances
+    // or within an instance.
+    vertex_shader->load_attribs(v.attribs, indices[0], 0, 4);
+    draw_quad(4, colortex, layer, depthtex);
+    for (GLsizei instance = 1; instance < instancecount; instance++) {
+      vertex_shader->load_attribs(v.attribs, indices[0], instance, 0);
+      draw_quad(4, colortex, layer, depthtex);
+    }
+  } else {
+    for (GLsizei instance = 0; instance < instancecount; instance++) {
+      for (GLsizei i = 0; i + 3 <= count; i += 3) {
+        if (indices[i + 1] != indices[i] + 1 ||
+            indices[i + 2] != indices[i] + 2) {
+          continue;
+        }
+        int nump = 3;
+        if (i + 6 <= count && indices[i + 5] == indices[i] + 3) {
+          assert(indices[i + 3] == indices[i] + 2 &&
                 indices[i + 4] == indices[i] + 1);
+          nump = 4;
+          i += 3;
+        }
+        vertex_shader->load_attribs(v.attribs, indices[i], instance, nump);
+        draw_quad(nump, colortex, layer, depthtex);
+      }
+    }
+  }
+}
+
 extern "C" {
 
 void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
-                           GLintptr offset, GLsizei instancecount) {
-  if (offset < 0 || count <= 0 || instancecount <= 0 || !vertex_shader ||
-      !fragment_shader) {
+                           void* indicesptr, GLsizei instancecount) {
+  assert(mode == GL_TRIANGLES);
+  assert(type == GL_UNSIGNED_SHORT || type == GL_UNSIGNED_INT);
+  if (count <= 0 || instancecount <= 0) {
     return;
   }
-  Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER, true);
-  if (!fb.color_attachment) {
-    return;
-  }
+  Framebuffer& fb = *get_framebuffer(GL_DRAW_FRAMEBUFFER);
   Texture& colortex = ctx->textures[fb.color_attachment];
   if (!colortex.buf) {
     return;
   }
-  assert(!colortex.locked);
   assert(colortex.internal_format == GL_RGBA8 ||
          colortex.internal_format == GL_R8);
   Texture& depthtex = ctx->textures[ctx->depthtest ? fb.depth_attachment : 0];
   if (depthtex.buf) {
-    assert(depthtex.internal_format == GL_DEPTH_COMPONENT24);
+    assert(depthtex.internal_format == GL_DEPTH_COMPONENT16);
     assert(colortex.width == depthtex.width &&
            colortex.height == depthtex.height);
-    assert(colortex.offset == depthtex.offset);
+  }
+
+  Buffer& indices_buf = ctx->buffers[ctx->element_array_buffer_binding];
+  size_t offset = (size_t)indicesptr;
+  if (!indices_buf.buf || offset >= indices_buf.size) {
+    return;
   }
 
   // debugf("current_vertex_array %d\n", ctx->current_vertex_array);
@@ -2689,8 +3663,8 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
     v.validate();
   }
 
-#ifdef PRINT_TIMINGS
-  uint64_t start = get_time_value();
+#ifndef NDEBUG
+  // uint64_t start = get_time_value();
 #endif
 
   ctx->shaded_rows = 0;
@@ -2698,43 +3672,14 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
 
   vertex_shader->init_batch();
 
-  switch (type) {
-    case GL_UNSIGNED_SHORT:
-      assert(mode == GL_TRIANGLES);
-      draw_elements<uint16_t>(count, instancecount, offset, v, colortex,
-                              depthtex);
-      break;
-    case GL_UNSIGNED_INT:
-      assert(mode == GL_TRIANGLES);
-      draw_elements<uint32_t>(count, instancecount, offset, v, colortex,
-                              depthtex);
-      break;
-    case GL_NONE:
-      // Non-standard GL extension - if element type is GL_NONE, then we don't
-      // use any element buffer and behave as if DrawArrays was called instead.
-      for (GLsizei instance = 0; instance < instancecount; instance++) {
-        switch (mode) {
-          case GL_LINES:
-            for (GLsizei i = 0; i + 2 <= count; i += 2) {
-              vertex_shader->load_attribs(v.attribs, offset + i, instance, 2);
-              draw_quad(2, colortex, depthtex);
-            }
-            break;
-          case GL_TRIANGLES:
-            for (GLsizei i = 0; i + 3 <= count; i += 3) {
-              vertex_shader->load_attribs(v.attribs, offset + i, instance, 3);
-              draw_quad(3, colortex, depthtex);
-            }
-            break;
-          default:
-            assert(false);
-            break;
-        }
-      }
-      break;
-    default:
-      assert(false);
-      break;
+  if (type == GL_UNSIGNED_SHORT) {
+    draw_elements<uint16_t>(count, instancecount, indices_buf, offset, v,
+                            colortex, fb.layer, depthtex);
+  } else if (type == GL_UNSIGNED_INT) {
+    draw_elements<uint32_t>(count, instancecount, indices_buf, offset, v,
+                            colortex, fb.layer, depthtex);
+  } else {
+    assert(false);
   }
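The index layouts that draw_elements accepts can be captured in a small predicate. A sketch (hypothetical helper, not SWGL API) of the quad recognition in its fast path:

    // Two triangles (i, i+1, i+2) and (i+2, i+1, i+3) over four consecutive
    // vertices form the 0, 1, 2, 2, 1, 3 pattern that draw_elements treats
    // as a single quad.
    template <typename INDEX>
    bool is_quad_pattern(const INDEX* idx) {
      return idx[1] == idx[0] + 1 && idx[2] == idx[0] + 2 &&
             idx[3] == idx[0] + 2 && idx[4] == idx[0] + 1 &&
             idx[5] == idx[0] + 3;
    }

Triangles whose indices are not consecutive are skipped outright, which presumably suffices because SWGL only needs to service the index patterns WebRender itself emits.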
 
   if (ctx->samples_passed_query) {
@@ -2742,66 +3687,329 @@ void DrawElementsInstanced(GLenum mode, GLsizei count, GLenum type,
     q.value += ctx->shaded_pixels;
   }
 
-#ifdef PRINT_TIMINGS
-  uint64_t end = get_time_value();
-  printf(
-      "%7.3fms draw(%s, %d): %d pixels in %d rows (avg %f pixels/row, "
-      "%fns/pixel)\n",
-      double(end - start) / (1000. * 1000.),
-      ctx->programs[ctx->current_program].impl->get_name(), instancecount,
-      ctx->shaded_pixels, ctx->shaded_rows,
-      double(ctx->shaded_pixels) / ctx->shaded_rows,
-      double(end - start) / max(ctx->shaded_pixels, 1));
+#ifndef NDEBUG
+  // uint64_t end = get_time_value();
+  // debugf("draw(%d): %fms for %d pixels in %d rows (avg %f pixels/row, %f
+  // ns/pixel)\n", instancecount, double(end - start)/(1000.*1000.),
+  // ctx->shaded_pixels, ctx->shaded_rows,
+  // double(ctx->shaded_pixels)/ctx->shaded_rows, double(end -
+  // start)/max(ctx->shaded_pixels, 1));
 #endif
 }
 
-void Finish() {
-#ifdef PRINT_TIMINGS
-  printf("Finish\n");
-#endif
+}  // extern "C"
+
+template <typename P>
+static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
+                             int span) {
+  int frac = 0;
+  for (P* end = dst + span; dst < end; dst++) {
+    *dst = *src;
+    // Step source according to width ratio.
+    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
+      src++;
+    }
+  }
 }
 
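The nested frac loop in scale_row is an integer DDA: each destination pixel advances the accumulator by srcWidth, and the source pointer steps once for every dstWidth accumulated, spreading source steps evenly with no per-pixel divide. A standalone sketch with assumed example widths:

    #include <cstdio>

    int main() {
      // Same accumulator scheme as scale_row, printed for srcWidth = 5 and
      // dstWidth = 8; visits source indices 0, 0, 1, 1, 2, 3, 3, 4.
      int srcWidth = 5, dstWidth = 8, frac = 0, srcIndex = 0;
      for (int dst = 0; dst < dstWidth; dst++) {
        printf("dest %d <- src %d\n", dst, srcIndex);
        for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) srcIndex++;
      }
    }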
-void MakeCurrent(Context* c) {
-  if (ctx == c) {
+static void scale_blit(Texture& srctex, const IntRect& srcReq, int srcZ,
+                       Texture& dsttex, const IntRect& dstReq, int dstZ,
+                       bool invertY) {
+  // Cache scaling ratios
+  int srcWidth = srcReq.width();
+  int srcHeight = srcReq.height();
+  int dstWidth = dstReq.width();
+  int dstHeight = dstReq.height();
+  // Compute valid dest bounds
+  IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+  // Compute valid source bounds
+  // Scale source to dest, rounding inward to avoid sampling outside source
+  IntRect srcBounds =
+      srctex.sample_bounds(srcReq)
+          .scale(srcWidth, srcHeight, dstWidth, dstHeight, true);
+  // Limit dest sampling bounds to overlap source bounds
+  dstBounds.intersect(srcBounds);
+  // Check if sampling bounds are empty
+  if (dstBounds.is_empty()) {
     return;
   }
-  ctx = c;
-  setup_program(ctx ? ctx->current_program : 0);
+  // Compute final source bounds from clamped dest sampling bounds
+  srcBounds = IntRect(dstBounds)
+                  .scale(dstWidth, dstHeight, srcWidth, srcHeight);
+  // Calculate source and dest pointers from clamped offsets
+  int bpp = srctex.bpp();
+  int srcStride = srctex.stride(bpp);
+  int destStride = dsttex.stride(bpp);
+  char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY);
+  char* src = srctex.sample_ptr(srcReq, srcBounds, srcZ);
+  // Inverted Y must step downward along dest rows
+  if (invertY) {
+    destStride = -destStride;
+  }
+  int span = dstBounds.width();
+  int frac = 0;
+  for (int rows = dstBounds.height(); rows > 0; rows--) {
+    if (srcWidth == dstWidth) {
+      // No scaling, so just do a fast copy.
+      memcpy(dest, src, span * bpp);
+    } else {
+      // Do scaling with different source and dest widths.
+      switch (bpp) {
+        case 1:
+          scale_row((uint8_t*)dest, dstWidth, (uint8_t*)src, srcWidth, span);
+          break;
+        case 2:
+          scale_row((uint16_t*)dest, dstWidth, (uint16_t*)src, srcWidth, span);
+          break;
+        case 4:
+          scale_row((uint32_t*)dest, dstWidth, (uint32_t*)src, srcWidth, span);
+          break;
+        default:
+          assert(false);
+          break;
+      }
+    }
+    dest += destStride;
+    // Step source according to height ratio.
+    for (frac += srcHeight; frac >= dstHeight; frac -= dstHeight) {
+      src += srcStride;
+    }
+  }
+}
+
+static void linear_row(uint32_t* dest, int span, const vec2_scalar& srcUV,
+                       float srcDU, int srcZOffset, sampler2DArray sampler) {
+  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+  for (; span >= 4; span -= 4) {
+    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+    unaligned_store(dest, srcpx);
+    dest += 4;
+    uv.x += 4 * srcDU;
+  }
+  if (span > 0) {
+    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv), srcZOffset);
+    auto mask = span_mask_RGBA8(span);
+    auto dstpx = unaligned_load<PackedRGBA8>(dest);
+    unaligned_store(dest, (mask & dstpx) | (~mask & srcpx));
+  }
 }
 
-Context* CreateContext() { return new Context; }
+static void linear_row(uint8_t* dest, int span, const vec2_scalar& srcUV,
+                       float srcDU, int srcZOffset, sampler2DArray sampler) {
+  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
+  for (; span >= 4; span -= 4) {
+    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+    unaligned_store(dest, pack(srcpx));
+    dest += 4;
+    uv.x += 4 * srcDU;
+  }
+  if (span > 0) {
+    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv), srcZOffset);
+    auto mask = span_mask_R8(span);
+    auto dstpx = unpack(unaligned_load<PackedR8>(dest));
+    unaligned_store(dest, pack((mask & dstpx) | (~mask & srcpx)));
+  }
+}
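Both linear_row overloads finish a span with a masked chunk: a tail of fewer than four pixels still samples four, then selects between new and existing destination values lane by lane. A scalar sketch of that select, assuming span_mask_* marks the lanes at or beyond the span:

    #include <cstdint>

    // Mirrors (mask & dstpx) | (~mask & srcpx): lanes below `span` take the
    // freshly sampled pixel, the remaining lanes are rewritten with their
    // existing contents, leaving them unchanged.
    void masked_tail_store(uint32_t* dest, const uint32_t src[4], int span) {
      for (int i = 0; i < 4; i++) {
        if (i < span) dest[i] = src[i];
      }
    }

The SIMD version still stores four pixels, but the masked lanes carry their prior destination values, so nothing beyond the span is altered.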
 
-void ReferenceContext(Context* c) {
-  if (!c) {
+static void linear_blit(Texture& srctex, const IntRect& srcReq, int srcZ,
+                        Texture& dsttex, const IntRect& dstReq, int dstZ,
+                        bool invertY) {
+  assert(srctex.internal_format == GL_RGBA8 ||
+         srctex.internal_format == GL_R8);
+  // Compute valid dest bounds
+  IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
+  // Check if sampling bounds are empty
+  if (dstBounds.is_empty()) {
     return;
   }
-  ++c->references;
+  // Initialize sampler for source texture
+  sampler2DArray_impl sampler;
+  init_sampler(&sampler, srctex);
+  init_depth(&sampler, srctex);
+  sampler.filter = TextureFilter::LINEAR;
+  // Compute source UVs
+  int srcZOffset = srcZ * sampler.height_stride;
+  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
+  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
+                     float(srcReq.height()) / dstReq.height());
+  // Skip to clamped source start
+  srcUV += srcDUV * vec2_scalar(dstBounds.x0, dstBounds.y0);
+  // Offset source UVs to texel centers and scale by lerp precision
+  srcUV = linearQuantize(srcUV + 0.5f, 128);
+  srcDUV *= 128.0f;
+  // Calculate dest pointer from clamped offsets
+  int bpp = dsttex.bpp();
+  int destStride = dsttex.stride(bpp);
+  char* dest = dsttex.sample_ptr(dstReq, dstBounds, dstZ, invertY);
+  // Inverted Y must step downward along dest rows
+  if (invertY) {
+    destStride = -destStride;
+  }
+  int span = dstBounds.width();
+  for (int rows = dstBounds.height(); rows > 0; rows--) {
+    switch (bpp) {
+      case 1:
+        linear_row((uint8_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+                   &sampler);
+        break;
+      case 4:
+        linear_row((uint32_t*)dest, span, srcUV, srcDUV.x, srcZOffset,
+                   &sampler);
+        break;
+      default:
+        assert(false);
+        break;
+    }
+    dest += destStride;
+    srcUV.y += srcDUV.y;
+  }
 }
 
-void DestroyContext(Context* c) {
-  if (!c) {
+extern "C" {
+
+void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                     GLbitfield mask, GLenum filter) {
+  assert(mask == GL_COLOR_BUFFER_BIT);
+  Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
+  if (!srcfb || srcfb->layer < 0) return;
+  Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
+  if (!dstfb || dstfb->layer < 0) return;
+  Texture& srctex = ctx->textures[srcfb->color_attachment];
+  if (!srctex.buf || srcfb->layer >= max(srctex.depth, 1)) return;
+  Texture& dsttex = ctx->textures[dstfb->color_attachment];
+  if (!dsttex.buf || dstfb->layer >= max(dsttex.depth, 1)) return;
+  if (srctex.internal_format != dsttex.internal_format) {
+    assert(false);
     return;
   }
-  assert(c->references > 0);
-  --c->references;
-  if (c->references > 0) {
+  // Force flipped Y onto dest coordinates
+  if (srcY1 < srcY0) {
+    swap(srcY0, srcY1);
+    swap(dstY0, dstY1);
+  }
+  bool invertY = dstY1 < dstY0;
+  if (invertY) {
+    swap(dstY0, dstY1);
+  }
+  IntRect srcReq = {srcX0, srcY0, srcX1, srcY1};
+  IntRect dstReq = {dstX0, dstY0, dstX1, dstY1};
+  if (srcReq.is_empty() || dstReq.is_empty()) {
     return;
   }
-  if (ctx == c) {
-    MakeCurrent(nullptr);
+  prepare_texture(srctex);
+  prepare_texture(dsttex, &dstReq);
+  if (!srcReq.same_size(dstReq) && filter == GL_LINEAR &&
+      (srctex.internal_format == GL_RGBA8 ||
+       srctex.internal_format == GL_R8)) {
+    linear_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+                invertY);
+  } else {
+    scale_blit(srctex, srcReq, srcfb->layer, dsttex, dstReq, dstfb->layer,
+               invertY);
   }
-  delete c;
 }
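BlitFramebuffer normalizes flips before blitting: any inversion on the source range is mirrored onto the dest range, so only the dest can end up inverted, and a single invertY flag plus a negated row stride handles it. A sketch of that normalization (hypothetical standalone helper):

    #include <utility>

    // Returns true if dest rows must be walked bottom-up.
    bool normalize_blit_y(int& srcY0, int& srcY1, int& dstY0, int& dstY1) {
      if (srcY1 < srcY0) {  // force any source flip onto the dest range
        std::swap(srcY0, srcY1);
        std::swap(dstY0, dstY1);
      }
      bool invertY = dstY1 < dstY0;
      if (invertY) std::swap(dstY0, dstY1);
      return invertY;
    }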
 
-size_t ReportMemory(size_t (*size_of_op)(void*)) {
-  size_t size = 0;
+void Finish() {}
+
+void MakeCurrent(void* ctx_ptr) {
+  ctx = (Context*)ctx_ptr;
   if (ctx) {
-    for (auto& t : ctx->textures) {
-      if (t && t->should_free()) {
-        size += size_of_op(t->buf);
+    setup_program(ctx->current_program);
+    blend_key = ctx->blend ? ctx->blend_key : BLEND_KEY_NONE;
+  } else {
+    setup_program(0);
+    blend_key = BLEND_KEY_NONE;
+  }
+}
+
+void* CreateContext() { return new Context; }
+
+void DestroyContext(void* ctx_ptr) {
+  if (!ctx_ptr) {
+    return;
+  }
+  if (ctx == ctx_ptr) {
+    MakeCurrent(nullptr);
+  }
+  delete (Context*)ctx_ptr;
+}
+
+void Composite(GLuint srcId, GLint srcX, GLint srcY, GLsizei srcWidth,
+               GLsizei srcHeight, GLint dstX, GLint dstY, GLboolean opaque,
+               GLboolean flip) {
+  Framebuffer& fb = ctx->framebuffers[0];
+  if (!fb.color_attachment) {
+    return;
+  }
+  Texture& srctex = ctx->textures[srcId];
+  if (!srctex.buf) return;
+  prepare_texture(srctex);
+  Texture& dsttex = ctx->textures[fb.color_attachment];
+  if (!dsttex.buf) return;
+  assert(srctex.bpp() == 4);
+  const int bpp = 4;
+  size_t src_stride = srctex.stride(bpp);
+  size_t dest_stride = dsttex.stride(bpp);
+  if (srcY < 0) {
+    dstY -= srcY;
+    srcHeight += srcY;
+    srcY = 0;
+  }
+  if (dstY < 0) {
+    srcY -= dstY;
+    srcHeight += dstY;
+    dstY = 0;
+  }
+  if (srcY + srcHeight > srctex.height) {
+    srcHeight = srctex.height - srcY;
+  }
+  if (dstY + srcHeight > dsttex.height) {
+    srcHeight = dsttex.height - dstY;
+  }
+  IntRect skip = {dstX, dstY, dstX + srcWidth, dstY + srcHeight};
+  prepare_texture(dsttex, &skip);
+  char* dest = dsttex.sample_ptr(dstX, flip ? dsttex.height - 1 - dstY : dstY,
+                                 fb.layer, bpp, dest_stride);
+  char* src = srctex.sample_ptr(srcX, srcY, 0, bpp, src_stride);
+  if (flip) {
+    dest_stride = -dest_stride;
+  }
+  if (opaque) {
+    for (int y = 0; y < srcHeight; y++) {
+      memcpy(dest, src, srcWidth * bpp);
+      dest += dest_stride;
+      src += src_stride;
+    }
+  } else {
+    for (int y = 0; y < srcHeight; y++) {
+      char* end = src + srcWidth * bpp;
+      while (src + 4 * bpp <= end) {
+        WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+        WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+        PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
+        unaligned_store(dest, r);
+        src += 4 * bpp;
+        dest += 4 * bpp;
+      }
+      if (src < end) {
+        WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
+        WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
+        U32 r = bit_cast<U32>(
+            pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx))));
+        unaligned_store(dest, r.x);
+        if (src + bpp < end) {
+          unaligned_store(dest + bpp, r.y);
+          if (src + 2 * bpp < end) {
+            unaligned_store(dest + 2 * bpp, r.z);
+          }
+        }
+        dest += end - src;
+        src = end;
+      }
+      dest += dest_stride - srcWidth * bpp;
+      src += src_stride - srcWidth * bpp;
+    }
+  }
-  }
-  return size;
 }
+
 }  // extern "C"
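The non-opaque path of Composite is a source-over blend for premultiplied alpha: dst' = src + dst - dst * src.a / 255. A scalar sketch (the rounding of SWGL's SIMD muldiv255 may differ in detail; this uses one common exact-rounding variant):

    #include <cstdint>

    // Exactly rounded (x * y) / 255 for bytes.
    uint8_t muldiv255_scalar(uint8_t x, uint8_t y) {
      int t = x * y + 128;
      return uint8_t((t + (t >> 8)) >> 8);
    }

    // Source-over for one premultiplied RGBA pixel, matching
    // srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)) above.
    void blend_over(const uint8_t src[4], uint8_t dst[4]) {
      uint8_t a = src[3];
      for (int i = 0; i < 4; i++) {
        dst[i] = uint8_t(src[i] + dst[i] - muldiv255_scalar(dst[i], a));
      }
    }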