Diffstat (limited to 'third_party/webrender/swgl/src/composite.h')
-rw-r--r-- | third_party/webrender/swgl/src/composite.h | 1069
1 file changed, 0 insertions, 1069 deletions
diff --git a/third_party/webrender/swgl/src/composite.h b/third_party/webrender/swgl/src/composite.h
deleted file mode 100644
index f88de485fdd..00000000000
--- a/third_party/webrender/swgl/src/composite.h
+++ /dev/null
@@ -1,1069 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-template <bool COMPOSITE, typename P>
-static inline void copy_row(P* dst, const P* src, int span) {
-  // No scaling, so just do a fast copy.
-  memcpy(dst, src, span * sizeof(P));
-}
-
-template <>
-void copy_row<true, uint32_t>(uint32_t* dst, const uint32_t* src, int span) {
-  // No scaling, so just do a fast composite.
-  auto* end = dst + span;
-  while (dst + 4 <= end) {
-    WideRGBA8 srcpx = unpack(unaligned_load<PackedRGBA8>(src));
-    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
-    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-    unaligned_store(dst, r);
-    src += 4;
-    dst += 4;
-  }
-  if (dst < end) {
-    WideRGBA8 srcpx = unpack(partial_load_span<PackedRGBA8>(src, end - dst));
-    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
-    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-    partial_store_span(dst, r, end - dst);
-  }
-}
-
-template <bool COMPOSITE, typename P>
-static inline void scale_row(P* dst, int dstWidth, const P* src, int srcWidth,
-                             int span, int frac) {
-  // Do scaling with different source and dest widths.
-  for (P* end = dst + span; dst < end; dst++) {
-    *dst = *src;
-    // Step source according to width ratio.
-    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-      src++;
-    }
-  }
-}
-
-template <>
-void scale_row<true, uint32_t>(uint32_t* dst, int dstWidth,
-                               const uint32_t* src, int srcWidth, int span,
-                               int frac) {
-  // Do scaling with different source and dest widths.
-  // Gather source pixels four at a time for better packing.
-  auto* end = dst + span;
-  for (; dst + 4 <= end; dst += 4) {
-    U32 srcn;
-    srcn.x = *src;
-    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-      src++;
-    }
-    srcn.y = *src;
-    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-      src++;
-    }
-    srcn.z = *src;
-    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-      src++;
-    }
-    srcn.w = *src;
-    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-      src++;
-    }
-    WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn));
-    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dst));
-    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-    unaligned_store(dst, r);
-  }
-  if (dst < end) {
-    // Process any remaining pixels. Try to gather as many pixels as possible
-    // into a single source chunk for compositing.
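
// The rows above composite with the source-over operator on premultiplied
// alpha: dst' = src + dst - dst * srcAlpha / 255, applied per channel, so no
// clamp is needed for valid premultiplied input. A minimal scalar sketch of
// that arithmetic, assuming the alpha channel sits in the top byte of each
// 32-bit pixel; muldiv255_scalar is a hypothetical stand-in for the SIMD
// muldiv255 helper.
static inline uint8_t muldiv255_scalar(uint32_t a, uint32_t b) {
  uint32_t p = a * b + 128;             // round-to-nearest division by 255
  return uint8_t((p + (p >> 8)) >> 8);
}

static inline uint32_t over_premul_scalar(uint32_t srcPx, uint32_t dstPx) {
  uint32_t srcA = srcPx >> 24;
  uint32_t out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    uint32_t s = (srcPx >> shift) & 0xFF;
    uint32_t d = (dstPx >> shift) & 0xFF;
    out |= (s + d - muldiv255_scalar(d, srcA)) << shift;
  }
  return out;
}
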
-    U32 srcn = {*src, 0, 0, 0};
-    if (end - dst > 1) {
-      for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-        src++;
-      }
-      srcn.y = *src;
-      if (end - dst > 2) {
-        for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) {
-          src++;
-        }
-        srcn.z = *src;
-      }
-    }
-    WideRGBA8 srcpx = unpack(bit_cast<PackedRGBA8>(srcn));
-    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dst, end - dst));
-    auto r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-    partial_store_span(dst, r, end - dst);
-  }
-}
-
-template <bool COMPOSITE = false>
-static NO_INLINE void scale_blit(Texture& srctex, const IntRect& srcReq,
-                                 Texture& dsttex, const IntRect& dstReq,
-                                 bool invertY, const IntRect& clipRect) {
-  assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 &&
-                        dsttex.internal_format == GL_RGBA8));
-  // Cache scaling ratios
-  int srcWidth = srcReq.width();
-  int srcHeight = srcReq.height();
-  int dstWidth = dstReq.width();
-  int dstHeight = dstReq.height();
-  // Compute valid dest bounds
-  IntRect dstBounds = dsttex.sample_bounds(dstReq).intersect(clipRect);
-  // Compute valid source bounds
-  IntRect srcBounds = srctex.sample_bounds(srcReq, invertY);
-  // If srcReq is outside the source texture, we need to clip the sampling
-  // bounds so that we never sample outside valid source bounds. Get texture
-  // bounds relative to srcReq and scale to dest-space rounding inward, using
-  // this rect to limit the dest bounds further.
-  IntRect srcClip = srctex.bounds() - srcReq.origin();
-  if (invertY) {
-    srcClip.invert_y(srcReq.height());
-  }
-  srcClip.scale(srcWidth, srcHeight, dstWidth, dstHeight, true);
-  dstBounds.intersect(srcClip);
-  // Check if clipped sampling bounds are empty
-  if (dstBounds.is_empty()) {
-    return;
-  }
-
-  // Calculate source and dest pointers from clamped offsets
-  int bpp = srctex.bpp();
-  int srcStride = srctex.stride();
-  int destStride = dsttex.stride();
-  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
-  // Clip the source bounds by the destination offset.
-  int fracX = srcWidth * dstBounds.x0;
-  int fracY = srcHeight * dstBounds.y0;
-  srcBounds.x0 = max(fracX / dstWidth, srcBounds.x0);
-  srcBounds.y0 = max(fracY / dstHeight, srcBounds.y0);
-  fracX %= dstWidth;
-  fracY %= dstHeight;
-  char* src = srctex.sample_ptr(srcReq, srcBounds, invertY);
-  // Inverted Y must step downward along source rows
-  if (invertY) {
-    srcStride = -srcStride;
-  }
-  int span = dstBounds.width();
-  for (int rows = dstBounds.height(); rows > 0; rows--) {
-    switch (bpp) {
-      case 1:
-        if (srcWidth == dstWidth)
-          copy_row<COMPOSITE>((uint8_t*)dest, (uint8_t*)src, span);
-        else
-          scale_row<COMPOSITE>((uint8_t*)dest, dstWidth, (uint8_t*)src,
-                               srcWidth, span, fracX);
-        break;
-      case 2:
-        if (srcWidth == dstWidth)
-          copy_row<COMPOSITE>((uint16_t*)dest, (uint16_t*)src, span);
-        else
-          scale_row<COMPOSITE>((uint16_t*)dest, dstWidth, (uint16_t*)src,
-                               srcWidth, span, fracX);
-        break;
-      case 4:
-        if (srcWidth == dstWidth)
-          copy_row<COMPOSITE>((uint32_t*)dest, (uint32_t*)src, span);
-        else
-          scale_row<COMPOSITE>((uint32_t*)dest, dstWidth, (uint32_t*)src,
-                               srcWidth, span, fracX);
-        break;
-      default:
-        assert(false);
-        break;
-    }
-    dest += destStride;
-    // Step source according to height ratio.
-    for (fracY += srcHeight; fracY >= dstHeight; fracY -= dstHeight) {
-      src += srcStride;
-    }
-  }
-}
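
// scale_blit and scale_row step the source with an integer DDA: the source
// index advances srcWidth/dstWidth texels per dest pixel, with the ratio
// carried in an integer remainder so no per-pixel divides or floats are
// needed. A scalar model of one row (a sketch; the real rows also handle
// compositing and a nonzero starting remainder):
static inline void nearest_scale_row_model(const uint32_t* src, int srcWidth,
                                           uint32_t* dst, int dstWidth) {
  for (int frac = 0, x = 0; x < dstWidth; x++) {
    dst[x] = *src;
    // Accumulate the width ratio and carry whole source pixels, as above.
    for (frac += srcWidth; frac >= dstWidth; frac -= dstWidth) src++;
  }
}
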
-
-template <bool COMPOSITE>
-static void linear_row_blit(uint32_t* dest, int span, const vec2_scalar& srcUV,
-                            float srcDU, sampler2D sampler) {
-  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
-  for (; span >= 4; span -= 4) {
-    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv));
-    unaligned_store(dest, srcpx);
-    dest += 4;
-    uv.x += 4 * srcDU;
-  }
-  if (span > 0) {
-    auto srcpx = textureLinearPackedRGBA8(sampler, ivec2(uv));
-    partial_store_span(dest, srcpx, span);
-  }
-}
-
-template <>
-void linear_row_blit<true>(uint32_t* dest, int span, const vec2_scalar& srcUV,
-                           float srcDU, sampler2D sampler) {
-  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
-  for (; span >= 4; span -= 4) {
-    WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv));
-    WideRGBA8 dstpx = unpack(unaligned_load<PackedRGBA8>(dest));
-    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-    unaligned_store(dest, r);
-
-    dest += 4;
-    uv.x += 4 * srcDU;
-  }
-  if (span > 0) {
-    WideRGBA8 srcpx = textureLinearUnpackedRGBA8(sampler, ivec2(uv));
-    WideRGBA8 dstpx = unpack(partial_load_span<PackedRGBA8>(dest, span));
-    PackedRGBA8 r = pack(srcpx + dstpx - muldiv255(dstpx, alphas(srcpx)));
-    partial_store_span(dest, r, span);
-  }
-}
-
-template <bool COMPOSITE>
-static void linear_row_blit(uint8_t* dest, int span, const vec2_scalar& srcUV,
-                            float srcDU, sampler2D sampler) {
-  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
-  for (; span >= 4; span -= 4) {
-    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv));
-    unaligned_store(dest, srcpx);
-    dest += 4;
-    uv.x += 4 * srcDU;
-  }
-  if (span > 0) {
-    auto srcpx = textureLinearPackedR8(sampler, ivec2(uv));
-    partial_store_span(dest, srcpx, span);
-  }
-}
-
-template <bool COMPOSITE>
-static void linear_row_blit(uint16_t* dest, int span, const vec2_scalar& srcUV,
-                            float srcDU, sampler2D sampler) {
-  vec2 uv = init_interp(srcUV, vec2_scalar(srcDU, 0.0f));
-  for (; span >= 4; span -= 4) {
-    auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv));
-    unaligned_store(dest, srcpx);
-    dest += 4;
-    uv.x += 4 * srcDU;
-  }
-  if (span > 0) {
-    auto srcpx = textureLinearPackedRG8(sampler, ivec2(uv));
-    partial_store_span(dest, srcpx, span);
-  }
-}
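
// linear_blit below quantizes UVs onto a 7-bit lerp grid (128 steps per
// texel). A sketch of how a quantized coordinate splits, assuming
// linearQuantize(v, 128) maps v onto that grid:
static inline void lerp_grid_parts(float u, int& texel, int& frac) {
  int q = int(u * 128.0f);  // linearQuantize also handles rounding/offset
  texel = q >> 7;           // integer texel index
  frac = q & 0x7F;          // 0..127 weight toward the next texel
}
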
-
-template <bool COMPOSITE = false>
-static NO_INLINE void linear_blit(Texture& srctex, const IntRect& srcReq,
-                                  Texture& dsttex, const IntRect& dstReq,
-                                  bool invertY, const IntRect& clipRect) {
-  assert(srctex.internal_format == GL_RGBA8 ||
-         srctex.internal_format == GL_R8 || srctex.internal_format == GL_RG8);
-  assert(!COMPOSITE || (srctex.internal_format == GL_RGBA8 &&
-                        dsttex.internal_format == GL_RGBA8));
-  // Compute valid dest bounds
-  IntRect dstBounds = dsttex.sample_bounds(dstReq);
-  dstBounds.intersect(clipRect);
-  // Check if sampling bounds are empty
-  if (dstBounds.is_empty()) {
-    return;
-  }
-  // Initialize sampler for source texture
-  sampler2D_impl sampler;
-  init_sampler(&sampler, srctex);
-  sampler.filter = TextureFilter::LINEAR;
-  // Compute source UVs
-  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
-  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
-                     float(srcReq.height()) / dstReq.height());
-  // Inverted Y must step downward along source rows
-  if (invertY) {
-    srcUV.y += srcReq.height();
-    srcDUV.y = -srcDUV.y;
-  }
-  // Skip to clamped source start
-  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
-  // Scale UVs by lerp precision
-  srcUV = linearQuantize(srcUV, 128);
-  srcDUV *= 128.0f;
-  // Calculate dest pointer from clamped offsets
-  int bpp = dsttex.bpp();
-  int destStride = dsttex.stride();
-  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
-  int span = dstBounds.width();
-  for (int rows = dstBounds.height(); rows > 0; rows--) {
-    switch (bpp) {
-      case 1:
-        linear_row_blit<COMPOSITE>((uint8_t*)dest, span, srcUV, srcDUV.x,
-                                   &sampler);
-        break;
-      case 2:
-        linear_row_blit<COMPOSITE>((uint16_t*)dest, span, srcUV, srcDUV.x,
-                                   &sampler);
-        break;
-      case 4:
-        linear_row_blit<COMPOSITE>((uint32_t*)dest, span, srcUV, srcDUV.x,
-                                   &sampler);
-        break;
-      default:
-        assert(false);
-        break;
-    }
-    dest += destStride;
-    srcUV.y += srcDUV.y;
-  }
-}
-
-extern "C" {
-
-void BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                     GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                     GLbitfield mask, GLenum filter) {
-  assert(mask == GL_COLOR_BUFFER_BIT);
-  Framebuffer* srcfb = get_framebuffer(GL_READ_FRAMEBUFFER);
-  if (!srcfb) return;
-  Framebuffer* dstfb = get_framebuffer(GL_DRAW_FRAMEBUFFER);
-  if (!dstfb) return;
-  Texture& srctex = ctx->textures[srcfb->color_attachment];
-  if (!srctex.buf) return;
-  Texture& dsttex = ctx->textures[dstfb->color_attachment];
-  if (!dsttex.buf) return;
-  assert(!dsttex.locked);
-  if (srctex.internal_format != dsttex.internal_format) {
-    assert(false);
-    return;
-  }
-  // Force flipped Y onto dest coordinates
-  if (srcY1 < srcY0) {
-    swap(srcY0, srcY1);
-    swap(dstY0, dstY1);
-  }
-  bool invertY = dstY1 < dstY0;
-  if (invertY) {
-    swap(dstY0, dstY1);
-  }
-  IntRect srcReq = IntRect{srcX0, srcY0, srcX1, srcY1} - srctex.offset;
-  IntRect dstReq = IntRect{dstX0, dstY0, dstX1, dstY1} - dsttex.offset;
-  if (srcReq.is_empty() || dstReq.is_empty()) {
-    return;
-  }
-  IntRect clipRect = {0, 0, dstReq.width(), dstReq.height()};
-  prepare_texture(srctex);
-  prepare_texture(dsttex, &dstReq);
-  if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR &&
-      (srctex.internal_format == GL_RGBA8 || srctex.internal_format == GL_R8 ||
-       srctex.internal_format == GL_RG8)) {
-    linear_blit(srctex, srcReq, dsttex, dstReq, invertY, dstReq);
-  } else {
-    scale_blit(srctex, srcReq, dsttex, dstReq, invertY, clipRect);
-  }
-}
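
// A hypothetical client call for the entry point above: downscale a 256x256
// read-framebuffer color attachment into a 128x128 region of the draw
// framebuffer with linear filtering.
static inline void example_downscale_blit() {
  BlitFramebuffer(/*src*/ 0, 0, 256, 256, /*dst*/ 0, 0, 128, 128,
                  GL_COLOR_BUFFER_BIT, GL_LINEAR);
}
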
-
-typedef Texture LockedTexture;
-
-// Lock the given texture to prevent modification.
-LockedTexture* LockTexture(GLuint texId) {
-  Texture& tex = ctx->textures[texId];
-  if (!tex.buf) {
-    assert(tex.buf != nullptr);
-    return nullptr;
-  }
-  if (__sync_fetch_and_add(&tex.locked, 1) == 0) {
-    // If this is the first time locking the texture, flush any delayed clears.
-    prepare_texture(tex);
-  }
-  return (LockedTexture*)&tex;
-}
-
-// Lock the given framebuffer's color attachment to prevent modification.
-LockedTexture* LockFramebuffer(GLuint fboId) {
-  Framebuffer& fb = ctx->framebuffers[fboId];
-  // Only allow locking a framebuffer if it has a valid color attachment.
-  if (!fb.color_attachment) {
-    assert(fb.color_attachment != 0);
-    return nullptr;
-  }
-  return LockTexture(fb.color_attachment);
-}
-
-// Reference an already locked resource
-void LockResource(LockedTexture* resource) {
-  if (!resource) {
-    return;
-  }
-  __sync_fetch_and_add(&resource->locked, 1);
-}
-
-// Remove a lock on a texture that has been previously locked
-void UnlockResource(LockedTexture* resource) {
-  if (!resource) {
-    return;
-  }
-  if (__sync_fetch_and_add(&resource->locked, -1) <= 0) {
-    // The lock should always be non-zero before unlocking.
-    assert(0);
-  }
-}
-
-// Get the underlying buffer for a locked resource
-void* GetResourceBuffer(LockedTexture* resource, int32_t* width,
-                        int32_t* height, int32_t* stride) {
-  *width = resource->width;
-  *height = resource->height;
-  *stride = resource->stride();
-  return resource->buf;
-}
-
-// Extension for optimized compositing of textures or framebuffers that may be
-// safely used across threads. The source and destination must be locked to
-// ensure that they can be safely accessed while the SWGL context might be used
-// by another thread. Band extents along the Y axis may be used to clip the
-// destination rectangle without affecting the integer scaling ratios.
-void Composite(LockedTexture* lockedDst, LockedTexture* lockedSrc, GLint srcX,
-               GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
-               GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
-               GLboolean opaque, GLboolean flip, GLenum filter, GLint clipX,
-               GLint clipY, GLsizei clipWidth, GLsizei clipHeight) {
-  if (!lockedDst || !lockedSrc) {
-    return;
-  }
-  Texture& srctex = *lockedSrc;
-  Texture& dsttex = *lockedDst;
-  assert(srctex.bpp() == 4);
-  assert(dsttex.bpp() == 4);
-
-  IntRect srcReq =
-      IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - srctex.offset;
-  IntRect dstReq =
-      IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
-  // Compute clip rect as relative to the dstReq, as that's the same coords
-  // as used for the sampling bounds.
-  IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
-                      clipY - dstY + clipHeight};
-
-  if (opaque) {
-    // Ensure we have rows of at least 2 pixels when using the linear filter
-    // to avoid overreading the row.
-    if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) {
-      linear_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
-    } else {
-      scale_blit<false>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
-    }
-  } else {
-    if (!srcReq.same_size(dstReq) && srctex.width >= 2 && filter == GL_LINEAR) {
-      linear_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
-    } else {
-      scale_blit<true>(srctex, srcReq, dsttex, dstReq, flip, clipRect);
-    }
-  }
-}
-
-}  // extern "C"
-
-// Saturated add helper for YUV conversion. Supported platforms have intrinsics
-// to do this natively, but support a slower generic fallback just in case.
-static inline V8<int16_t> addsat(V8<int16_t> x, V8<int16_t> y) {
-#if USE_SSE2
-  return _mm_adds_epi16(x, y);
-#elif USE_NEON
-  return vqaddq_s16(x, y);
-#else
-  auto r = x + y;
-  // An overflow occurred if the signs of both inputs x and y did not differ
-  // but yet the sign of the result did differ.
-  auto overflow = (~(x ^ y) & (r ^ x)) >> 15;
-  // If there was an overflow, we need to choose the appropriate limit to clamp
-  // to depending on whether or not the inputs are negative.
-  auto limit = (x >> 15) ^ 0x7FFF;
-  // If we didn't overflow, just use the result, and otherwise, use the limit.
-  return (~overflow & r) | (overflow & limit);
-#endif
-}
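
// Scalar model of the generic fallback above: a saturating signed 16-bit add
// clamps an overflowed sum to the extreme matching the operands' sign.
static inline int16_t addsat_scalar(int16_t x, int16_t y) {
  int32_t r = int32_t(x) + int32_t(y);
  if (r > INT16_MAX) return INT16_MAX;
  if (r < INT16_MIN) return INT16_MIN;
  return int16_t(r);
}
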
-
-// Interleave and packing helper for YUV conversion. During transform by the
-// color matrix, the color components are de-interleaved as this format is
-// usually what comes out of the planar YUV textures. The components thus need
-// to be interleaved before finally getting packed to BGRA format. Alpha is
-// forced to be opaque.
-static inline PackedRGBA8 packYUV(V8<int16_t> gg, V8<int16_t> br) {
-  return pack(bit_cast<WideRGBA8>(zip(br, gg))) |
-         PackedRGBA8{0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
-}
-
-// clang-format off
-// Supports YUV color matrices of the form:
-// [R]   [1.1643835616438356,  0.0,  rv ]   [Y -  16]
-// [G] = [1.1643835616438358, -gu,  -gv ] x [U - 128]
-// [B]   [1.1643835616438356,  bu,  0.0 ]   [V - 128]
-// We must be able to multiply a YUV input by a matrix coefficient ranging as
-// high as ~2.2 in the U/V cases, where U/V can be signed values between -128
-// and 127. The largest fixed-point representation we can thus support without
-// overflowing 16 bit integers leaves us 6 bits of fractional precision while
-// also supporting a sign bit. The closest representation of the Y coefficient
-// ~1.164 in this precision is 74.5/2^6 which is common to all color spaces
-// we support. Conversions can still sometimes overflow the precision and
-// require clamping back into range, so we use saturated additions to do this
-// efficiently at no extra cost.
-// clang-format on
-struct YUVMatrix {
-  // These constants are loaded off the "this" pointer via relative addressing
-  // modes and should be about as quick to load as directly addressed SIMD
-  // constant memory.
-  V8<int16_t> rbCoeffs;
-  V8<int16_t> gCoeffs;
-  V8<uint16_t> yScale;
-  V8<int16_t> yBias;
-  V8<int16_t> uvBias;
-  V8<int16_t> brMask;
-
-  // Set the coefficients to cancel out and pass through YUV as GBR. All biases
-  // are set to zero and the BR-mask is set to remove the contribution of Y to
-  // the BR channels. Scales are set such that the shift by 6 in convert is
-  // balanced.
-  YUVMatrix()
-      : rbCoeffs(1 << 6),
-        gCoeffs(0),
-        yScale(1 << (6 + 1)),
-        yBias(0),
-        uvBias(0),
-        brMask(0) {}
-
-  // Convert matrix coefficients to fixed-point representation.
-  YUVMatrix(double rv, double gu, double gv, double bu)
-      : rbCoeffs(
-            zip(I16(int16_t(bu * 64.0 + 0.5)), I16(int16_t(rv * 64.0 + 0.5)))),
-        gCoeffs(zip(I16(-int16_t(gu * -64.0 + 0.5)),
-                    I16(-int16_t(gv * -64.0 + 0.5)))),
-        yScale(2 * 74 + 1),
-        yBias(int16_t(-16 * 74.5) + (1 << 5)),
-        uvBias(-128),
-        brMask(-1) {}
-
-  ALWAYS_INLINE PackedRGBA8 convert(V8<int16_t> yy, V8<int16_t> uv) const {
-    // Bias Y values by -16 and multiply by 74.5. Add 2^5 offset to round to
-    // nearest 2^6. Note that we have to use an unsigned multiply with a 2x
-    // scale to represent a fractional scale and to avoid shifting with the
-    // sign bit.
-    yy = bit_cast<V8<int16_t>>((bit_cast<V8<uint16_t>>(yy) * yScale) >> 1) +
-         yBias;
-
-    // Bias U/V values by -128.
-    uv += uvBias;
-
-    // Compute (R, B) = (74.5*Y + rv*V, 74.5*Y + bu*U)
-    auto br = rbCoeffs * uv;
-    br = addsat(yy & brMask, br);
-    br >>= 6;
-
-    // Compute G = 74.5*Y + -gu*U + -gv*V
-    auto gg = gCoeffs * uv;
-    gg = addsat(
-        yy,
-        addsat(gg, bit_cast<V8<int16_t>>(bit_cast<V4<uint32_t>>(gg) >> 16)));
-    gg >>= 6;
-
-    // Interleave B/R and G values. Force alpha to opaque.
-    return packYUV(gg, br);
-  }
-};
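
// Worked example (a sketch): the Rec. 601 coefficients quantized exactly as
// the YUVMatrix constructor above does it, to 6 fractional bits.
constexpr int16_t kRec601RV = int16_t(1.5960267857142858 * 64.0 + 0.5);     // 102
constexpr int16_t kRec601BU = int16_t(2.017232142857143 * 64.0 + 0.5);      // 129
constexpr int16_t kRec601GU = -int16_t(-0.3917622900949137 * -64.0 + 0.5);  // -25
constexpr int16_t kRec601GV = -int16_t(-0.8129676472377708 * -64.0 + 0.5);  // -52
// The shared luma coefficient 1.1643835616... * 64 = 74.52 is stored as
// yScale = 2 * 74 + 1 = 149, i.e. 74.5 at doubled scale, which convert()
// undoes with its ">> 1".
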
-
-enum YUVColorSpace { REC_601 = 0, REC_709, REC_2020, IDENTITY };
-
-static const YUVMatrix yuvMatrix[IDENTITY + 1] = {
-    // clang-format off
-// From Rec601:
-// [R]   [1.1643835616438356,  0.0,                 1.5960267857142858    ]   [Y -  16]
-// [G] = [1.1643835616438358, -0.3917622900949137, -0.8129676472377708    ] x [U - 128]
-// [B]   [1.1643835616438356,  2.017232142857143,   8.862867620416422e-17 ]   [V - 128]
-  {1.5960267857142858, -0.3917622900949137, -0.8129676472377708, 2.017232142857143},
-
-// From Rec709:
-// [R]   [1.1643835616438356,  0.0,                  1.7927410714285714 ]   [Y -  16]
-// [G] = [1.1643835616438358, -0.21324861427372963, -0.532909328559444  ] x [U - 128]
-// [B]   [1.1643835616438356,  2.1124017857142854,   0.0                ]   [V - 128]
-  {1.7927410714285714, -0.21324861427372963, -0.532909328559444, 2.1124017857142854},
-
-// From Rec2020:
-// [R]   [1.16438356164384,  0.0,                1.678674107142860  ]   [Y -  16]
-// [G] = [1.16438356164384, -0.187326104219343, -0.650424318505057  ] x [U - 128]
-// [B]   [1.16438356164384,  2.14177232142857,   0.0                ]   [V - 128]
-  {1.678674107142860, -0.187326104219343, -0.650424318505057, 2.14177232142857},
-
-// Identity
-// [R]   [V]
-// [G] = [Y]
-// [B]   [U]
-  {},
-    // clang-format on
-};
-
-// Helper function for textureLinearRowR8 that samples horizontal taps and
-// combines them based on Y fraction with next row.
-template <typename S>
-static ALWAYS_INLINE V8<int16_t> linearRowTapsR8(S sampler, I32 ix,
-                                                 int32_t offsety,
-                                                 int32_t stridey,
-                                                 int16_t fracy) {
-  uint8_t* buf = (uint8_t*)sampler->buf + offsety;
-  auto a0 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
-  auto b0 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
-  auto c0 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
-  auto d0 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
-  auto abcd0 = CONVERT(combine(a0, b0, c0, d0), V8<int16_t>);
-  buf += stridey;
-  auto a1 = unaligned_load<V2<uint8_t>>(&buf[ix.x]);
-  auto b1 = unaligned_load<V2<uint8_t>>(&buf[ix.y]);
-  auto c1 = unaligned_load<V2<uint8_t>>(&buf[ix.z]);
-  auto d1 = unaligned_load<V2<uint8_t>>(&buf[ix.w]);
-  auto abcd1 = CONVERT(combine(a1, b1, c1, d1), V8<int16_t>);
-  abcd0 += ((abcd1 - abcd0) * fracy) >> 7;
-  return abcd0;
-}
-
-// Optimized version of textureLinearPackedR8 for Y R8 texture. This assumes
-// constant Y and returns a duplicate of the result interleaved with itself
-// to aid in later YUV transformation.
-template <typename S>
-static inline V8<int16_t> textureLinearRowR8(S sampler, I32 ix,
-                                             int32_t offsety, int32_t stridey,
-                                             int16_t fracy) {
-  assert(sampler->format == TextureFormat::R8);
-
-  // Calculate X fraction and clamp X offset into range.
-  I32 fracx = ix;
-  ix >>= 7;
-  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
-  ix = clampCoord(ix, sampler->width - 1);
-
-  // Load the sample taps and combine rows.
-  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);
-
-  // Unzip the result and do the final horizontal multiply-add based on the
-  // X fraction.
-  auto abcdl = SHUFFLE(abcd, abcd, 0, 0, 2, 2, 4, 4, 6, 6);
-  auto abcdh = SHUFFLE(abcd, abcd, 1, 1, 3, 3, 5, 5, 7, 7);
-  abcdl += ((abcdh - abcdl) * CONVERT(fracx, I16).xxyyzzww) >> 7;
-
-  // The final result is the packed values interleaved with a duplicate of
-  // themselves.
-  return abcdl;
-}
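
// The two-tap blends above are 7-bit fixed-point lerps. Scalar form, where
// frac in 0..127 weights toward the second tap:
static inline int16_t lerp7(int16_t a, int16_t b, int16_t frac) {
  return int16_t(a + (((b - a) * frac) >> 7));
}
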
-
-// Optimized version of textureLinearPackedR8 for paired U/V R8 textures.
-// Since the two textures have the same dimensions and stride, the addressing
-// math can be shared between both samplers. This also allows a coalesced
-// multiply in the final stage by packing both U/V results into a single
-// operation.
-template <typename S>
-static inline V8<int16_t> textureLinearRowPairedR8(S sampler, S sampler2,
-                                                   I32 ix, int32_t offsety,
-                                                   int32_t stridey,
-                                                   int16_t fracy) {
-  assert(sampler->format == TextureFormat::R8 &&
-         sampler2->format == TextureFormat::R8);
-  assert(sampler->width == sampler2->width &&
-         sampler->height == sampler2->height);
-  assert(sampler->stride == sampler2->stride);
-
-  // Calculate X fraction and clamp X offset into range.
-  I32 fracx = ix;
-  ix >>= 7;
-  fracx = ((fracx & (ix >= 0)) | (ix > int32_t(sampler->width) - 2)) & 0x7F;
-  ix = clampCoord(ix, sampler->width - 1);
-
-  // Load the sample taps for the first sampler and combine rows.
-  auto abcd = linearRowTapsR8(sampler, ix, offsety, stridey, fracy);
-
-  // Load the sample taps for the second sampler and combine rows.
-  auto xyzw = linearRowTapsR8(sampler2, ix, offsety, stridey, fracy);
-
-  // We are left with a result vector for each sampler with values for adjacent
-  // pixels interleaved together in each. We need to unzip these values so that
-  // we can do the final horizontal multiply-add based on the X fraction.
-  auto abcdxyzwl = SHUFFLE(abcd, xyzw, 0, 8, 2, 10, 4, 12, 6, 14);
-  auto abcdxyzwh = SHUFFLE(abcd, xyzw, 1, 9, 3, 11, 5, 13, 7, 15);
-  abcdxyzwl += ((abcdxyzwh - abcdxyzwl) * CONVERT(fracx, I16).xxyyzzww) >> 7;
-
-  // The final result is the packed values for the first sampler interleaved
-  // with the packed values for the second sampler.
-  return abcdxyzwl;
-}
-
-// Casting to int loses some precision while stepping that can offset the
-// image, so shift the values by some extra bits of precision to minimize
-// this. We support up to 16 bits of image size, 7 bits of quantization,
-// and 1 bit for sign, which leaves 8 bits left for extra precision.
-const int STEP_BITS = 8;
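
// Sketch of how a stepped coordinate decomposes under STEP_BITS (the helper
// names are illustrative; the real code inlines these shifts): the low
// STEP_BITS bits guard against truncation drift, the next 7 bits are the
// lerp fraction, and the remaining bits are the texel index.
static inline int32_t step_texel(int32_t u) { return u >> (STEP_BITS + 7); }
static inline int32_t step_frac(int32_t u) { return (u >> STEP_BITS) & 0x7F; }
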
-
-// Optimized version of textureLinearPackedR8 for Y R8 texture with
-// half-resolution paired U/V R8 textures. This allows us to more efficiently
-// pack YUV samples into vectors to substantially reduce math operations even
-// further.
-template <bool BLEND>
-static inline void upscaleYUV42R8(uint32_t* dest, int span, uint8_t* yRow,
-                                  I32 yU, int32_t yDU, int32_t yStrideV,
-                                  int16_t yFracV, uint8_t* cRow1,
-                                  uint8_t* cRow2, I32 cU, int32_t cDU,
-                                  int32_t cStrideV, int16_t cFracV,
-                                  const YUVMatrix& colorSpace) {
-  // As much as possible try to utilize the fact that we're only using half
-  // the UV samples to combine Y and UV samples into single vectors. Here we
-  // need to initialize several useful vector quantities for stepping
-  // fractional offsets. For the UV samples, we take the average of the
-  // first+second and third+fourth samples in a chunk which conceptually
-  // correspond to offsets 0.5 and 1.5 (in 0..2 range). This allows us to
-  // reconstruct intermediate samples 0.25, 0.75, 1.25, and 1.75 later. X
-  // fraction is shifted over into the top 7 bits of an unsigned short so
-  // that we can mask off the exact fractional bits we need to blend merely
-  // by right shifting them into position.
-  cU = (cU.xzxz + cU.ywyw) >> 1;
-  auto ycFracX = CONVERT(combine(yU, cU), V8<uint16_t>)
-                 << (16 - (STEP_BITS + 7));
-  auto ycFracDX = combine(I16(yDU), I16(cDU)) << (16 - (STEP_BITS + 7));
-  auto ycFracV = combine(I16(yFracV), I16(cFracV));
-  I32 yI = yU >> (STEP_BITS + 7);
-  I32 cI = cU >> (STEP_BITS + 7);
-  // Load initial combined YUV samples for each row and blend them.
-  auto ycSrc0 =
-      CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x]),
-                      combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x]),
-                              unaligned_load<V2<uint8_t>>(&cRow2[cI.x]))),
-              V8<int16_t>);
-  auto ycSrc1 = CONVERT(
-      combine(unaligned_load<V4<uint8_t>>(&yRow[yI.x + yStrideV]),
-              combine(unaligned_load<V2<uint8_t>>(&cRow1[cI.x + cStrideV]),
-                      unaligned_load<V2<uint8_t>>(&cRow2[cI.x + cStrideV]))),
-      V8<int16_t>);
-  auto ycSrc = ycSrc0 + (((ycSrc1 - ycSrc0) * ycFracV) >> 7);
-
-  // Here we shift in results from the next sample while caching results from
-  // the previous sample. This allows us to reduce the multiplications in the
-  // inner loop down to only two since we just need to blend the new samples
-  // horizontally and then vertically once each.
-  for (uint32_t* end = dest + span; dest < end; dest += 4) {
-    yU += yDU;
-    I32 yIn = yU >> (STEP_BITS + 7);
-    cU += cDU;
-    I32 cIn = cU >> (STEP_BITS + 7);
-    // Load combined YUV samples for the next chunk on each row and blend
-    // them.
-    auto ycSrc0n =
-        CONVERT(combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x]),
-                        combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x]),
-                                unaligned_load<V2<uint8_t>>(&cRow2[cIn.x]))),
-                V8<int16_t>);
-    auto ycSrc1n = CONVERT(
-        combine(unaligned_load<V4<uint8_t>>(&yRow[yIn.x + yStrideV]),
-                combine(unaligned_load<V2<uint8_t>>(&cRow1[cIn.x + cStrideV]),
-                        unaligned_load<V2<uint8_t>>(&cRow2[cIn.x + cStrideV]))),
-        V8<int16_t>);
-    auto ycSrcn = ycSrc0n + (((ycSrc1n - ycSrc0n) * ycFracV) >> 7);
-
-    // The source samples for the chunk may not match the actual tap offsets.
-    // Since we're upscaling, we know the tap offsets fall within all the
-    // samples in a 4-wide chunk. Since we can't rely on PSHUFB or similar,
-    // instead we do laborious shuffling here for the Y samples and then the
-    // UV samples.
-    auto yshuf = lowHalf(ycSrc);
-    auto yshufn =
-        SHUFFLE(yshuf, yIn.x == yI.w ? lowHalf(ycSrcn).yyyy : lowHalf(ycSrcn),
-                1, 2, 3, 4);
-    if (yI.y == yI.x) {
-      yshuf = yshuf.xxyz;
-      yshufn = yshufn.xxyz;
-    }
-    if (yI.z == yI.y) {
-      yshuf = yshuf.xyyz;
-      yshufn = yshufn.xyyz;
-    }
-    if (yI.w == yI.z) {
-      yshuf = yshuf.xyzz;
-      yshufn = yshufn.xyzz;
-    }
-
-    auto cshuf = highHalf(ycSrc);
-    auto cshufn =
-        SHUFFLE(cshuf, cIn.x == cI.y ? highHalf(ycSrcn).yyww : highHalf(ycSrcn),
-                1, 4, 3, 6);
-    if (cI.y == cI.x) {
-      cshuf = cshuf.xxzz;
-      cshufn = cshufn.xxzz;
-    }
-
-    // After shuffling, combine the Y and UV samples back into a single vector
-    // for blending. Shift X fraction into position as unsigned to mask off top
-    // bits and get rid of low bits to avoid multiplication overflow.
-    auto yuvPx = combine(yshuf, cshuf);
-    yuvPx += ((combine(yshufn, cshufn) - yuvPx) *
-              bit_cast<V8<int16_t>>(ycFracX >> (16 - 7))) >>
-             7;
-
-    // Cache the new samples as the current samples on the next iteration.
-    ycSrc = ycSrcn;
-    ycFracX += ycFracDX;
-    yI = yIn;
-    cI = cIn;
-
-    // De-interleave the Y and UV results. We need to average the UV results
-    // to produce values for intermediate samples. Taps for UV were collected
-    // at offsets 0.5 and 1.5, such that if we take a quarter of the difference
-    // (1.5-0.5)/4, subtract it from even samples, and add it to odd samples,
-    // we can estimate samples 0.25, 0.75, 1.25, and 1.75.
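
// Scalar sketch of the intermediate chroma reconstruction described above:
// from averaged taps at offsets 0.5 (c05) and 1.5 (c15), a quarter of the
// tap difference estimates the samples at 0.25, 0.75, 1.25, and 1.75.
static inline void chroma_quarters(int16_t c05, int16_t c15, int16_t out[4]) {
  int16_t q = int16_t((c15 - c05) >> 2);
  out[0] = int16_t(c05 - q);  // ~0.25
  out[1] = int16_t(c05 + q);  // ~0.75
  out[2] = int16_t(c15 - q);  // ~1.25
  out[3] = int16_t(c15 + q);  // ~1.75
}
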
-    auto yPx = SHUFFLE(yuvPx, yuvPx, 0, 0, 1, 1, 2, 2, 3, 3);
-    auto uvPx = SHUFFLE(yuvPx, yuvPx, 4, 6, 4, 6, 5, 7, 5, 7) +
-                ((SHUFFLE(yuvPx, yuvPx, 4, 6, 5, 7, 4, 6, 5, 7) -
-                  SHUFFLE(yuvPx, yuvPx, 5, 7, 4, 6, 5, 7, 4, 6)) >>
-                 2);
-
-    commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
-  }
-}
-
-// This is the inner loop driver of CompositeYUV that processes an axis-aligned
-// YUV span, dispatching based on appropriate format and scaling. This is also
-// reused by blendYUV to accelerate some cases of texture sampling in the
-// shader.
-template <bool BLEND = false>
-static void linear_row_yuv(uint32_t* dest, int span, sampler2DRect samplerY,
-                           const vec2_scalar& srcUV, float srcDU,
-                           sampler2DRect samplerU, sampler2DRect samplerV,
-                           const vec2_scalar& chromaUV, float chromaDU,
-                           int colorDepth, const YUVMatrix& colorSpace) {
-  // Calculate varying and constant interp data for Y plane.
-  I32 yU = cast(init_interp(srcUV.x, srcDU) * (1 << STEP_BITS));
-  int32_t yV = int32_t(srcUV.y);
-
-  // Calculate varying and constant interp data for chroma planes.
-  I32 cU = cast(init_interp(chromaUV.x, chromaDU) * (1 << STEP_BITS));
-  int32_t cV = int32_t(chromaUV.y);
-
-  // We need to skip 4 pixels per chunk.
-  int32_t yDU = int32_t((4 << STEP_BITS) * srcDU);
-  int32_t cDU = int32_t((4 << STEP_BITS) * chromaDU);
-
-  if (samplerY->width < 2 || samplerU->width < 2) {
-    // If the source row has less than 2 pixels, it's not safe to use a linear
-    // filter because it may overread the row. Just convert the single pixel
-    // with nearest filtering and fill the row with it.
-    I16 yuv = CONVERT(
-        round_pixel((Float){texelFetch(samplerY, ivec2(srcUV)).x.x,
-                            texelFetch(samplerU, ivec2(chromaUV)).x.x,
-                            texelFetch(samplerV, ivec2(chromaUV)).x.x, 1.0f}),
-        I16);
-    commit_solid_span<BLEND>(
-        dest,
-        unpack(colorSpace.convert(V8<int16_t>(yuv.x),
-                                  zip(I16(yuv.y), I16(yuv.z)))),
-        span);
-  } else if (samplerY->format == TextureFormat::R16) {
-    // Sample each YUV plane, rescale it to fit in low 8 bits of word, and
-    // then transform them by the appropriate color space.
-    assert(colorDepth > 8);
-    // Need to right shift the sample by the amount of bits over 8 it
-    // occupies. On output from textureLinearUnpackedR16, we have lost 1 bit
-    // of precision at the low end already, hence 1 is subtracted from the
-    // color depth.
-    int rescaleBits = (colorDepth - 1) - 8;
-    for (; span >= 4; span -= 4) {
-      auto yPx =
-          textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
-          rescaleBits;
-      auto uPx =
-          textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
-          rescaleBits;
-      auto vPx =
-          textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
-          rescaleBits;
-      commit_blend_span<BLEND>(
-          dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)));
-      dest += 4;
-      yU += yDU;
-      cU += cDU;
-    }
-    if (span > 0) {
-      // Handle any remaining pixels...
-      auto yPx =
-          textureLinearUnpackedR16(samplerY, ivec2(yU >> STEP_BITS, yV)) >>
-          rescaleBits;
-      auto uPx =
-          textureLinearUnpackedR16(samplerU, ivec2(cU >> STEP_BITS, cV)) >>
-          rescaleBits;
-      auto vPx =
-          textureLinearUnpackedR16(samplerV, ivec2(cU >> STEP_BITS, cV)) >>
-          rescaleBits;
-      commit_blend_span<BLEND>(
-          dest, colorSpace.convert(zip(yPx, yPx), zip(uPx, vPx)), span);
-    }
-  } else {
-    assert(samplerY->format == TextureFormat::R8);
-    assert(colorDepth == 8);
-
-    // Calculate varying and constant interp data for Y plane.
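
// Worked example of the R16 rescale in the function above: for hypothetical
// 10-bit content, rescaleBits = (10 - 1) - 8 = 1, because
// textureLinearUnpackedR16 has already dropped one low bit of precision.
constexpr int kRescaleBits10 = (10 - 1) - 8;
static_assert(kRescaleBits10 == 1, "10-bit YUV rescales by a single shift");
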
-    int16_t yFracV = yV & 0x7F;
-    yV >>= 7;
-    int32_t yOffsetV = clampCoord(yV, samplerY->height) * samplerY->stride;
-    int32_t yStrideV =
-        yV >= 0 && yV < int32_t(samplerY->height) - 1 ? samplerY->stride : 0;
-
-    // Calculate varying and constant interp data for chroma planes.
-    int16_t cFracV = cV & 0x7F;
-    cV >>= 7;
-    int32_t cOffsetV = clampCoord(cV, samplerU->height) * samplerU->stride;
-    int32_t cStrideV =
-        cV >= 0 && cV < int32_t(samplerU->height) - 1 ? samplerU->stride : 0;
-
-    // If we're sampling the UV planes at half the resolution of the Y plane,
-    // then try to use half resolution fast-path.
-    if (yDU >= cDU && cDU > 0 && yDU <= (4 << (STEP_BITS + 7)) &&
-        cDU <= (2 << (STEP_BITS + 7))) {
-      // Ensure that samples don't fall outside of the valid bounds of each
-      // planar texture. Step until the initial X coordinates are positive.
-      for (; (yU.x < 0 || cU.x < 0) && span >= 4; span -= 4) {
-        auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
-                                      yStrideV, yFracV);
-        auto uvPx = textureLinearRowPairedR8(
-            samplerU, samplerV, cU >> STEP_BITS, cOffsetV, cStrideV, cFracV);
-        commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
-        dest += 4;
-        yU += yDU;
-        cU += cDU;
-      }
-      // Calculate the number of aligned chunks that we can step inside the
-      // bounds of each planar texture without overreading.
-      int inside = min(
-          min((((int(samplerY->width) - 4) << (STEP_BITS + 7)) - yU.x) / yDU,
-              (((int(samplerU->width) - 4) << (STEP_BITS + 7)) - cU.x) / cDU) *
-              4,
-          span & ~3);
-      if (inside > 0) {
-        uint8_t* yRow = (uint8_t*)samplerY->buf + yOffsetV;
-        uint8_t* cRow1 = (uint8_t*)samplerU->buf + cOffsetV;
-        uint8_t* cRow2 = (uint8_t*)samplerV->buf + cOffsetV;
-        upscaleYUV42R8<BLEND>(dest, inside, yRow, yU, yDU, yStrideV, yFracV,
-                              cRow1, cRow2, cU, cDU, cStrideV, cFracV,
-                              colorSpace);
-        span -= inside;
-        dest += inside;
-        yU += (inside / 4) * yDU;
-        cU += (inside / 4) * cDU;
-      }
-      // If there are any remaining chunks that weren't inside, handle them
-      // below.
-    }
-    for (; span >= 4; span -= 4) {
-      // Sample each YUV plane and then transform them by the appropriate
-      // color space.
-      auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
-                                    yStrideV, yFracV);
-      auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
-                                           cOffsetV, cStrideV, cFracV);
-      commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx));
-      dest += 4;
-      yU += yDU;
-      cU += cDU;
-    }
-    if (span > 0) {
-      // Handle any remaining pixels...
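
// Worked 4:2:0 example for the chroma scaling in linear_convert_yuv below
// (hypothetical sizes): a 1920x1080 Y plane paired with 960x540 U/V planes
// steps chroma at half the luma rate in both axes.
constexpr float kChromaScaleX = 960.0f / 1920.0f;  // 0.5
constexpr float kChromaScaleY = 540.0f / 1080.0f;  // 0.5
static_assert(kChromaScaleX == 0.5f && kChromaScaleY == 0.5f,
              "4:2:0 chroma planes advance at half the luma rate");
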
-      auto yPx = textureLinearRowR8(samplerY, yU >> STEP_BITS, yOffsetV,
-                                    yStrideV, yFracV);
-      auto uvPx = textureLinearRowPairedR8(samplerU, samplerV, cU >> STEP_BITS,
-                                           cOffsetV, cStrideV, cFracV);
-      commit_blend_span<BLEND>(dest, colorSpace.convert(yPx, uvPx), span);
-    }
-  }
-}
-
-static void linear_convert_yuv(Texture& ytex, Texture& utex, Texture& vtex,
-                               YUVColorSpace colorSpace, int colorDepth,
-                               const IntRect& srcReq, Texture& dsttex,
-                               const IntRect& dstReq, bool invertY,
-                               const IntRect& clipRect) {
-  // Compute valid dest bounds
-  IntRect dstBounds = dsttex.sample_bounds(dstReq, invertY);
-  dstBounds.intersect(clipRect);
-  // Check if sampling bounds are empty
-  if (dstBounds.is_empty()) {
-    return;
-  }
-  // Initialize samplers for source textures
-  sampler2DRect_impl sampler[3];
-  init_sampler(&sampler[0], ytex);
-  init_sampler(&sampler[1], utex);
-  init_sampler(&sampler[2], vtex);
-
-  // Compute source UVs
-  vec2_scalar srcUV(srcReq.x0, srcReq.y0);
-  vec2_scalar srcDUV(float(srcReq.width()) / dstReq.width(),
-                     float(srcReq.height()) / dstReq.height());
-  // Inverted Y must step downward along source rows
-  if (invertY) {
-    srcUV.y += srcReq.height();
-    srcDUV.y = -srcDUV.y;
-  }
-  // Skip to clamped source start
-  srcUV += srcDUV * (vec2_scalar(dstBounds.x0, dstBounds.y0) + 0.5f);
-  // Calculate separate chroma UVs for chroma planes with different scale
-  vec2_scalar chromaScale(float(utex.width) / ytex.width,
-                          float(utex.height) / ytex.height);
-  vec2_scalar chromaUV = srcUV * chromaScale;
-  vec2_scalar chromaDUV = srcDUV * chromaScale;
-  // Scale UVs by lerp precision. If the row has only 1 pixel, then don't
-  // quantize so that we can use nearest filtering instead to avoid overreads.
-  if (ytex.width >= 2 && utex.width >= 2) {
-    srcUV = linearQuantize(srcUV, 128);
-    srcDUV *= 128.0f;
-    chromaUV = linearQuantize(chromaUV, 128);
-    chromaDUV *= 128.0f;
-  }
-  // Calculate dest pointer from clamped offsets
-  int destStride = dsttex.stride();
-  char* dest = dsttex.sample_ptr(dstReq, dstBounds);
-  int span = dstBounds.width();
-  for (int rows = dstBounds.height(); rows > 0; rows--) {
-    linear_row_yuv((uint32_t*)dest, span, &sampler[0], srcUV, srcDUV.x,
-                   &sampler[1], &sampler[2], chromaUV, chromaDUV.x, colorDepth,
-                   yuvMatrix[colorSpace]);
-    dest += destStride;
-    srcUV.y += srcDUV.y;
-    chromaUV.y += chromaDUV.y;
-  }
-}
-
-extern "C" {
-
-// Extension for compositing a YUV surface represented by separate YUV planes
-// to a BGRA destination. The supplied color space is used to determine the
-// transform from YUV to BGRA after sampling.
-void CompositeYUV(LockedTexture* lockedDst, LockedTexture* lockedY,
-                  LockedTexture* lockedU, LockedTexture* lockedV,
-                  YUVColorSpace colorSpace, GLuint colorDepth, GLint srcX,
-                  GLint srcY, GLsizei srcWidth, GLsizei srcHeight, GLint dstX,
-                  GLint dstY, GLsizei dstWidth, GLsizei dstHeight,
-                  GLboolean flip, GLint clipX, GLint clipY, GLsizei clipWidth,
-                  GLsizei clipHeight) {
-  if (!lockedDst || !lockedY || !lockedU || !lockedV) {
-    return;
-  }
-  if (colorSpace > IDENTITY) {
-    assert(false);
-    return;
-  }
-  Texture& ytex = *lockedY;
-  Texture& utex = *lockedU;
-  Texture& vtex = *lockedV;
-  Texture& dsttex = *lockedDst;
-  // All YUV planes must currently be represented by R8 or R16 textures.
-  // The chroma (U/V) planes must have matching dimensions.
-  assert(ytex.bpp() == utex.bpp() && ytex.bpp() == vtex.bpp());
-  assert((ytex.bpp() == 1 && colorDepth == 8) ||
-         (ytex.bpp() == 2 && colorDepth > 8));
-  // assert(ytex.width == utex.width && ytex.height == utex.height);
-  assert(utex.width == vtex.width && utex.height == vtex.height);
-  assert(ytex.offset == utex.offset && ytex.offset == vtex.offset);
-  assert(dsttex.bpp() == 4);
-
-  IntRect srcReq =
-      IntRect{srcX, srcY, srcX + srcWidth, srcY + srcHeight} - ytex.offset;
-  IntRect dstReq =
-      IntRect{dstX, dstY, dstX + dstWidth, dstY + dstHeight} - dsttex.offset;
-  // Compute clip rect as relative to the dstReq, as that's the same coords
-  // as used for the sampling bounds.
-  IntRect clipRect = {clipX - dstX, clipY - dstY, clipX - dstX + clipWidth,
-                      clipY - dstY + clipHeight};
-  // For now, always use a linear filter path that would be required for
-  // scaling. Further fast-paths for non-scaled video might be desirable in
-  // the future.
-  linear_convert_yuv(ytex, utex, vtex, colorSpace, colorDepth, srcReq, dsttex,
-                     dstReq, flip, clipRect);
-}
-
-}  // extern "C"
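
// A hypothetical client call for the extension above: composite a locked
// 1280x720 8-bit Rec. 709 frame, unscaled and unflipped, clipped to the full
// frame. The lock arguments stand in for LockFramebuffer/LockTexture results.
static inline void example_composite_yuv(LockedTexture* dst, LockedTexture* y,
                                         LockedTexture* u, LockedTexture* v) {
  CompositeYUV(dst, y, u, v, REC_709, /*colorDepth*/ 8,
               /*src*/ 0, 0, 1280, 720, /*dst*/ 0, 0, 1280, 720,
               /*flip*/ GL_FALSE, /*clip*/ 0, 0, 1280, 720);
}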