From 7f539b6033a07550000d33e8d6e4f8bf0fbf96cc Mon Sep 17 00:00:00 2001 From: degasus Date: Sun, 16 Feb 2014 15:17:21 +0100 Subject: [PATCH 1/9] ogl: optimize real xfb a bit Both nvidia + mesa seem not to optimize x / (2**n) to x >> n, so we do it ourselves. --- Source/Core/VideoBackends/OGL/TextureConverter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Source/Core/VideoBackends/OGL/TextureConverter.cpp b/Source/Core/VideoBackends/OGL/TextureConverter.cpp index 753bd64ada..dbbb605f6b 100644 --- a/Source/Core/VideoBackends/OGL/TextureConverter.cpp +++ b/Source/Core/VideoBackends/OGL/TextureConverter.cpp @@ -117,7 +117,7 @@ void CreatePrograms() " ivec2 uv = ivec2(gl_FragCoord.xy);\n" // We switch top/bottom here. TODO: move this to screen blit. " ivec2 ts = textureSize(samp9, 0);\n" - " vec4 c0 = texelFetch(samp9, ivec2(uv.x/2, ts.y-uv.y-1), 0);\n" + " vec4 c0 = texelFetch(samp9, ivec2(uv.x>>1, ts.y-uv.y-1), 0);\n" " float y = mix(c0.b, c0.r, (uv.x & 1) == 1);\n" " float yComp = 1.164 * (y - 0.0625);\n" " float uComp = c0.g - 0.5;\n" From f99c8a0b70c604565731d2c7c000629806d755b4 Mon Sep 17 00:00:00 2001 From: degasus Date: Sun, 16 Feb 2014 19:13:48 +0100 Subject: [PATCH 2/9] merge common parts of encoding shaders --- Source/Core/VideoCommon/TextureConversionShader.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index a7db33418f..6b9e1af370 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -105,6 +105,7 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " sampleUv.x = xib + halfxb * %d;\n", blkW); WRITE(p, " sampleUv.y = yb + xoff;\n"); + WRITE(p, " bool first = xb == (halfxb * 2);\n"); } void WriteSampleColor(char*& p, const char* colorComp, const char* dest, int xoffset, API_TYPE ApiType) @@ -373,8 +374,6 @@ void 
WriteRGBA8Encoder(char* p,API_TYPE ApiType) { WriteSwizzler(p, GX_TF_RGBA8, ApiType); - WRITE(p, " bool first = xb == (halfxb * 2);\n"); - WRITE(p, " float4 texSample;\n"); WRITE(p, " float4 color0;\n"); WRITE(p, " float4 color1;\n"); @@ -563,8 +562,6 @@ void WriteZ24Encoder(char* p, API_TYPE ApiType) { WriteSwizzler(p, GX_TF_Z24X8, ApiType); - WRITE(p, " bool first = xb == (halfxb * 2);\n"); - WRITE(p, " float depth0;\n"); WRITE(p, " float depth1;\n"); WRITE(p, " float3 expanded0;\n"); From 94da4e1aa29a68cb723898a63bd01568bbbd8c4f Mon Sep 17 00:00:00 2001 From: degasus Date: Mon, 24 Feb 2014 10:20:53 +0100 Subject: [PATCH 3/9] MathUtil: Change Log2 return value to int Log2(u64) can't be bigger than 63, so there is no need in forcing a 64 bit value. So just using a common int seems more natural. --- Source/Core/Common/MathUtil.h | 4 ++-- Source/Core/DolphinWX/GameListCtrl.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Source/Core/Common/MathUtil.h b/Source/Core/Common/MathUtil.h index da12822f5d..14b9309fc6 100644 --- a/Source/Core/Common/MathUtil.h +++ b/Source/Core/Common/MathUtil.h @@ -150,7 +150,7 @@ float MathFloatVectorSum(const std::vector&); #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) // Rounds down. 
0 -> undefined -inline u64 Log2(u64 val) +inline int Log2(u64 val) { #if defined(__GNUC__) return 63 - __builtin_clzll(val); @@ -161,7 +161,7 @@ inline u64 Log2(u64 val) return result; #else - u64 result = -1; + int result = -1; while (val != 0) { val >>= 1; diff --git a/Source/Core/DolphinWX/GameListCtrl.cpp b/Source/Core/DolphinWX/GameListCtrl.cpp index bd420f8156..6029c741a4 100644 --- a/Source/Core/DolphinWX/GameListCtrl.cpp +++ b/Source/Core/DolphinWX/GameListCtrl.cpp @@ -413,12 +413,12 @@ wxString NiceSizeFormat(u64 _size) { const char* const unit_symbols[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"}; - auto const unit = Log2(std::max(_size, 1)) / 10; - auto const unit_size = (1 << (unit * 10)); + const u64 unit = Log2(std::max(_size, 1)) / 10; + const u64 unit_size = (1 << (unit * 10)); // ugly rounding integer math - auto const value = (_size + unit_size / 2) / unit_size; - auto const frac = (_size % unit_size * 10 + unit_size / 2) / unit_size % 10; + const u64 value = (_size + unit_size / 2) / unit_size; + const u64 frac = (_size % unit_size * 10 + unit_size / 2) / unit_size % 10; return StrToWxStr(StringFromFormat("%" PRIu64 ".%" PRIu64 " %s", value, frac, unit_symbols[unit])); } From bd3beeb1843be807629bf9680fe7d27497606801 Mon Sep 17 00:00:00 2001 From: degasus Date: Mon, 24 Feb 2014 16:09:20 +0100 Subject: [PATCH 4/9] TextureConverter: Use Log2() and shifts instead of multiplications/divisions --- .../VideoCommon/TextureConversionShader.cpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index 6b9e1af370..0fca9df552 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -14,6 +14,7 @@ #include "VideoCommon/TextureConversionShader.h" #include "VideoCommon/TextureDecoder.h" #include "VideoCommon/VideoConfig.h" +#include 
"Common/MathUtil.h" #define WRITE p+=sprintf @@ -90,22 +91,22 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) " float2 uv0 = float2(0.0, 0.0);\n" ); - WRITE(p, " uv1.x = uv1.x * %d;\n", samples); + WRITE(p, " uv1.x = uv1.x << %d;\n", Log2(samples)); - WRITE(p, " int yl = uv1.y / %d;\n", blkH); - WRITE(p, " int yb = yl * %d;\n", blkH); + WRITE(p, " int yl = uv1.y >> %d;\n", Log2(blkH)); + WRITE(p, " int yb = yl << %d;\n", Log2(blkH)); WRITE(p, " int yoff = uv1.y - yb;\n"); WRITE(p, " int xp = uv1.x + yoff * position.z;\n"); - WRITE(p, " int xel = xp / %d;\n", samples == 1 ? factor : blkW); - WRITE(p, " int xb = xel / %d;\n", blkH); - WRITE(p, " int xoff = xel - xb * %d;\n", blkH); - WRITE(p, " int xl = uv1.x * %d / %d;\n", factor, blkW); - WRITE(p, " int xib = uv1.x * %d - xl * %d;\n", factor, blkW); - WRITE(p, " int halfxb = xb / %d;\n", factor); + WRITE(p, " int xel = xp >> %d;\n", Log2(samples == 1 ? factor : blkW)); + WRITE(p, " int xb = xel >> %d;\n", Log2(blkH)); + WRITE(p, " int xoff = xel - (xb << %d);\n", Log2(blkH)); + WRITE(p, " int xl = (uv1.x << %d) >> %d;\n", Log2(factor), Log2(blkW)); + WRITE(p, " int xib = (uv1.x << %d) - (xl << %d);\n", Log2(factor), Log2(blkW)); + WRITE(p, " int halfxb = xb >> %d;\n", Log2(factor)); - WRITE(p, " sampleUv.x = xib + halfxb * %d;\n", blkW); + WRITE(p, " sampleUv.x = xib + (halfxb << %d);\n", Log2(blkW)); WRITE(p, " sampleUv.y = yb + xoff;\n"); - WRITE(p, " bool first = xb == (halfxb * 2);\n"); + WRITE(p, " bool first = xb == (halfxb << 1);\n"); } void WriteSampleColor(char*& p, const char* colorComp, const char* dest, int xoffset, API_TYPE ApiType) From 8a4aa8c1f5ff0db2b531405bb087c1f53d2da4be Mon Sep 17 00:00:00 2001 From: degasus Date: Mon, 24 Feb 2014 16:15:44 +0100 Subject: [PATCH 5/9] Rewrite texture tiling implementation inline halfxb So we know which is the first pixel by masking. 
inline xl inline xb a bit inline yl inline uv1.x shift remove likely wrongly guessed ternary operator add pixel layout comment inline xel optimize the shifts a bit inline xb optimize shifts in a second step extract xb rename all variables calculate cache line by position.x Revert 5115b459f40d53044cd7a858f52e6e876e1211b4 "optimize the shifts a bit" It seems I was wrong, the other way is more natural. use x_virtual_position instead of uv1.x for x_offset_in_block This looks more natural and the offset should be masked anyway. substitute factor with cache_lines move 32bit logic into a conditional block --- .../VideoCommon/TextureConversionShader.cpp | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index 0fca9df552..c85c028b93 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -66,8 +66,7 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) int blkW = TexDecoder_GetBlockWidthInTexels(format); int blkH = TexDecoder_GetBlockHeightInTexels(format); int samples = GetEncodedSampleCount(format); - // 32 bit textures (RGBA8 and Z24) are store in 2 cache line increments - int factor = samples == 1 ? 
2 : 1; + if (ApiType == API_OPENGL) { WRITE(p, "#define samp0 samp9\n"); @@ -91,22 +90,21 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) " float2 uv0 = float2(0.0, 0.0);\n" ); - WRITE(p, " uv1.x = uv1.x << %d;\n", Log2(samples)); + WRITE(p, " int y_block_position = uv1.y & ~(%d - 1);\n", blkH); + WRITE(p, " int y_offset_in_block = uv1.y & (%d - 1);\n", blkH); + WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & ~(%d - 1);\n", Log2(blkH), blkW); + if (samples == 1) + { + // 32 bit textures (RGBA8 and Z24) are store in 2 cache line increments + WRITE(p, " bool first = 0 == (x_virtual_position & %d);\n", 8 * samples); // first cache line, used in the encoders + WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); + } + WRITE(p, " int x_offset_in_block = x_virtual_position & (%d - 1);\n", blkW); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & (%d - 1);\n", Log2(blkW), blkH); - WRITE(p, " int yl = uv1.y >> %d;\n", Log2(blkH)); - WRITE(p, " int yb = yl << %d;\n", Log2(blkH)); - WRITE(p, " int yoff = uv1.y - yb;\n"); - WRITE(p, " int xp = uv1.x + yoff * position.z;\n"); - WRITE(p, " int xel = xp >> %d;\n", Log2(samples == 1 ? 
factor : blkW)); - WRITE(p, " int xb = xel >> %d;\n", Log2(blkH)); - WRITE(p, " int xoff = xel - (xb << %d);\n", Log2(blkH)); - WRITE(p, " int xl = (uv1.x << %d) >> %d;\n", Log2(factor), Log2(blkW)); - WRITE(p, " int xib = (uv1.x << %d) - (xl << %d);\n", Log2(factor), Log2(blkW)); - WRITE(p, " int halfxb = xb >> %d;\n", Log2(factor)); - - WRITE(p, " sampleUv.x = xib + (halfxb << %d);\n", Log2(blkW)); - WRITE(p, " sampleUv.y = yb + xoff;\n"); - WRITE(p, " bool first = xb == (halfxb << 1);\n"); + WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); + WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); } void WriteSampleColor(char*& p, const char* colorComp, const char* dest, int xoffset, API_TYPE ApiType) From 11efa88157f02e1e79edf947041cb0c6f2ca6c30 Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 25 Feb 2014 16:06:55 +0100 Subject: [PATCH 6/9] calculate constant values on shader compilation --- Source/Core/VideoCommon/TextureConversionShader.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index c85c028b93..ce7f907bed 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -90,18 +90,18 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) " float2 uv0 = float2(0.0, 0.0);\n" ); - WRITE(p, " int y_block_position = uv1.y & ~(%d - 1);\n", blkH); - WRITE(p, " int y_offset_in_block = uv1.y & (%d - 1);\n", blkH); + WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); + WRITE(p, " int y_offset_in_block = uv1.y & %d;\n", blkH - 1); WRITE(p, " int x_virtual_position = (uv1.x << %d) + y_offset_in_block * position.z;\n", Log2(samples)); - WRITE(p, " int x_block_position = (x_virtual_position >> %d) & ~(%d - 1);\n", Log2(blkH), blkW); + WRITE(p, " int x_block_position = (x_virtual_position >> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); 
if (samples == 1) { // 32 bit textures (RGBA8 and Z24) are store in 2 cache line increments WRITE(p, " bool first = 0 == (x_virtual_position & %d);\n", 8 * samples); // first cache line, used in the encoders WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } - WRITE(p, " int x_offset_in_block = x_virtual_position & (%d - 1);\n", blkW); - WRITE(p, " int y_offset = (x_virtual_position >> %d) & (%d - 1);\n", Log2(blkW), blkH); + WRITE(p, " int x_offset_in_block = x_virtual_position & %d;\n", blkW - 1); + WRITE(p, " int y_offset = (x_virtual_position >> %d) & %d;\n", Log2(blkW), blkH - 1); WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); From 1d0b6a11564644f7fa86faedd164833b3c05f669 Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 25 Feb 2014 22:21:15 +0100 Subject: [PATCH 7/9] Merge duplicate parts of sampler into header --- .../VideoCommon/TextureConversionShader.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Source/Core/VideoCommon/TextureConversionShader.cpp b/Source/Core/VideoCommon/TextureConversionShader.cpp index ce7f907bed..1d2e01b8f2 100644 --- a/Source/Core/VideoCommon/TextureConversionShader.cpp +++ b/Source/Core/VideoCommon/TextureConversionShader.cpp @@ -87,7 +87,6 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, "{\n" " int2 sampleUv;\n" " int2 uv1 = int2(gl_FragCoord.xy);\n" - " float2 uv0 = float2(0.0, 0.0);\n" ); WRITE(p, " int y_block_position = uv1.y & %d;\n", ~(blkH - 1)); @@ -105,19 +104,21 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " sampleUv.x = x_offset_in_block + x_block_position;\n"); WRITE(p, " sampleUv.y = y_block_position + y_offset;\n"); + + WRITE(p, " float2 uv0 = float2(sampleUv);\n"); // sampleUv is the sample position in (int)gx_coords + WRITE(p, " uv0 += float2(0.5, 0.5);\n"); // move to center of pixel + WRITE(p, " uv0 *= float(position.w);\n"); // 
scale by two if needed (this will move to pixels border to filter linear) + WRITE(p, " uv0 += float2(position.xy);\n"); // move to copyed rect + WRITE(p, " uv0 /= float2(%d, %d);\n", EFB_WIDTH, EFB_HEIGHT); // normlize to [0:1] + WRITE(p, " uv0.y = 1.0-uv0.y;\n"); // ogl foo (disable this line for d3d) + + WRITE(p, " float sample_offset = position.w / float(%d);\n", EFB_WIDTH); } void WriteSampleColor(char*& p, const char* colorComp, const char* dest, int xoffset, API_TYPE ApiType) { - WRITE(p, // sampleUv is the sample position in (int)gx_coords - "uv0 = float2(sampleUv + int2(%d, 0));\n" // pixel offset (if more than one pixel is samped) - "uv0 += float2(0.5, 0.5);\n" // move to center of pixel - "uv0 *= float(position.w);\n" // scale by two if needed (this will move to pixels border to filter linear) - "uv0 += float2(position.xy);\n" // move to copyed rect - "uv0 /= float2(%d, %d);\n" // normlize to [0:1] - "uv0.y = 1.0-uv0.y;\n" // ogl foo (disable this line for d3d) - "%s = texture(samp0, uv0).%s;\n", - xoffset, EFB_WIDTH, EFB_HEIGHT, dest, colorComp + WRITE(p, " %s = texture(samp0, uv0 + float2(%d, 0) * sample_offset).%s;\n", + dest, xoffset, colorComp ); } From aaaa5af0b23b25b1eab3cf37ea6f0cb70c125559 Mon Sep 17 00:00:00 2001 From: degasus Date: Tue, 25 Feb 2014 15:52:22 +0100 Subject: [PATCH 8/9] remove (ATTR|VARY)(IN|OUT) macros --- .../VideoBackends/OGL/ProgramShaderCache.cpp | 10 ++----- Source/Core/VideoBackends/OGL/RasterFont.cpp | 8 +++--- Source/Core/VideoBackends/OGL/Render.cpp | 8 +++--- .../Core/VideoBackends/OGL/TextureCache.cpp | 6 ++--- .../VideoBackends/OGL/TextureConverter.cpp | 6 ++--- Source/Core/VideoCommon/PixelShaderGen.cpp | 10 +++---- Source/Core/VideoCommon/VertexShaderGen.cpp | 26 +++++++++---------- 7 files changed, 34 insertions(+), 40 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 79826860a7..954755cb95 100644 --- 
a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -452,12 +452,6 @@ void ProgramShaderCache::CreateHeader ( void ) // Precision defines for GLSLES3 "%s\n" - "\n"// A few required defines and ones that will make our lives a lot easier - "#define ATTRIN in\n" - "#define ATTROUT out\n" - "#define VARYIN %s\n" - "#define VARYOUT %s\n" - // Silly differences "#define float2 vec2\n" "#define float3 vec3\n" @@ -472,6 +466,7 @@ void ProgramShaderCache::CreateHeader ( void ) // Terrible hack, look at DriverDetails.h "%s\n" + "%s\n" , v==GLSLES3 ? "#version 300 es" : v==GLSL_130 ? "#version 130" : v==GLSL_140 ? "#version 140" : "#version 150" , v Date: Wed, 26 Feb 2014 12:48:52 +0100 Subject: [PATCH 9/9] comment fixes --- .../Core/VideoBackends/OGL/ProgramShaderCache.cpp | 6 +++--- Source/Core/VideoCommon/TextureConversionShader.cpp | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp index 954755cb95..0de9034f51 100644 --- a/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp +++ b/Source/Core/VideoBackends/OGL/ProgramShaderCache.cpp @@ -464,9 +464,9 @@ void ProgramShaderCache::CreateHeader ( void ) "#define frac fract\n" "#define lerp mix\n" - // Terrible hack, look at DriverDetails.h - "%s\n" - "%s\n" + // Terrible hacks, look at DriverDetails.h + "%s\n" // replace textureSize as constant + "%s\n" // wipe out all centroid usages , v==GLSLES3 ? "#version 300 es" : v==GLSL_130 ? "#version 130" : v==GLSL_140 ? 
"#version 140" : "#version 150" , v> %d) & %d;\n", Log2(blkH), ~(blkW - 1)); if (samples == 1) { - // 32 bit textures (RGBA8 and Z24) are store in 2 cache line increments + // 32 bit textures (RGBA8 and Z24) are stored in 2 cache line increments WRITE(p, " bool first = 0 == (x_virtual_position & %d);\n", 8 * samples); // first cache line, used in the encoders WRITE(p, " x_virtual_position = x_virtual_position << 1;\n"); } @@ -107,10 +107,13 @@ void WriteSwizzler(char*& p, u32 format, API_TYPE ApiType) WRITE(p, " float2 uv0 = float2(sampleUv);\n"); // sampleUv is the sample position in (int)gx_coords WRITE(p, " uv0 += float2(0.5, 0.5);\n"); // move to center of pixel - WRITE(p, " uv0 *= float(position.w);\n"); // scale by two if needed (this will move to pixels border to filter linear) - WRITE(p, " uv0 += float2(position.xy);\n"); // move to copyed rect - WRITE(p, " uv0 /= float2(%d, %d);\n", EFB_WIDTH, EFB_HEIGHT); // normlize to [0:1] - WRITE(p, " uv0.y = 1.0-uv0.y;\n"); // ogl foo (disable this line for d3d) + WRITE(p, " uv0 *= float(position.w);\n"); // scale by two if needed (also move to pixel borders so that linear filtering will average adjacent pixel) + WRITE(p, " uv0 += float2(position.xy);\n"); // move to copied rect + WRITE(p, " uv0 /= float2(%d, %d);\n", EFB_WIDTH, EFB_HEIGHT); // normalize to [0:1] + if (ApiType == API_OPENGL) // ogl has to flip up and down + { + WRITE(p, " uv0.y = 1.0-uv0.y;\n"); + } WRITE(p, " float sample_offset = position.w / float(%d);\n", EFB_WIDTH); }