diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp index c9091db2e2..beb2e6f017 100644 --- a/Source/Core/Core/Config/GraphicsSettings.cpp +++ b/Source/Core/Core/Config/GraphicsSettings.cpp @@ -87,6 +87,11 @@ const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE{ const Info GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{ {System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false}; +const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS{ + {System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto}; +const Info GFX_MTL_USE_PRESENT_DRAWABLE{{System::GFX, "Settings", "MTLUsePresentDrawable"}, + false}; + const Info GFX_SW_DUMP_OBJECTS{{System::GFX, "Settings", "SWDumpObjects"}, false}; const Info GFX_SW_DUMP_TEV_STAGES{{System::GFX, "Settings", "SWDumpTevStages"}, false}; const Info GFX_SW_DUMP_TEV_TEX_FETCHES{{System::GFX, "Settings", "SWDumpTevTexFetches"}, diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h index 3497aa6281..0eed7c88e2 100644 --- a/Source/Core/Core/Config/GraphicsSettings.h +++ b/Source/Core/Core/Config/GraphicsSettings.h @@ -11,6 +11,7 @@ enum class AspectMode : int; enum class ShaderCompilationMode : int; enum class StereoMode : int; enum class FreelookControlType : int; +enum class TriState : int; namespace Config { @@ -75,6 +76,9 @@ extern const Info GFX_SHADER_PRECOMPILER_THREADS; extern const Info GFX_SAVE_TEXTURE_CACHE_TO_STATE; extern const Info GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION; +extern const Info GFX_MTL_MANUALLY_UPLOAD_BUFFERS; +extern const Info GFX_MTL_USE_PRESENT_DRAWABLE; + extern const Info GFX_SW_DUMP_OBJECTS; extern const Info GFX_SW_DUMP_TEV_STAGES; extern const Info GFX_SW_DUMP_TEV_TEX_FETCHES; diff --git a/Source/Core/VideoBackends/Metal/CMakeLists.txt b/Source/Core/VideoBackends/Metal/CMakeLists.txt index 698ed2678b..7be576f52a 100644 --- a/Source/Core/VideoBackends/Metal/CMakeLists.txt +++ 
b/Source/Core/VideoBackends/Metal/CMakeLists.txt @@ -39,3 +39,5 @@ PRIVATE ${METAL_LIBRARY} ${QUARTZCORE_LIBRARY} ) + +target_compile_options(videometal PRIVATE -fno-objc-arc) diff --git a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm index 4c1a6ebd48..3ff48a22c2 100644 --- a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm +++ b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm @@ -36,6 +36,7 @@ std::vector Metal::BoundingBox::Read(u32 index, u32 length) { g_state_tracker->EndRenderPass(); g_state_tracker->FlushEncoders(); + g_state_tracker->NotifyOfCPUGPUSync(); g_state_tracker->WaitForFlushedEncoders(); return std::vector(m_cpu_buffer_ptr + index, m_cpu_buffer_ptr + index + length); } diff --git a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm index 42139e63bf..cd65b37b58 100644 --- a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm +++ b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm @@ -56,6 +56,7 @@ void Metal::PerfQuery::FlushResults() // There's a possibility that some active performance queries are unflushed g_state_tracker->FlushEncoders(); + g_state_tracker->NotifyOfCPUGPUSync(); std::unique_lock lock(m_results_mtx); while (!IsFlushed()) diff --git a/Source/Core/VideoBackends/Metal/MTLRenderer.mm b/Source/Core/VideoBackends/Metal/MTLRenderer.mm index 3639b01241..7d6c583599 100644 --- a/Source/Core/VideoBackends/Metal/MTLRenderer.mm +++ b/Source/Core/VideoBackends/Metal/MTLRenderer.mm @@ -20,6 +20,7 @@ Metal::Renderer::Renderer(MRCOwned layer, int width, int height, m_layer(std::move(layer)) { UpdateActiveConfig(); + [m_layer setDisplaySyncEnabled:g_ActiveConfig.bVSyncActive]; } Metal::Renderer::~Renderer() = default; @@ -454,8 +455,15 @@ void Metal::Renderer::PresentBackbuffer() g_state_tracker->EndRenderPass(); if (m_drawable) { - [g_state_tracker->GetRenderCmdBuf() - addScheduledHandler:[drawable = std::move(m_drawable)](id) { [drawable present]; 
}]; + // PresentDrawable refuses to allow Dolphin to present faster than the display's refresh rate + // when windowed (or fullscreen with vsync enabled, but that's more understandable). + // On the other hand, it helps Xcode's GPU captures start and stop on frame boundaries + // which is convenient. Put it here as a default-off config, which we can override in Xcode. + if (g_ActiveConfig.bUsePresentDrawable) + [g_state_tracker->GetRenderCmdBuf() presentDrawable:m_drawable]; + else + [g_state_tracker->GetRenderCmdBuf() + addScheduledHandler:[drawable = std::move(m_drawable)](id) { [drawable present]; }]; m_bb_texture->SetMTLTexture(nullptr); m_drawable = nullptr; } diff --git a/Source/Core/VideoBackends/Metal/MTLStateTracker.h b/Source/Core/VideoBackends/Metal/MTLStateTracker.h index 2ec2e2ae27..3e0cb38afa 100644 --- a/Source/Core/VideoBackends/Metal/MTLStateTracker.h +++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.h @@ -34,7 +34,6 @@ public: Uniform, Vertex, Index, - TextureData, Texels, Last = Texels }; @@ -75,6 +74,14 @@ public: return m_current_draw != 1 + m_last_finished_draw.load(std::memory_order_acquire); } void ReloadSamplers(); + void NotifyOfCPUGPUSync() + { + if (!g_features.manual_buffer_upload || !m_manual_buffer_upload) + return; + if (m_upload_cmdbuf || m_current_render_cmdbuf) + return; + SetManualBufferUpload(false); + } void SetPipeline(const Pipeline* pipe); void SetPipeline(const ComputePipeline* pipe); @@ -106,6 +113,7 @@ public: { return (amt + static_cast(align)) & ~static_cast(align); } + Map AllocateForTextureUpload(size_t amt); Map Allocate(UploadBuffer buffer_idx, size_t amt, AlignMask align) { Preallocate(buffer_idx, amt); @@ -119,6 +127,7 @@ public: static_cast(align)) == 0); return CommitPreallocation(buffer_idx, Align(amt, align)); } + id GetUploadEncoder(); id GetTextureUploadEncoder(); id GetRenderCmdBuf(); @@ -142,18 +151,28 @@ private: void Reset(size_t new_size); }; - struct Buffer + struct CPUBuffer { UsageTracker usage; 
MRCOwned> mtlbuffer; void* buffer = nullptr; }; + struct BufferPair + { + UsageTracker usage; + MRCOwned> cpubuffer; + MRCOwned> gpubuffer; + void* buffer = nullptr; + size_t last_upload = 0; + }; + struct Backref; struct PerfQueryTracker; std::shared_ptr m_backref; std::vector> m_perf_query_tracker_cache; + MRCOwned> m_fence; MRCOwned> m_upload_cmdbuf; MRCOwned> m_upload_encoder; MRCOwned> m_texture_upload_cmdbuf; @@ -165,7 +184,8 @@ private: MRCOwned m_render_pass_desc[3]; MRCOwned m_resolve_pass_desc; Framebuffer* m_current_framebuffer; - Buffer m_upload_buffers[static_cast(UploadBuffer::Last) + 1]; + CPUBuffer m_texture_upload_buffer; + BufferPair m_upload_buffers[static_cast(UploadBuffer::Last) + 1]; u64 m_current_draw = 1; std::atomic m_last_finished_draw{0}; @@ -250,9 +270,12 @@ private: } m_state; u32 m_perf_query_tracker_counter = 0; + bool m_manual_buffer_upload = false; + void SetManualBufferUpload(bool enable); std::shared_ptr NewPerfQueryTracker(); void SetSamplerForce(u32 idx, const SamplerState& sampler); + void Sync(BufferPair& buffer); Map CommitPreallocation(UploadBuffer buffer_idx, size_t actual_amt); void CheckViewport(); void CheckScissor(); diff --git a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm index e25e33fe02..7cfb37e751 100644 --- a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm +++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm @@ -45,12 +45,11 @@ static NSString* GetName(Metal::StateTracker::UploadBuffer buffer) // clang-format off switch (buffer) { - case Metal::StateTracker::UploadBuffer::TextureData: return @"Texture Data"; - case Metal::StateTracker::UploadBuffer::Texels: return @"Texels"; - case Metal::StateTracker::UploadBuffer::Vertex: return @"Vertices"; - case Metal::StateTracker::UploadBuffer::Index: return @"Indices"; - case Metal::StateTracker::UploadBuffer::Uniform: return @"Uniforms"; - case Metal::StateTracker::UploadBuffer::Other: return @"Generic 
Upload"; + case Metal::StateTracker::UploadBuffer::Texels: return @"Texels"; + case Metal::StateTracker::UploadBuffer::Vertex: return @"Vertices"; + case Metal::StateTracker::UploadBuffer::Index: return @"Indices"; + case Metal::StateTracker::UploadBuffer::Uniform: return @"Uniforms"; + case Metal::StateTracker::UploadBuffer::Other: return @"Generic Upload"; } // clang-format on } @@ -105,6 +104,7 @@ void Metal::StateTracker::UsageTracker::Reset(size_t new_size) Metal::StateTracker::StateTracker() : m_backref(std::make_shared(this)) { m_flags.should_apply_label = true; + m_fence = MRCTransfer([g_device newFence]); for (MRCOwned& rpdesc : m_render_pass_desc) { rpdesc = MRCTransfer([MTLRenderPassDescriptor new]); @@ -141,9 +141,10 @@ Metal::StateTracker::~StateTracker() // MARK: BufferPair Ops -std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt) +Metal::StateTracker::Map Metal::StateTracker::AllocateForTextureUpload(size_t amt) { - Buffer& buffer = m_upload_buffers[static_cast(buffer_idx)]; + amt = (amt + 15) & ~15ull; + CPUBuffer& buffer = m_texture_upload_buffer; u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire); bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt); if (__builtin_expect(needs_new, false)) @@ -155,11 +156,61 @@ std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_id MTLResourceOptions options = MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined; buffer.mtlbuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]); - [buffer.mtlbuffer setLabel:GetName(buffer_idx)]; + [buffer.mtlbuffer setLabel:@"Texture Upload Buffer"]; ASSERT_MSG(VIDEO, buffer.mtlbuffer, "Failed to allocate MTLBuffer (out of memory?)"); buffer.buffer = [buffer.mtlbuffer contents]; buffer.usage.Reset(newsize); } + + size_t pos = buffer.usage.Allocate(m_current_draw, amt); + + Map ret = {buffer.mtlbuffer, pos, reinterpret_cast(buffer.buffer) + pos}; + DEBUG_ASSERT(pos <= 
buffer.usage.Size() && + "Previous code should have guaranteed there was enough space"); + return ret; +} + +std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt) +{ + BufferPair& buffer = m_upload_buffers[static_cast(buffer_idx)]; + u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire); + size_t base_pos = buffer.usage.Pos(); + bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt); + bool needs_upload = needs_new || buffer.usage.Pos() == 0; + if (m_manual_buffer_upload && needs_upload) + { + if (base_pos != buffer.last_upload) + { + id encoder = GetUploadEncoder(); + [encoder copyFromBuffer:buffer.cpubuffer + sourceOffset:buffer.last_upload + toBuffer:buffer.gpubuffer + destinationOffset:buffer.last_upload + size:base_pos - buffer.last_upload]; + } + buffer.last_upload = 0; + } + if (__builtin_expect(needs_new, false)) + { + // Orphan buffer + size_t newsize = std::max(buffer.usage.Size() * 2, 4096); + while (newsize < amt) + newsize *= 2; + MTLResourceOptions options = + MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined; + buffer.cpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]); + [buffer.cpubuffer setLabel:GetName(buffer_idx)]; + ASSERT_MSG(VIDEO, buffer.cpubuffer, "Failed to allocate MTLBuffer (out of memory?)"); + buffer.buffer = [buffer.cpubuffer contents]; + buffer.usage.Reset(newsize); + if (g_features.manual_buffer_upload) + { + options = MTLResourceStorageModePrivate | MTLResourceHazardTrackingModeUntracked; + buffer.gpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]); + [buffer.gpubuffer setLabel:GetName(buffer_idx)]; + ASSERT_MSG(VIDEO, buffer.gpubuffer, "Failed to allocate MTLBuffer (out of memory?)"); + } + } size_t pos = buffer.usage.Pos(); return std::make_pair(reinterpret_cast(buffer.buffer) + pos, pos); } @@ -167,17 +218,46 @@ std::pair Metal::StateTracker::Preallocate(UploadBuffer buffer_id Metal::StateTracker::Map 
Metal::StateTracker::CommitPreallocation(UploadBuffer buffer_idx, size_t amt) { - Buffer& buffer = m_upload_buffers[static_cast(buffer_idx)]; + BufferPair& buffer = m_upload_buffers[static_cast(buffer_idx)]; size_t pos = buffer.usage.Allocate(m_current_draw, amt); Map ret = {nil, pos, reinterpret_cast(buffer.buffer) + pos}; - ret.gpu_buffer = buffer.mtlbuffer; + ret.gpu_buffer = m_manual_buffer_upload ? buffer.gpubuffer : buffer.cpubuffer; DEBUG_ASSERT(pos <= buffer.usage.Size() && "Previous code should have guaranteed there was enough space"); return ret; } +void Metal::StateTracker::Sync(BufferPair& buffer) +{ + if (!m_manual_buffer_upload || buffer.usage.Pos() == buffer.last_upload) + return; + + id encoder = GetUploadEncoder(); + [encoder copyFromBuffer:buffer.cpubuffer + sourceOffset:buffer.last_upload + toBuffer:buffer.gpubuffer + destinationOffset:buffer.last_upload + size:buffer.usage.Pos() - buffer.last_upload]; + buffer.last_upload = buffer.usage.Pos(); +} + // MARK: Render Pass / Encoder Management +id Metal::StateTracker::GetUploadEncoder() +{ + if (!m_upload_cmdbuf) + { + @autoreleasepool + { + m_upload_cmdbuf = MRCRetain([g_queue commandBuffer]); + [m_upload_cmdbuf setLabel:@"Vertex Upload"]; + m_upload_encoder = MRCRetain([m_upload_cmdbuf blitCommandEncoder]); + [m_upload_encoder setLabel:@"Vertex Upload"]; + } + } + return m_upload_encoder; +} + id Metal::StateTracker::GetTextureUploadEncoder() { if (!m_texture_upload_cmdbuf) @@ -270,6 +350,8 @@ void Metal::StateTracker::BeginRenderPass(MTLRenderPassDescriptor* descriptor) MRCRetain([GetRenderCmdBuf() renderCommandEncoderWithDescriptor:descriptor]); if (m_current_perf_query) [descriptor setVisibilityResultBuffer:nil]; + if (m_manual_buffer_upload) + [m_current_render_encoder waitForFence:m_fence beforeStages:MTLRenderStageVertex]; AbstractTexture* attachment = m_current_framebuffer->GetColorAttachment(); if (!attachment) attachment = m_current_framebuffer->GetDepthAttachment(); @@ -299,6 +381,8 @@ 
void Metal::StateTracker::BeginComputePass() EndRenderPass(); m_current_compute_encoder = MRCRetain([GetRenderCmdBuf() computeCommandEncoder]); [m_current_compute_encoder setLabel:@"Compute"]; + if (m_manual_buffer_upload) + [m_current_compute_encoder waitForFence:m_fence]; m_flags.NewEncoder(); m_dirty_samplers = 0xff; m_dirty_textures = 0xff; @@ -326,6 +410,20 @@ void Metal::StateTracker::FlushEncoders() if (!m_current_render_cmdbuf) return; EndRenderPass(); + for (int i = 0; i <= static_cast(UploadBuffer::Last); ++i) + Sync(m_upload_buffers[i]); + if (!m_manual_buffer_upload) + { + ASSERT(!m_upload_cmdbuf && "Should never be used!"); + } + else if (m_upload_cmdbuf) + { + [m_upload_encoder updateFence:m_fence]; + [m_upload_encoder endEncoding]; + [m_upload_cmdbuf commit]; + m_upload_encoder = nullptr; + m_upload_cmdbuf = nullptr; + } if (m_texture_upload_cmdbuf) { [m_texture_upload_encoder endEncoding]; @@ -355,6 +453,8 @@ void Metal::StateTracker::FlushEncoders() m_last_render_cmdbuf = std::move(m_current_render_cmdbuf); m_current_render_cmdbuf = nullptr; m_current_draw++; + if (g_features.manual_buffer_upload && !m_manual_buffer_upload) + SetManualBufferUpload(true); } void Metal::StateTracker::WaitForFlushedEncoders() @@ -368,6 +468,23 @@ void Metal::StateTracker::ReloadSamplers() m_state.samplers[i] = g_object_cache->GetSampler(m_state.sampler_states[i]); } +void Metal::StateTracker::SetManualBufferUpload(bool enabled) +{ + // When a game does something that needs CPU-GPU sync (e.g. bbox, texture download, etc), + // the next command buffer will be done with manual buffer upload disabled, + // since overlapping the upload with the previous draw won't be possible (due to sync). + // This greatly improves performance in heavy bbox games like Super Paper Mario. 
+ m_manual_buffer_upload = enabled; + if (enabled) + { + for (BufferPair& buffer : m_upload_buffers) + { + // Update sync positions, since Sync doesn't do it when manual buffer upload is off + buffer.last_upload = buffer.usage.Pos(); + } + } +} + // MARK: State Setters void Metal::StateTracker::SetPipeline(const Pipeline* pipe) diff --git a/Source/Core/VideoBackends/Metal/MTLTexture.mm b/Source/Core/VideoBackends/Metal/MTLTexture.mm index fd0358e10e..52a857f5d8 100644 --- a/Source/Core/VideoBackends/Metal/MTLTexture.mm +++ b/Source/Core/VideoBackends/Metal/MTLTexture.mm @@ -6,6 +6,7 @@ #include "Common/Align.h" #include "Common/Assert.h" +#include "VideoBackends/Metal/MTLRenderer.h" #include "VideoBackends/Metal/MTLStateTracker.h" Metal::Texture::Texture(MRCOwned> tex, const TextureConfig& config) @@ -50,6 +51,10 @@ void Metal::Texture::ResolveFromTexture(const AbstractTexture* src, g_state_tracker->ResolveTexture(src_tex, m_tex, layer, level); } +// Use a temporary buffer for large texture loads +// (Since the main upload buffer doesn't shrink after it grows) +static constexpr u32 STAGING_TEXTURE_UPLOAD_THRESHOLD = 1024 * 1024 * 4; + void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length, // const u8* buffer, size_t buffer_size) { @@ -59,8 +64,23 @@ void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length, // const u32 num_rows = Common::AlignUp(height, block_size) / block_size; const u32 source_pitch = CalculateStrideForFormat(m_config.format, row_length); const u32 upload_size = source_pitch * num_rows; - StateTracker::Map map = g_state_tracker->Allocate(StateTracker::UploadBuffer::TextureData, - upload_size, StateTracker::AlignMask::Other); + MRCOwned> tmp_buffer; + StateTracker::Map map; + if (upload_size > STAGING_TEXTURE_UPLOAD_THRESHOLD) + { + tmp_buffer = MRCTransfer([g_device + newBufferWithLength:upload_size + options:MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined]); + [tmp_buffer 
setLabel:@"Temp Texture Upload"]; + map.gpu_buffer = tmp_buffer; + map.gpu_offset = 0; + map.cpu_buffer = [tmp_buffer contents]; + } + else + { + map = g_state_tracker->AllocateForTextureUpload(upload_size); + } + memcpy(map.cpu_buffer, buffer, upload_size); id encoder = g_state_tracker->GetTextureUploadEncoder(); [encoder copyFromBuffer:map.gpu_buffer @@ -163,6 +183,7 @@ void Metal::StagingTexture::Flush() { // Flush while we wait, since who knows how long we'll be sitting here g_state_tracker->FlushEncoders(); + g_state_tracker->NotifyOfCPUGPUSync(); [m_wait_buffer waitUntilCompleted]; } m_wait_buffer = nullptr; diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.h b/Source/Core/VideoBackends/Metal/MTLUtil.h index dfedecd7c6..28fb57cb0b 100644 --- a/Source/Core/VideoBackends/Metal/MTLUtil.h +++ b/Source/Core/VideoBackends/Metal/MTLUtil.h @@ -16,6 +16,10 @@ namespace Metal { struct DeviceFeatures { + /// Manually copy buffer data to the GPU (instead of letting the GPU read from system memory) + /// On discrete GPUs, this tends to be faster if the copy is able to operate in parallel with a + /// previous render. This is the case unless a game uses features like bbox or texture downloads. 
+ bool manual_buffer_upload; bool subgroup_ops; }; diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.mm b/Source/Core/VideoBackends/Metal/MTLUtil.mm index aa379c0851..1baf532943 100644 --- a/Source/Core/VideoBackends/Metal/MTLUtil.mm +++ b/Source/Core/VideoBackends/Metal/MTLUtil.mm @@ -217,6 +217,27 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id config->backend_info.AAModes.push_back(i); } + switch (config->iManuallyUploadBuffers) + { + case TriState::Off: + g_features.manual_buffer_upload = false; + break; + case TriState::On: + g_features.manual_buffer_upload = true; + break; + case TriState::Auto: +#if TARGET_OS_OSX + g_features.manual_buffer_upload = false; + if (@available(macOS 10.15, *)) + if (![device hasUnifiedMemory]) + g_features.manual_buffer_upload = true; +#else + // All iOS devices have unified memory + g_features.manual_buffer_upload = false; +#endif + break; + } + g_features.subgroup_ops = false; if (@available(macOS 10.15, iOS 13, *)) { @@ -225,7 +246,7 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id [device supportsFamily:MTLGPUFamilyMac2] || [device supportsFamily:MTLGPUFamilyApple6]; config->backend_info.bSupportsFramebufferFetch = [device supportsFamily:MTLGPUFamilyApple1]; } - if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID)) + if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS)) g_features.subgroup_ops = false; #if TARGET_OS_OSX if (@available(macOS 11, *)) @@ -378,6 +399,12 @@ static const std::string_view MSL_HEADER = // These are usually when the compiler doesn't think a switch is exhaustive "#pragma clang diagnostic ignored \"-Wreturn-type\"\n"; +static constexpr std::pair MSL_FIXUPS[] = { + // Force-unroll the lighting loop in ubershaders, which greatly reduces register pressure on AMD + {"for (uint chan = 0u; chan < 2u; chan++)", + "_Pragma(\"unroll\") for (uint chan = 0u; chan < 2u; chan++)"}, +}; + static constexpr 
spirv_cross::MSLResourceBinding MakeResourceBinding(spv::ExecutionModel stage, u32 set, u32 binding, // u32 msl_buffer, u32 msl_texture, u32 msl_sampler) @@ -474,7 +501,27 @@ std::optional Metal::Util::TranslateShaderToMSL(ShaderStage stage, for (auto& binding : resource_bindings) compiler.add_msl_resource_binding(binding); - std::string msl(MSL_HEADER); - msl += compiler.compile(); - return msl; + std::string output(MSL_HEADER); + std::string compiled = compiler.compile(); + std::string_view remaining = compiled; + while (!remaining.empty()) + { + // Apply fixups + std::string_view piece = remaining; + std::string_view fixup_piece = {}; + size_t next = piece.size(); + for (const auto& fixup : MSL_FIXUPS) + { + size_t found = piece.find(fixup.first); + if (found == std::string_view::npos) + continue; + piece = piece.substr(0, found); + fixup_piece = fixup.second; + next = found + fixup.first.size(); + } + output += piece; + output += fixup_piece; + remaining = remaining.substr(next); + } + return output; } diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp index 8fce877092..3275cb9417 100644 --- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp +++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp @@ -918,7 +918,7 @@ void VulkanContext::PopulateShaderSubgroupSupport() m_supports_shader_subgroup_operations = (subgroup_properties.supportedOperations & required_operations) == required_operations && subgroup_properties.supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT && - !DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID); + !DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS); } bool VulkanContext::SupportsExclusiveFullscreen(const WindowSystemInfo& wsi, VkSurfaceKHR surface) diff --git a/Source/Core/VideoCommon/DriverDetails.cpp b/Source/Core/VideoCommon/DriverDetails.cpp index 09fef47a4f..ffd5fa48d7 100644 --- a/Source/Core/VideoCommon/DriverDetails.cpp +++ 
b/Source/Core/VideoCommon/DriverDetails.cpp @@ -132,10 +132,14 @@ constexpr BugInfo m_known_bugs[] = { -1.0, -1.0, true}, {API_VULKAN, OS_ALL, VENDOR_ARM, DRIVER_ARM, Family::UNKNOWN, BUG_BROKEN_VECTOR_BITWISE_AND, -1.0, -1.0, true}, - {API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN, - BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true}, - {API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN, - BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true}, + {API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, + -1.0, -1.0, true}, + {API_VULKAN, OS_OSX, VENDOR_INTEL, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, + -1.0, -1.0, true}, + {API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0, + -1.0, true}, + {API_METAL, OS_OSX, VENDOR_INTEL, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0, + -1.0, true}, {API_OPENGL, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN, BUG_BROKEN_MULTITHREADED_SHADER_PRECOMPILATION, -1.0, -1.0, true}, {API_VULKAN, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN, diff --git a/Source/Core/VideoCommon/DriverDetails.h b/Source/Core/VideoCommon/DriverDetails.h index 02827c35d5..3e59e576c6 100644 --- a/Source/Core/VideoCommon/DriverDetails.h +++ b/Source/Core/VideoCommon/DriverDetails.h @@ -306,10 +306,15 @@ enum Bug BUG_BROKEN_VECTOR_BITWISE_AND, // BUG: Accessing gl_SubgroupInvocationID causes the Metal shader compiler to crash. 
- // Affected devices: AMD (macOS) + // Affected devices: AMD (older macOS) + // BUG: gl_HelperInvocation always returns true, even for non-helper invocations + // Affected devices: AMD (newer macOS) + // BUG: Using subgroupMax in a shader that can discard results in garbage data + // (For some reason, this only happens at 4x+ IR on Metal, but 2x+ IR on MoltenVK) + // Affected devices: Intel (macOS) // Started version: -1 // Ended version: -1 - BUG_BROKEN_SUBGROUP_INVOCATION_ID, + BUG_BROKEN_SUBGROUP_OPS, // BUG: Multi-threaded shader pre-compilation sometimes crashes // Used primarily in Videoconfig.cpp's GetNumAutoShaderPreCompilerThreads() diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index 91df848c94..760e185304 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -55,6 +55,8 @@ void VideoConfig::Refresh() bVSync = Config::Get(Config::GFX_VSYNC); iAdapter = Config::Get(Config::GFX_ADAPTER); + iManuallyUploadBuffers = Config::Get(Config::GFX_MTL_MANUALLY_UPLOAD_BUFFERS); + bUsePresentDrawable = Config::Get(Config::GFX_MTL_USE_PRESENT_DRAWABLE); bWidescreenHack = Config::Get(Config::GFX_WIDESCREEN_HACK); aspect_mode = Config::Get(Config::GFX_ASPECT_RATIO); diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index d0863a9d84..8b4cc40657 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -45,6 +45,13 @@ enum class ShaderCompilationMode : int AsynchronousSkipRendering }; +enum class TriState : int +{ + Off, + On, + Auto +}; + // NEVER inherit from this class. struct VideoConfig final { @@ -149,6 +156,10 @@ struct VideoConfig final // D3D only config, mostly to be merged into the above int iAdapter = 0; + // Metal only config + TriState iManuallyUploadBuffers = TriState::Auto; + bool bUsePresentDrawable = false; + // Enable API validation layers, currently only supported with Vulkan. 
bool bEnableValidationLayer = false;