diff --git a/Source/Core/Core/Config/GraphicsSettings.cpp b/Source/Core/Core/Config/GraphicsSettings.cpp
index c9091db2e2..beb2e6f017 100644
--- a/Source/Core/Core/Config/GraphicsSettings.cpp
+++ b/Source/Core/Core/Config/GraphicsSettings.cpp
@@ -87,6 +87,11 @@ const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE{
 const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION{
     {System::GFX, "Settings", "PreferVSForLinePointExpansion"}, false};
 
+const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS{
+    {System::GFX, "Settings", "ManuallyUploadBuffers"}, TriState::Auto};  // Metal only
+const Info<bool> GFX_MTL_USE_PRESENT_DRAWABLE{{System::GFX, "Settings", "MTLUsePresentDrawable"},
+                                              false};  // Metal only; off by default
+
 const Info<bool> GFX_SW_DUMP_OBJECTS{{System::GFX, "Settings", "SWDumpObjects"}, false};
 const Info<bool> GFX_SW_DUMP_TEV_STAGES{{System::GFX, "Settings", "SWDumpTevStages"}, false};
 const Info<bool> GFX_SW_DUMP_TEV_TEX_FETCHES{{System::GFX, "Settings", "SWDumpTevTexFetches"},
diff --git a/Source/Core/Core/Config/GraphicsSettings.h b/Source/Core/Core/Config/GraphicsSettings.h
index 3497aa6281..0eed7c88e2 100644
--- a/Source/Core/Core/Config/GraphicsSettings.h
+++ b/Source/Core/Core/Config/GraphicsSettings.h
@@ -11,6 +11,7 @@ enum class AspectMode : int;
 enum class ShaderCompilationMode : int;
 enum class StereoMode : int;
 enum class FreelookControlType : int;
+enum class TriState : int;
 
 namespace Config
 {
@@ -75,6 +76,9 @@ extern const Info<int> GFX_SHADER_PRECOMPILER_THREADS;
 extern const Info<bool> GFX_SAVE_TEXTURE_CACHE_TO_STATE;
 extern const Info<bool> GFX_PREFER_VS_FOR_LINE_POINT_EXPANSION;
 
+extern const Info<TriState> GFX_MTL_MANUALLY_UPLOAD_BUFFERS;
+extern const Info<bool> GFX_MTL_USE_PRESENT_DRAWABLE;
+
 extern const Info<bool> GFX_SW_DUMP_OBJECTS;
 extern const Info<bool> GFX_SW_DUMP_TEV_STAGES;
 extern const Info<bool> GFX_SW_DUMP_TEV_TEX_FETCHES;
diff --git a/Source/Core/VideoBackends/Metal/CMakeLists.txt b/Source/Core/VideoBackends/Metal/CMakeLists.txt
index 698ed2678b..7be576f52a 100644
--- a/Source/Core/VideoBackends/Metal/CMakeLists.txt
+++ b/Source/Core/VideoBackends/Metal/CMakeLists.txt
@@ -39,3 +39,5 @@ PRIVATE
   ${METAL_LIBRARY}
   ${QUARTZCORE_LIBRARY}
 )
+
+target_compile_options(videometal PRIVATE -fno-objc-arc)
diff --git a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm
index 4c1a6ebd48..3ff48a22c2 100644
--- a/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm
+++ b/Source/Core/VideoBackends/Metal/MTLBoundingBox.mm
@@ -36,6 +36,7 @@ std::vector<BBoxType> Metal::BoundingBox::Read(u32 index, u32 length)
   {
     g_state_tracker->EndRenderPass();
     g_state_tracker->FlushEncoders();
+    g_state_tracker->NotifyOfCPUGPUSync();
     g_state_tracker->WaitForFlushedEncoders();
     return std::vector<BBoxType>(m_cpu_buffer_ptr + index, m_cpu_buffer_ptr + index + length);
   }
diff --git a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm
index 42139e63bf..cd65b37b58 100644
--- a/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm
+++ b/Source/Core/VideoBackends/Metal/MTLPerfQuery.mm
@@ -56,6 +56,7 @@ void Metal::PerfQuery::FlushResults()
 
   // There's a possibility that some active performance queries are unflushed
   g_state_tracker->FlushEncoders();
+  g_state_tracker->NotifyOfCPUGPUSync();
 
   std::unique_lock<std::mutex> lock(m_results_mtx);
   while (!IsFlushed())
diff --git a/Source/Core/VideoBackends/Metal/MTLRenderer.mm b/Source/Core/VideoBackends/Metal/MTLRenderer.mm
index 3639b01241..7d6c583599 100644
--- a/Source/Core/VideoBackends/Metal/MTLRenderer.mm
+++ b/Source/Core/VideoBackends/Metal/MTLRenderer.mm
@@ -20,6 +20,7 @@ Metal::Renderer::Renderer(MRCOwned<CAMetalLayer*> layer, int width, int height,
       m_layer(std::move(layer))
 {
   UpdateActiveConfig();
+  [m_layer setDisplaySyncEnabled:g_ActiveConfig.bVSyncActive];
 }
 
 Metal::Renderer::~Renderer() = default;
@@ -454,8 +455,15 @@ void Metal::Renderer::PresentBackbuffer()
     g_state_tracker->EndRenderPass();
     if (m_drawable)
     {
-      [g_state_tracker->GetRenderCmdBuf()
-          addScheduledHandler:[drawable = std::move(m_drawable)](id) { [drawable present]; }];
+      // PresentDrawable refuses to allow Dolphin to present faster than the display's refresh rate
+      // when windowed (or fullscreen with vsync enabled, but that's more understandable).
+      // On the other hand, it helps Xcode's GPU captures start and stop on frame boundaries
+      // which is convenient.  Put it here as a default-off config, which we can override in Xcode.
+      if (g_ActiveConfig.bUsePresentDrawable)
+        [g_state_tracker->GetRenderCmdBuf() presentDrawable:m_drawable];
+      else
+        [g_state_tracker->GetRenderCmdBuf()
+            addScheduledHandler:[drawable = std::move(m_drawable)](id) { [drawable present]; }];
       m_bb_texture->SetMTLTexture(nullptr);
       m_drawable = nullptr;
     }
diff --git a/Source/Core/VideoBackends/Metal/MTLStateTracker.h b/Source/Core/VideoBackends/Metal/MTLStateTracker.h
index 2ec2e2ae27..3e0cb38afa 100644
--- a/Source/Core/VideoBackends/Metal/MTLStateTracker.h
+++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.h
@@ -34,7 +34,6 @@ public:
     Uniform,
     Vertex,
     Index,
-    TextureData,
     Texels,
     Last = Texels
   };
@@ -75,6 +74,14 @@ public:
     return m_current_draw != 1 + m_last_finished_draw.load(std::memory_order_acquire);
   }
   void ReloadSamplers();
+  /// Disables manual buffer upload, but only while no upload/render command buffer is open.
+  void NotifyOfCPUGPUSync()
+  {
+    const bool manual_active = g_features.manual_buffer_upload && m_manual_buffer_upload;
+    const bool work_pending = m_upload_cmdbuf || m_current_render_cmdbuf;
+    if (manual_active && !work_pending)
+      SetManualBufferUpload(false);
+  }
 
   void SetPipeline(const Pipeline* pipe);
   void SetPipeline(const ComputePipeline* pipe);
@@ -106,6 +113,7 @@ public:
   {
     return (amt + static_cast<size_t>(align)) & ~static_cast<size_t>(align);
   }
+  Map AllocateForTextureUpload(size_t amt);
   Map Allocate(UploadBuffer buffer_idx, size_t amt, AlignMask align)
   {
     Preallocate(buffer_idx, amt);
@@ -119,6 +127,7 @@ public:
                   static_cast<size_t>(align)) == 0);
     return CommitPreallocation(buffer_idx, Align(amt, align));
   }
+  id<MTLBlitCommandEncoder> GetUploadEncoder();
   id<MTLBlitCommandEncoder> GetTextureUploadEncoder();
   id<MTLCommandBuffer> GetRenderCmdBuf();
 
@@ -142,18 +151,28 @@ private:
     void Reset(size_t new_size);
   };
 
-  struct Buffer
+  struct CPUBuffer
   {
     UsageTracker usage;
     MRCOwned<id<MTLBuffer>> mtlbuffer;
     void* buffer = nullptr;
   };
 
+  struct BufferPair  // CPU-written upload buffer plus its optional GPU-side copy
+  {
+    UsageTracker usage;  // tracks the write position; the same offsets are used in both buffers
+    MRCOwned<id<MTLBuffer>> cpubuffer;  // shared-storage buffer the CPU writes into
+    MRCOwned<id<MTLBuffer>> gpubuffer;  // private-storage blit target (manual buffer upload only)
+    void* buffer = nullptr;  // CPU pointer to cpubuffer's contents
+    size_t last_upload = 0;  // cpubuffer offset up to which data has been copied to gpubuffer
+  };
+
   struct Backref;
   struct PerfQueryTracker;
 
   std::shared_ptr<Backref> m_backref;
   std::vector<std::shared_ptr<PerfQueryTracker>> m_perf_query_tracker_cache;
+  MRCOwned<id<MTLFence>> m_fence;
   MRCOwned<id<MTLCommandBuffer>> m_upload_cmdbuf;
   MRCOwned<id<MTLBlitCommandEncoder>> m_upload_encoder;
   MRCOwned<id<MTLCommandBuffer>> m_texture_upload_cmdbuf;
@@ -165,7 +184,8 @@ private:
   MRCOwned<MTLRenderPassDescriptor*> m_render_pass_desc[3];
   MRCOwned<MTLRenderPassDescriptor*> m_resolve_pass_desc;
   Framebuffer* m_current_framebuffer;
-  Buffer m_upload_buffers[static_cast<int>(UploadBuffer::Last) + 1];
+  CPUBuffer m_texture_upload_buffer;
+  BufferPair m_upload_buffers[static_cast<int>(UploadBuffer::Last) + 1];
   u64 m_current_draw = 1;
   std::atomic<u64> m_last_finished_draw{0};
 
@@ -250,9 +270,12 @@ private:
   } m_state;
 
   u32 m_perf_query_tracker_counter = 0;
+  bool m_manual_buffer_upload = false;
 
+  void SetManualBufferUpload(bool enable);
   std::shared_ptr<PerfQueryTracker> NewPerfQueryTracker();
   void SetSamplerForce(u32 idx, const SamplerState& sampler);
+  void Sync(BufferPair& buffer);
   Map CommitPreallocation(UploadBuffer buffer_idx, size_t actual_amt);
   void CheckViewport();
   void CheckScissor();
diff --git a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm
index e25e33fe02..7cfb37e751 100644
--- a/Source/Core/VideoBackends/Metal/MTLStateTracker.mm
+++ b/Source/Core/VideoBackends/Metal/MTLStateTracker.mm
@@ -45,12 +45,11 @@ static NSString* GetName(Metal::StateTracker::UploadBuffer buffer)
   // clang-format off
   switch (buffer)
   {
-    case Metal::StateTracker::UploadBuffer::TextureData: return @"Texture Data";
-    case Metal::StateTracker::UploadBuffer::Texels:      return @"Texels";
-    case Metal::StateTracker::UploadBuffer::Vertex:      return @"Vertices";
-    case Metal::StateTracker::UploadBuffer::Index:       return @"Indices";
-    case Metal::StateTracker::UploadBuffer::Uniform:     return @"Uniforms";
-    case Metal::StateTracker::UploadBuffer::Other:       return @"Generic Upload";
+    case Metal::StateTracker::UploadBuffer::Texels:  return @"Texels";
+    case Metal::StateTracker::UploadBuffer::Vertex:  return @"Vertices";
+    case Metal::StateTracker::UploadBuffer::Index:   return @"Indices";
+    case Metal::StateTracker::UploadBuffer::Uniform: return @"Uniforms";
+    case Metal::StateTracker::UploadBuffer::Other:   return @"Generic Upload";
   }
   // clang-format on
 }
@@ -105,6 +104,7 @@ void Metal::StateTracker::UsageTracker::Reset(size_t new_size)
 Metal::StateTracker::StateTracker() : m_backref(std::make_shared<Backref>(this))
 {
   m_flags.should_apply_label = true;
+  m_fence = MRCTransfer([g_device newFence]);
   for (MRCOwned<MTLRenderPassDescriptor*>& rpdesc : m_render_pass_desc)
   {
     rpdesc = MRCTransfer([MTLRenderPassDescriptor new]);
@@ -141,9 +141,10 @@ Metal::StateTracker::~StateTracker()
 
 // MARK: BufferPair Ops
 
-std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt)
+Metal::StateTracker::Map Metal::StateTracker::AllocateForTextureUpload(size_t amt)
 {
-  Buffer& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
+  amt = (amt + 15) & ~15ull;
+  CPUBuffer& buffer = m_texture_upload_buffer;
   u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire);
   bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt);
   if (__builtin_expect(needs_new, false))
@@ -155,11 +156,61 @@ std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_id
     MTLResourceOptions options =
         MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
     buffer.mtlbuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]);
-    [buffer.mtlbuffer setLabel:GetName(buffer_idx)];
+    [buffer.mtlbuffer setLabel:@"Texture Upload Buffer"];
     ASSERT_MSG(VIDEO, buffer.mtlbuffer, "Failed to allocate MTLBuffer (out of memory?)");
     buffer.buffer = [buffer.mtlbuffer contents];
     buffer.usage.Reset(newsize);
   }
+
+  size_t pos = buffer.usage.Allocate(m_current_draw, amt);
+
+  Map ret = {buffer.mtlbuffer, pos, reinterpret_cast<char*>(buffer.buffer) + pos};
+  DEBUG_ASSERT(pos <= buffer.usage.Size() &&
+               "Previous code should have guaranteed there was enough space");
+  return ret;
+}
+
+std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_idx, size_t amt)
+{  // Makes room for amt bytes in the chosen upload buffer; returns {CPU write pointer, offset}
+  BufferPair& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
+  u64 last_draw = m_last_finished_draw.load(std::memory_order_acquire);
+  size_t base_pos = buffer.usage.Pos();  // write position before this request
+  bool needs_new = buffer.usage.PrepareForAllocation(last_draw, amt);
+  bool needs_upload = needs_new || buffer.usage.Pos() == 0;  // buffer replaced or position at 0
+  if (m_manual_buffer_upload && needs_upload)
+  {
+    if (base_pos != buffer.last_upload)  // data in [last_upload, base_pos) was not copied yet
+    {
+      id<MTLBlitCommandEncoder> encoder = GetUploadEncoder();
+      [encoder copyFromBuffer:buffer.cpubuffer
+                 sourceOffset:buffer.last_upload
+                     toBuffer:buffer.gpubuffer
+            destinationOffset:buffer.last_upload
+                         size:base_pos - buffer.last_upload];
+    }
+    buffer.last_upload = 0;  // later uploads start from offset 0
+  }
+  if (__builtin_expect(needs_new, false))
+  {
+    // Orphan buffer: replace it with one twice as large (min 4096 bytes), doubled until amt fits
+    size_t newsize = std::max<size_t>(buffer.usage.Size() * 2, 4096);
+    while (newsize < amt)
+      newsize *= 2;
+    MTLResourceOptions options =
+        MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined;
+    buffer.cpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]);
+    [buffer.cpubuffer setLabel:GetName(buffer_idx)];
+    ASSERT_MSG(VIDEO, buffer.cpubuffer, "Failed to allocate MTLBuffer (out of memory?)");
+    buffer.buffer = [buffer.cpubuffer contents];
+    buffer.usage.Reset(newsize);
+    if (g_features.manual_buffer_upload)  // also create the GPU-private copy target
+    {
+      options = MTLResourceStorageModePrivate | MTLResourceHazardTrackingModeUntracked;
+      buffer.gpubuffer = MRCTransfer([g_device newBufferWithLength:newsize options:options]);
+      [buffer.gpubuffer setLabel:GetName(buffer_idx)];
+      ASSERT_MSG(VIDEO, buffer.gpubuffer, "Failed to allocate MTLBuffer (out of memory?)");
+    }
+  }
   size_t pos = buffer.usage.Pos();
   return std::make_pair(reinterpret_cast<char*>(buffer.buffer) + pos, pos);
 }
@@ -167,17 +218,46 @@ std::pair<void*, size_t> Metal::StateTracker::Preallocate(UploadBuffer buffer_id
 Metal::StateTracker::Map Metal::StateTracker::CommitPreallocation(UploadBuffer buffer_idx,
                                                                   size_t amt)
 {
-  Buffer& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
+  BufferPair& buffer = m_upload_buffers[static_cast<int>(buffer_idx)];
   size_t pos = buffer.usage.Allocate(m_current_draw, amt);
   Map ret = {nil, pos, reinterpret_cast<char*>(buffer.buffer) + pos};
-  ret.gpu_buffer = buffer.mtlbuffer;
+  ret.gpu_buffer = m_manual_buffer_upload ? buffer.gpubuffer : buffer.cpubuffer;
   DEBUG_ASSERT(pos <= buffer.usage.Size() &&
                "Previous code should have guaranteed there was enough space");
   return ret;
 }
 
+void Metal::StateTracker::Sync(BufferPair& buffer)
+{
+  // Blit the not-yet-uploaded range [last_upload, Pos) from the CPU buffer to the GPU buffer.
+  const size_t begin = buffer.last_upload, end = buffer.usage.Pos();
+  if (!m_manual_buffer_upload || begin == end)
+    return;
+  [GetUploadEncoder() copyFromBuffer:buffer.cpubuffer
+                        sourceOffset:begin
+                            toBuffer:buffer.gpubuffer
+                   destinationOffset:begin
+                                size:end - begin];
+  buffer.last_upload = end;
+}
+
 // MARK: Render Pass / Encoder Management
 
+id<MTLBlitCommandEncoder> Metal::StateTracker::GetUploadEncoder()
+{
+  // Lazily open the upload command buffer and its blit encoder on first use.
+  if (m_upload_cmdbuf)
+    return m_upload_encoder;
+  @autoreleasepool
+  {
+    m_upload_cmdbuf = MRCRetain([g_queue commandBuffer]);
+    [m_upload_cmdbuf setLabel:@"Vertex Upload"];
+    m_upload_encoder = MRCRetain([m_upload_cmdbuf blitCommandEncoder]);
+    [m_upload_encoder setLabel:@"Vertex Upload"];
+  }
+  return m_upload_encoder;
+}
+
 id<MTLBlitCommandEncoder> Metal::StateTracker::GetTextureUploadEncoder()
 {
   if (!m_texture_upload_cmdbuf)
@@ -270,6 +350,8 @@ void Metal::StateTracker::BeginRenderPass(MTLRenderPassDescriptor* descriptor)
       MRCRetain([GetRenderCmdBuf() renderCommandEncoderWithDescriptor:descriptor]);
   if (m_current_perf_query)
     [descriptor setVisibilityResultBuffer:nil];
+  if (m_manual_buffer_upload)
+    [m_current_render_encoder waitForFence:m_fence beforeStages:MTLRenderStageVertex];
   AbstractTexture* attachment = m_current_framebuffer->GetColorAttachment();
   if (!attachment)
     attachment = m_current_framebuffer->GetDepthAttachment();
@@ -299,6 +381,8 @@ void Metal::StateTracker::BeginComputePass()
   EndRenderPass();
   m_current_compute_encoder = MRCRetain([GetRenderCmdBuf() computeCommandEncoder]);
   [m_current_compute_encoder setLabel:@"Compute"];
+  if (m_manual_buffer_upload)
+    [m_current_compute_encoder waitForFence:m_fence];
   m_flags.NewEncoder();
   m_dirty_samplers = 0xff;
   m_dirty_textures = 0xff;
@@ -326,6 +410,20 @@ void Metal::StateTracker::FlushEncoders()
   if (!m_current_render_cmdbuf)
     return;
   EndRenderPass();
+  for (int i = 0; i <= static_cast<int>(UploadBuffer::Last); ++i)
+    Sync(m_upload_buffers[i]);
+  if (!m_manual_buffer_upload)
+  {
+    ASSERT(!m_upload_cmdbuf && "Should never be used!");
+  }
+  else if (m_upload_cmdbuf)
+  {
+    [m_upload_encoder updateFence:m_fence];
+    [m_upload_encoder endEncoding];
+    [m_upload_cmdbuf commit];
+    m_upload_encoder = nullptr;
+    m_upload_cmdbuf = nullptr;
+  }
   if (m_texture_upload_cmdbuf)
   {
     [m_texture_upload_encoder endEncoding];
@@ -355,6 +453,8 @@ void Metal::StateTracker::FlushEncoders()
   m_last_render_cmdbuf = std::move(m_current_render_cmdbuf);
   m_current_render_cmdbuf = nullptr;
   m_current_draw++;
+  if (g_features.manual_buffer_upload && !m_manual_buffer_upload)
+    SetManualBufferUpload(true);
 }
 
 void Metal::StateTracker::WaitForFlushedEncoders()
@@ -368,6 +468,23 @@ void Metal::StateTracker::ReloadSamplers()
     m_state.samplers[i] = g_object_cache->GetSampler(m_state.sampler_states[i]);
 }
 
+// Turns manual buffer upload on or off; when turning it on, resets each buffer's sync position.
+void Metal::StateTracker::SetManualBufferUpload(bool enabled)
+{
+  // A CPU-GPU sync (bbox read, texture download, ...) turns manual upload off for the next
+  // command buffer: with the sync in the way, the copy can no longer overlap the previous
+  // draw, so manual upload loses its advantage.
+  // This greatly improves performance in heavy bbox games like Super Paper Mario.
+  m_manual_buffer_upload = enabled;
+  if (!enabled)
+    return;
+
+  // Sync() does not advance last_upload while manual upload is off, so move each buffer's
+  // sync position to its current write position before uploads resume.
+  for (BufferPair& pair : m_upload_buffers)
+    pair.last_upload = pair.usage.Pos();
+}
+
 // MARK: State Setters
 
 void Metal::StateTracker::SetPipeline(const Pipeline* pipe)
diff --git a/Source/Core/VideoBackends/Metal/MTLTexture.mm b/Source/Core/VideoBackends/Metal/MTLTexture.mm
index fd0358e10e..52a857f5d8 100644
--- a/Source/Core/VideoBackends/Metal/MTLTexture.mm
+++ b/Source/Core/VideoBackends/Metal/MTLTexture.mm
@@ -6,6 +6,7 @@
 #include "Common/Align.h"
 #include "Common/Assert.h"
 
+#include "VideoBackends/Metal/MTLRenderer.h"
 #include "VideoBackends/Metal/MTLStateTracker.h"
 
 Metal::Texture::Texture(MRCOwned<id<MTLTexture>> tex, const TextureConfig& config)
@@ -50,6 +51,10 @@ void Metal::Texture::ResolveFromTexture(const AbstractTexture* src,
   g_state_tracker->ResolveTexture(src_tex, m_tex, layer, level);
 }
 
+// Use a temporary texture for large texture loads
+// (Since the main upload buffer doesn't shrink after it grows)
+static constexpr u32 STAGING_TEXTURE_UPLOAD_THRESHOLD = 1024 * 1024 * 4;
+
 void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length,  //
                           const u8* buffer, size_t buffer_size)
 {
@@ -59,8 +64,23 @@ void Metal::Texture::Load(u32 level, u32 width, u32 height, u32 row_length,  //
     const u32 num_rows = Common::AlignUp(height, block_size) / block_size;
     const u32 source_pitch = CalculateStrideForFormat(m_config.format, row_length);
     const u32 upload_size = source_pitch * num_rows;
-    StateTracker::Map map = g_state_tracker->Allocate(StateTracker::UploadBuffer::TextureData,
-                                                      upload_size, StateTracker::AlignMask::Other);
+    MRCOwned<id<MTLBuffer>> tmp_buffer;
+    StateTracker::Map map;
+    if (upload_size > STAGING_TEXTURE_UPLOAD_THRESHOLD)
+    {
+      tmp_buffer = MRCTransfer([g_device
+          newBufferWithLength:upload_size
+                      options:MTLResourceStorageModeShared | MTLResourceCPUCacheModeWriteCombined]);
+      [tmp_buffer setLabel:@"Temp Texture Upload"];
+      map.gpu_buffer = tmp_buffer;
+      map.gpu_offset = 0;
+      map.cpu_buffer = [tmp_buffer contents];
+    }
+    else
+    {
+      map = g_state_tracker->AllocateForTextureUpload(upload_size);
+    }
+
     memcpy(map.cpu_buffer, buffer, upload_size);
     id<MTLBlitCommandEncoder> encoder = g_state_tracker->GetTextureUploadEncoder();
     [encoder copyFromBuffer:map.gpu_buffer
@@ -163,6 +183,7 @@ void Metal::StagingTexture::Flush()
   {
     // Flush while we wait, since who knows how long we'll be sitting here
     g_state_tracker->FlushEncoders();
+    g_state_tracker->NotifyOfCPUGPUSync();
     [m_wait_buffer waitUntilCompleted];
   }
   m_wait_buffer = nullptr;
diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.h b/Source/Core/VideoBackends/Metal/MTLUtil.h
index dfedecd7c6..28fb57cb0b 100644
--- a/Source/Core/VideoBackends/Metal/MTLUtil.h
+++ b/Source/Core/VideoBackends/Metal/MTLUtil.h
@@ -16,6 +16,10 @@ namespace Metal
 {
 struct DeviceFeatures
 {
+  /// Manually copy buffer data to the GPU (instead of letting the GPU read from system memory)
+  /// On discrete GPUs, this tends to be faster if the copy is able to operate in parallel with a
+  /// previous render.  This is the case unless a game uses features like bbox or texture downloads.
+  bool manual_buffer_upload;
   bool subgroup_ops;
 };
 
diff --git a/Source/Core/VideoBackends/Metal/MTLUtil.mm b/Source/Core/VideoBackends/Metal/MTLUtil.mm
index aa379c0851..1baf532943 100644
--- a/Source/Core/VideoBackends/Metal/MTLUtil.mm
+++ b/Source/Core/VideoBackends/Metal/MTLUtil.mm
@@ -217,6 +217,27 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id<MTLDevice>
       config->backend_info.AAModes.push_back(i);
   }
 
+  switch (config->iManuallyUploadBuffers)
+  {
+  case TriState::Off:
+    g_features.manual_buffer_upload = false;
+    break;
+  case TriState::On:
+    g_features.manual_buffer_upload = true;
+    break;
+  case TriState::Auto:
+#if TARGET_OS_OSX
+    g_features.manual_buffer_upload = false;
+    if (@available(macOS 10.15, *))
+      if (![device hasUnifiedMemory])
+        g_features.manual_buffer_upload = true;
+#else
+    // All iOS devices have unified memory
+    g_features.manual_buffer_upload = false;
+#endif
+    break;
+  }
+
   g_features.subgroup_ops = false;
   if (@available(macOS 10.15, iOS 13, *))
   {
@@ -225,7 +246,7 @@ void Metal::Util::PopulateBackendInfoFeatures(VideoConfig* config, id<MTLDevice>
         [device supportsFamily:MTLGPUFamilyMac2] || [device supportsFamily:MTLGPUFamilyApple6];
     config->backend_info.bSupportsFramebufferFetch = [device supportsFamily:MTLGPUFamilyApple1];
   }
-  if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID))
+  if (DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS))
     g_features.subgroup_ops = false;
 #if TARGET_OS_OSX
   if (@available(macOS 11, *))
@@ -378,6 +399,12 @@ static const std::string_view MSL_HEADER =
     // These are usually when the compiler doesn't think a switch is exhaustive
     "#pragma clang diagnostic ignored \"-Wreturn-type\"\n";
 
+static constexpr std::pair<std::string_view, std::string_view> MSL_FIXUPS[] = {
+    // Force-unroll the lighting loop in ubershaders, which greatly reduces register pressure on AMD
+    {"for (uint chan = 0u; chan < 2u; chan++)",
+     "_Pragma(\"unroll\") for (uint chan = 0u; chan < 2u; chan++)"},
+};
+
 static constexpr spirv_cross::MSLResourceBinding
 MakeResourceBinding(spv::ExecutionModel stage, u32 set, u32 binding,  //
                     u32 msl_buffer, u32 msl_texture, u32 msl_sampler)
@@ -474,7 +501,27 @@ std::optional<std::string> Metal::Util::TranslateShaderToMSL(ShaderStage stage,
   for (auto& binding : resource_bindings)
     compiler.add_msl_resource_binding(binding);
 
-  std::string msl(MSL_HEADER);
-  msl += compiler.compile();
-  return msl;
+  std::string output(MSL_HEADER);
+  std::string compiled = compiler.compile();
+  std::string_view remaining = compiled;
+  while (!remaining.empty())
+  {
+    // Apply fixups
+    std::string_view piece = remaining;
+    std::string_view fixup_piece = {};
+    size_t next = piece.size();
+    for (const auto& fixup : MSL_FIXUPS)
+    {
+      size_t found = piece.find(fixup.first);
+      if (found == std::string_view::npos)
+        continue;
+      piece = piece.substr(0, found);
+      fixup_piece = fixup.second;
+      next = found + fixup.first.size();
+    }
+    output += piece;
+    output += fixup_piece;
+    remaining = remaining.substr(next);
+  }
+  return output;
 }
diff --git a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
index 8fce877092..3275cb9417 100644
--- a/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VulkanContext.cpp
@@ -918,7 +918,7 @@ void VulkanContext::PopulateShaderSubgroupSupport()
   m_supports_shader_subgroup_operations =
       (subgroup_properties.supportedOperations & required_operations) == required_operations &&
       subgroup_properties.supportedStages & VK_SHADER_STAGE_FRAGMENT_BIT &&
-      !DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_INVOCATION_ID);
+      !DriverDetails::HasBug(DriverDetails::BUG_BROKEN_SUBGROUP_OPS);
 }
 
 bool VulkanContext::SupportsExclusiveFullscreen(const WindowSystemInfo& wsi, VkSurfaceKHR surface)
diff --git a/Source/Core/VideoCommon/DriverDetails.cpp b/Source/Core/VideoCommon/DriverDetails.cpp
index 09fef47a4f..ffd5fa48d7 100644
--- a/Source/Core/VideoCommon/DriverDetails.cpp
+++ b/Source/Core/VideoCommon/DriverDetails.cpp
@@ -132,10 +132,14 @@ constexpr BugInfo m_known_bugs[] = {
      -1.0, -1.0, true},
     {API_VULKAN, OS_ALL, VENDOR_ARM, DRIVER_ARM, Family::UNKNOWN, BUG_BROKEN_VECTOR_BITWISE_AND,
      -1.0, -1.0, true},
-    {API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN,
-     BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true},
-    {API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN,
-     BUG_BROKEN_SUBGROUP_INVOCATION_ID, -1.0, -1.0, true},
+    {API_VULKAN, OS_OSX, VENDOR_ATI, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS,
+     -1.0, -1.0, true},
+    {API_VULKAN, OS_OSX, VENDOR_INTEL, DRIVER_PORTABILITY, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS,
+     -1.0, -1.0, true},
+    {API_METAL, OS_OSX, VENDOR_ATI, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0,
+     -1.0, true},
+    {API_METAL, OS_OSX, VENDOR_INTEL, DRIVER_APPLE, Family::UNKNOWN, BUG_BROKEN_SUBGROUP_OPS, -1.0,
+     -1.0, true},
     {API_OPENGL, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN,
      BUG_BROKEN_MULTITHREADED_SHADER_PRECOMPILATION, -1.0, -1.0, true},
     {API_VULKAN, OS_ANDROID, VENDOR_ALL, DRIVER_ALL, Family::UNKNOWN,
diff --git a/Source/Core/VideoCommon/DriverDetails.h b/Source/Core/VideoCommon/DriverDetails.h
index 02827c35d5..3e59e576c6 100644
--- a/Source/Core/VideoCommon/DriverDetails.h
+++ b/Source/Core/VideoCommon/DriverDetails.h
@@ -306,10 +306,15 @@ enum Bug
   BUG_BROKEN_VECTOR_BITWISE_AND,
 
   // BUG: Accessing gl_SubgroupInvocationID causes the Metal shader compiler to crash.
-  // Affected devices: AMD (macOS)
+  //      Affected devices: AMD (older macOS)
+  // BUG: gl_HelperInvocation always returns true, even for non-helper invocations
+  //      Affected devices: AMD (newer macOS)
+  // BUG: Using subgroupMax in a shader that can discard results in garbage data
+  //      (For some reason, this only happens at 4x+ IR on Metal, but 2x+ IR on MoltenVK)
+  //      Affected devices: Intel (macOS)
   // Started version: -1
   // Ended version: -1
-  BUG_BROKEN_SUBGROUP_INVOCATION_ID,
+  BUG_BROKEN_SUBGROUP_OPS,
 
   // BUG: Multi-threaded shader pre-compilation sometimes crashes
   // Used primarily in Videoconfig.cpp's GetNumAutoShaderPreCompilerThreads()
diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp
index 91df848c94..760e185304 100644
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@@ -55,6 +55,8 @@ void VideoConfig::Refresh()
 
   bVSync = Config::Get(Config::GFX_VSYNC);
   iAdapter = Config::Get(Config::GFX_ADAPTER);
+  iManuallyUploadBuffers = Config::Get(Config::GFX_MTL_MANUALLY_UPLOAD_BUFFERS);
+  bUsePresentDrawable = Config::Get(Config::GFX_MTL_USE_PRESENT_DRAWABLE);
 
   bWidescreenHack = Config::Get(Config::GFX_WIDESCREEN_HACK);
   aspect_mode = Config::Get(Config::GFX_ASPECT_RATIO);
diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h
index d0863a9d84..8b4cc40657 100644
--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@@ -45,6 +45,13 @@ enum class ShaderCompilationMode : int
   AsynchronousSkipRendering
 };
 
+enum class TriState : int  // three-way setting: explicitly off, explicitly on, or automatic
+{
+  Off,  // feature forced off
+  On,  // feature forced on
+  Auto  // no explicit choice; the code consuming the setting decides
+};
+
 // NEVER inherit from this class.
 struct VideoConfig final
 {
@@ -149,6 +156,10 @@ struct VideoConfig final
   // D3D only config, mostly to be merged into the above
   int iAdapter = 0;
 
+  // Metal only config
+  TriState iManuallyUploadBuffers = TriState::Auto;
+  bool bUsePresentDrawable = false;
+
   // Enable API validation layers, currently only supported with Vulkan.
   bool bEnableValidationLayer = false;