Fixed auto exposure on Metal

Chman · Chman · commit 42c9283f1f8e · 2019-01-25T14:04:18.000+01:00
diff --git a/PostProcessing/Runtime/Utils/LogHistogram.cs b/PostProcessing/Runtime/Utils/LogHistogram.cs
@@ -7,20 +7,15 @@ internal sealed class LogHistogram
         
         // Don't forget to update 'ExposureHistogram.hlsl' if you change these values !
         const int k_Bins = 128;
-        int m_ThreadX;
-        int m_ThreadY;
 
         public ComputeBuffer data { get; private set; }
 
         public void Generate(PostProcessRenderContext context)
         {
             if (data == null)
-            {
-                m_ThreadX = 16;
-                m_ThreadY = RuntimeUtilities.isAndroidOpenGL ? 8 : 16;
                 data = new ComputeBuffer(k_Bins, sizeof(uint));
-            }
-            
+
+            uint threadX, threadY, threadZ;
             var scaleOffsetRes = GetHistogramScaleOffsetRes(context);
             var compute = context.resources.computeShaders.exposureHistogram;
             var cmd = context.command;
@@ -29,16 +24,19 @@ public void Generate(PostProcessRenderContext context)
             // Clear the buffer on every frame as we use it to accumulate luminance values on each frame
             int kernel = compute.FindKernel("KEyeHistogramClear");
             cmd.SetComputeBufferParam(compute, kernel, "_HistogramBuffer", data);
-            cmd.DispatchCompute(compute, kernel, Mathf.CeilToInt(k_Bins / (float)m_ThreadX), 1, 1);
+            compute.GetKernelThreadGroupSizes(kernel, out threadX, out threadY, out threadZ);
+            cmd.DispatchCompute(compute, kernel, Mathf.CeilToInt(k_Bins / (float)threadX), 1, 1);
 
             // Get a log histogram
             kernel = compute.FindKernel("KEyeHistogram");
             cmd.SetComputeBufferParam(compute, kernel, "_HistogramBuffer", data);
             cmd.SetComputeTextureParam(compute, kernel, "_Source", context.source);
             cmd.SetComputeVectorParam(compute, "_ScaleOffsetRes", scaleOffsetRes);
+
+            compute.GetKernelThreadGroupSizes(kernel, out threadX, out threadY, out threadZ);
             cmd.DispatchCompute(compute, kernel,
-                Mathf.CeilToInt(scaleOffsetRes.z / 2f / m_ThreadX),
-                Mathf.CeilToInt(scaleOffsetRes.w / 2f / m_ThreadY),
+                Mathf.CeilToInt(scaleOffsetRes.z / 2f / threadX),
+                Mathf.CeilToInt(scaleOffsetRes.w / 2f / threadY),
                 1
             );
 
diff --git a/PostProcessing/Shaders/Builtins/AutoExposure.compute b/PostProcessing/Shaders/Builtins/AutoExposure.compute
@@ -17,7 +17,7 @@ CBUFFER_START(Params)
     float4 _ScaleOffsetRes; // x: scale, y: offset, w: histogram pass width, h: histogram pass height
 CBUFFER_END
 
-groupshared uint gs_pyramid[HISTOGRAM_BINS];
+groupshared uint gs_pyramid[HISTOGRAM_REDUCTION_BINS];
 
 float GetExposureMultiplier(float avgLuminance)
 {
@@ -42,17 +42,22 @@ TRIVIAL_COMPUTE_KERNEL(MAIN)
 
 #else
 
-[numthreads(HISTOGRAM_THREAD_X, HISTOGRAM_BINS / HISTOGRAM_THREAD_X, 1)]
+[numthreads(HISTOGRAM_REDUCTION_THREAD_X, HISTOGRAM_REDUCTION_THREAD_Y, 1)]
 void MAIN(uint2 groupThreadId : SV_GroupThreadID)
 {
-    const uint thread_id = groupThreadId.y * HISTOGRAM_THREAD_X + groupThreadId.x;
+#if HISTOGRAM_REDUCTION_ALT_PATH
+    const uint thread_id = groupThreadId.y * HISTOGRAM_REDUCTION_THREAD_X + groupThreadId.x;
+    gs_pyramid[thread_id] = max(_HistogramBuffer[thread_id], _HistogramBuffer[thread_id + HISTOGRAM_REDUCTION_BINS]);
+#else
+    const uint thread_id = groupThreadId.y * HISTOGRAM_REDUCTION_THREAD_X + groupThreadId.x;
     gs_pyramid[thread_id] = _HistogramBuffer[thread_id];
+#endif
 
     GroupMemoryBarrierWithGroupSync();
 
     // Parallel reduction to find the max value
     UNITY_UNROLL
-    for (uint i = HISTOGRAM_BINS >> 1u; i > 0u; i >>= 1u)
+    for (uint i = HISTOGRAM_REDUCTION_BINS >> 1u; i > 0u; i >>= 1u)
     {
         if (thread_id < i)
             gs_pyramid[thread_id] = max(gs_pyramid[thread_id], gs_pyramid[thread_id + i]);
diff --git a/PostProcessing/Shaders/Builtins/ExposureHistogram.compute b/PostProcessing/Shaders/Builtins/ExposureHistogram.compute
@@ -30,16 +30,22 @@ TRIVIAL_COMPUTE_KERNEL(KEyeHistogramClear)
 [numthreads(HISTOGRAM_THREAD_X, HISTOGRAM_THREAD_Y, 1)]
 void KEyeHistogram(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThreadId : SV_GroupThreadID)
 {
-    // Pretty straightforward implementation of histogram gathering using atomic ops.
-    // I tried a few methods (no atomic ops / heavy LDS leveraging) but this one turned out to be
-    // the fastest on desktop (Nvidia - Kepler/Maxwell) and PS4. Still need to try it on GCN/desktop
-    // but considering it runs very fast on PS4 we can expect it to run well (?).
-
     const uint localThreadId = groupThreadId.y * HISTOGRAM_THREAD_X + groupThreadId.x;
 
     // Clears the shared memory
+#if HISTOGRAM_REDUCTION_ALT_PATH
+    uint localThreadIdOff = localThreadId << 1u;
+    if (localThreadIdOff < HISTOGRAM_BINS)
+    {
+        gs_histogram[localThreadIdOff    ] = 0u;
+        gs_histogram[localThreadIdOff + 1] = 0u;
+    }
+#else
     if (localThreadId < HISTOGRAM_BINS)
+    {
         gs_histogram[localThreadId] = 0u;
+    }
+#endif
 
     float2 ipos = float2(dispatchThreadId) * 2.0;
 
@@ -71,8 +77,18 @@ void KEyeHistogram(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThre
     GroupMemoryBarrierWithGroupSync();
 
     // Merge everything
+#if HISTOGRAM_REDUCTION_ALT_PATH
+    if (localThreadIdOff < HISTOGRAM_BINS)
+    {
+        InterlockedAdd(_HistogramBuffer[localThreadIdOff    ], gs_histogram[localThreadIdOff    ]);
+        InterlockedAdd(_HistogramBuffer[localThreadIdOff + 1], gs_histogram[localThreadIdOff + 1]);
+    }
+#else
     if (localThreadId < HISTOGRAM_BINS)
+    {
         InterlockedAdd(_HistogramBuffer[localThreadId], gs_histogram[localThreadId]);
+    }
+#endif
 }
 
 #pragma kernel KEyeHistogramClear
diff --git a/PostProcessing/Shaders/Builtins/ExposureHistogram.hlsl b/PostProcessing/Shaders/Builtins/ExposureHistogram.hlsl
@@ -1,19 +1,26 @@
 #ifndef UNITY_POSTFX_EXPOSURE_HISTOGRAM
 #define UNITY_POSTFX_EXPOSURE_HISTOGRAM
 
-// Optimal values for PS4/GCN
-// Using a group size of 32x32 seems to be a bit faster on Kepler/Maxwell
-// Don't forget to update 'AutoExposureRenderer.cs' if you change these values !
+// Keep HISTOGRAM_BINS in sync with 'k_Bins' in 'LogHistogram.cs' (thread group sizes are queried at runtime).
 #define HISTOGRAM_BINS          128
 #define HISTOGRAM_TEXELS        HISTOGRAM_BINS / 4
-#if SHADER_API_GLES3
-    #define HISTOGRAM_THREAD_X      16
-    #define HISTOGRAM_THREAD_Y      8
+
+#if SHADER_API_GLES3 || SHADER_API_METAL // 8x8 groups; the alt path handles two bins per thread
+    #define HISTOGRAM_THREAD_X              8
+    #define HISTOGRAM_THREAD_Y              8
+    #define HISTOGRAM_REDUCTION_THREAD_X    8
+    #define HISTOGRAM_REDUCTION_THREAD_Y    8
+    #define HISTOGRAM_REDUCTION_ALT_PATH    1
 #else
-    #define HISTOGRAM_THREAD_X      16
-    #define HISTOGRAM_THREAD_Y      16
+    #define HISTOGRAM_THREAD_X              16
+    #define HISTOGRAM_THREAD_Y              16
+    #define HISTOGRAM_REDUCTION_THREAD_X    HISTOGRAM_THREAD_X
+    #define HISTOGRAM_REDUCTION_THREAD_Y    (HISTOGRAM_BINS / HISTOGRAM_REDUCTION_THREAD_X) // keeps X * Y == HISTOGRAM_BINS
+    #define HISTOGRAM_REDUCTION_ALT_PATH    0
 #endif
 
+#define HISTOGRAM_REDUCTION_BINS (HISTOGRAM_REDUCTION_THREAD_X * HISTOGRAM_REDUCTION_THREAD_Y) // parenthesized: used inside larger expressions
+
 float GetHistogramBinFromLuminance(float value, float2 scaleOffset)
 {
     return saturate(log2(value) * scaleOffset.x + scaleOffset.y);