Skip to content
This repository was archived by the owner on Nov 30, 2020. It is now read-only.

Commit 42c9283

Browse files
committed
Fixed auto exposure on metal
1 parent 86a0243 commit 42c9283

4 files changed

Lines changed: 53 additions & 27 deletions

File tree

PostProcessing/Runtime/Utils/LogHistogram.cs

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,20 +7,15 @@ internal sealed class LogHistogram
77

88
// Don't forget to update HISTOGRAM_BINS in 'ExposureHistogram.hlsl' if you change this value!
99
const int k_Bins = 128;
10-
int m_ThreadX;
11-
int m_ThreadY;
1210

1311
public ComputeBuffer data { get; private set; }
1412

1513
public void Generate(PostProcessRenderContext context)
1614
{
1715
if (data == null)
18-
{
19-
m_ThreadX = 16;
20-
m_ThreadY = RuntimeUtilities.isAndroidOpenGL ? 8 : 16;
2116
data = new ComputeBuffer(k_Bins, sizeof(uint));
22-
}
23-
17+
18+
uint threadX, threadY, threadZ;
2419
var scaleOffsetRes = GetHistogramScaleOffsetRes(context);
2520
var compute = context.resources.computeShaders.exposureHistogram;
2621
var cmd = context.command;
@@ -29,16 +24,19 @@ public void Generate(PostProcessRenderContext context)
2924
// Clear the buffer on every frame as we use it to accumulate luminance values on each frame
3025
int kernel = compute.FindKernel("KEyeHistogramClear");
3126
cmd.SetComputeBufferParam(compute, kernel, "_HistogramBuffer", data);
32-
cmd.DispatchCompute(compute, kernel, Mathf.CeilToInt(k_Bins / (float)m_ThreadX), 1, 1);
27+
compute.GetKernelThreadGroupSizes(kernel, out threadX, out threadY, out threadZ);
28+
cmd.DispatchCompute(compute, kernel, Mathf.CeilToInt(k_Bins / (float)threadX), 1, 1);
3329

3430
// Get a log histogram
3531
kernel = compute.FindKernel("KEyeHistogram");
3632
cmd.SetComputeBufferParam(compute, kernel, "_HistogramBuffer", data);
3733
cmd.SetComputeTextureParam(compute, kernel, "_Source", context.source);
3834
cmd.SetComputeVectorParam(compute, "_ScaleOffsetRes", scaleOffsetRes);
35+
36+
compute.GetKernelThreadGroupSizes(kernel, out threadX, out threadY, out threadZ);
3937
cmd.DispatchCompute(compute, kernel,
40-
Mathf.CeilToInt(scaleOffsetRes.z / 2f / m_ThreadX),
41-
Mathf.CeilToInt(scaleOffsetRes.w / 2f / m_ThreadY),
38+
Mathf.CeilToInt(scaleOffsetRes.z / 2f / threadX),
39+
Mathf.CeilToInt(scaleOffsetRes.w / 2f / threadY),
4240
1
4341
);
4442

PostProcessing/Shaders/Builtins/AutoExposure.compute

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ CBUFFER_START(Params)
1717
float4 _ScaleOffsetRes; // x: scale, y: offset, z: histogram pass width, w: histogram pass height
1818
CBUFFER_END
1919

20-
groupshared uint gs_pyramid[HISTOGRAM_BINS];
20+
groupshared uint gs_pyramid[HISTOGRAM_REDUCTION_BINS];
2121

2222
float GetExposureMultiplier(float avgLuminance)
2323
{
@@ -42,17 +42,22 @@ TRIVIAL_COMPUTE_KERNEL(MAIN)
4242

4343
#else
4444

45-
[numthreads(HISTOGRAM_THREAD_X, HISTOGRAM_BINS / HISTOGRAM_THREAD_X, 1)]
45+
[numthreads(HISTOGRAM_REDUCTION_THREAD_X, HISTOGRAM_REDUCTION_THREAD_Y, 1)]
4646
void MAIN(uint2 groupThreadId : SV_GroupThreadID)
4747
{
48-
const uint thread_id = groupThreadId.y * HISTOGRAM_THREAD_X + groupThreadId.x;
48+
#if HISTOGRAM_REDUCTION_ALT_PATH
49+
const uint thread_id = groupThreadId.y * HISTOGRAM_REDUCTION_THREAD_X + groupThreadId.x;
50+
gs_pyramid[thread_id] = max(_HistogramBuffer[thread_id], _HistogramBuffer[thread_id + HISTOGRAM_REDUCTION_BINS]);
51+
#else
52+
const uint thread_id = groupThreadId.y * HISTOGRAM_REDUCTION_THREAD_X + groupThreadId.x;
4953
gs_pyramid[thread_id] = _HistogramBuffer[thread_id];
54+
#endif
5055

5156
GroupMemoryBarrierWithGroupSync();
5257

5358
// Parallel reduction to find the max value
5459
UNITY_UNROLL
55-
for (uint i = HISTOGRAM_BINS >> 1u; i > 0u; i >>= 1u)
60+
for (uint i = HISTOGRAM_REDUCTION_BINS >> 1u; i > 0u; i >>= 1u)
5661
{
5762
if (thread_id < i)
5863
gs_pyramid[thread_id] = max(gs_pyramid[thread_id], gs_pyramid[thread_id + i]);

PostProcessing/Shaders/Builtins/ExposureHistogram.compute

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,22 @@ TRIVIAL_COMPUTE_KERNEL(KEyeHistogramClear)
3030
[numthreads(HISTOGRAM_THREAD_X, HISTOGRAM_THREAD_Y, 1)]
3131
void KEyeHistogram(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThreadId : SV_GroupThreadID)
3232
{
33-
// Pretty straightforward implementation of histogram gathering using atomic ops.
34-
// I tried a few methods (no atomic ops / heavy LDS leveraging) but this one turned out to be
35-
// the fastest on desktop (Nvidia - Kepler/Maxwell) and PS4. Still need to try it on GCN/desktop
36-
// but considering it runs very fast on PS4 we can expect it to run well (?).
37-
3833
const uint localThreadId = groupThreadId.y * HISTOGRAM_THREAD_X + groupThreadId.x;
3934

4035
// Clears the shared memory
36+
#if HISTOGRAM_REDUCTION_ALT_PATH
37+
uint localThreadIdOff = localThreadId << 1u;
38+
if (localThreadIdOff < HISTOGRAM_BINS)
39+
{
40+
gs_histogram[localThreadIdOff ] = 0u;
41+
gs_histogram[localThreadIdOff + 1] = 0u;
42+
}
43+
#else
4144
if (localThreadId < HISTOGRAM_BINS)
45+
{
4246
gs_histogram[localThreadId] = 0u;
47+
}
48+
#endif
4349

4450
float2 ipos = float2(dispatchThreadId) * 2.0;
4551

@@ -71,8 +77,18 @@ void KEyeHistogram(uint2 dispatchThreadId : SV_DispatchThreadID, uint2 groupThre
7177
GroupMemoryBarrierWithGroupSync();
7278

7379
// Merge everything
80+
#if HISTOGRAM_REDUCTION_ALT_PATH
81+
if (localThreadIdOff < HISTOGRAM_BINS)
82+
{
83+
InterlockedAdd(_HistogramBuffer[localThreadIdOff ], gs_histogram[localThreadIdOff ]);
84+
InterlockedAdd(_HistogramBuffer[localThreadIdOff + 1], gs_histogram[localThreadIdOff + 1]);
85+
}
86+
#else
7487
if (localThreadId < HISTOGRAM_BINS)
88+
{
7589
InterlockedAdd(_HistogramBuffer[localThreadId], gs_histogram[localThreadId]);
90+
}
91+
#endif
7692
}
7793

7894
#pragma kernel KEyeHistogramClear

PostProcessing/Shaders/Builtins/ExposureHistogram.hlsl

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
11
#ifndef UNITY_POSTFX_EXPOSURE_HISTOGRAM
22
#define UNITY_POSTFX_EXPOSURE_HISTOGRAM
33

4-
// Optimal values for PS4/GCN
5-
// Using a group size of 32x32 seems to be a bit faster on Kepler/Maxwell
6-
// Don't forget to update 'AutoExposureRenderer.cs' if you change these values !
4+
// Don't forget to update k_Bins in 'LogHistogram.cs' if you change HISTOGRAM_BINS!
75
#define HISTOGRAM_BINS 128
86
#define HISTOGRAM_TEXELS HISTOGRAM_BINS / 4
9-
#if SHADER_API_GLES3
10-
#define HISTOGRAM_THREAD_X 16
11-
#define HISTOGRAM_THREAD_Y 8
7+
8+
#if SHADER_API_GLES3 || SHADER_API_METAL
9+
#define HISTOGRAM_THREAD_X 8
10+
#define HISTOGRAM_THREAD_Y 8
11+
#define HISTOGRAM_REDUCTION_THREAD_X 8
12+
#define HISTOGRAM_REDUCTION_THREAD_Y 8
13+
#define HISTOGRAM_REDUCTION_ALT_PATH 1
1214
#else
13-
#define HISTOGRAM_THREAD_X 16
14-
#define HISTOGRAM_THREAD_Y 16
15+
#define HISTOGRAM_THREAD_X 16
16+
#define HISTOGRAM_THREAD_Y 16
17+
#define HISTOGRAM_REDUCTION_THREAD_X HISTOGRAM_THREAD_X
18+
#define HISTOGRAM_REDUCTION_THREAD_Y HISTOGRAM_BINS / HISTOGRAM_THREAD_Y
19+
#define HISTOGRAM_REDUCTION_ALT_PATH 0
1520
#endif
1621

22+
#define HISTOGRAM_REDUCTION_BINS HISTOGRAM_REDUCTION_THREAD_X * HISTOGRAM_REDUCTION_THREAD_Y
23+
1724
float GetHistogramBinFromLuminance(float value, float2 scaleOffset)
1825
{
1926
return saturate(log2(value) * scaleOffset.x + scaleOffset.y);

0 commit comments

Comments
 (0)