//
// Copyright 2020 The ANGLE Project. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
#include "common.h"
using namespace rx::mtl_shader;
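// Helper functions used below (linearToSRGB, sRGBtoLinear, cubeTexcoords) and the constant
// kGenerateMipThreadGroupSizePerDim come from common.h via the rx::mtl_shader namespace.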
#define kThreadGroupXYZ \
(kGenerateMipThreadGroupSizePerDim * kGenerateMipThreadGroupSizePerDim * \
kGenerateMipThreadGroupSizePerDim)
#define kThreadGroupXY (kGenerateMipThreadGroupSizePerDim * kGenerateMipThreadGroupSizePerDim)
#define kThreadGroupX kGenerateMipThreadGroupSizePerDim
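// The constants above are the strides of the flattened threadgroup arrays declared in each
// kernel: lIndex + kThreadGroupX is the (x, y+1) neighbor and lIndex + kThreadGroupXY is the
// (x, y, z+1) neighbor of the texel stored at lIndex.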
#define TEXEL_STORE(index, texel) \
sR[index] = texel.r; \
sG[index] = texel.g; \
sB[index] = texel.b; \
sA[index] = texel.a;
#define TEXEL_LOAD(index) float4(sR[index], sG[index], sB[index], sA[index])
#define TO_LINEAR(texel) (options.sRGB ? sRGBtoLinear(texel) : texel)
#define OUT_OF_BOUND_CHECK(edgeValue, targetValue, condition) \
(condition) ? (edgeValue) : (targetValue)
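// OUT_OF_BOUND_CHECK emulates clamp-to-edge: when `condition` indicates that the neighbor lies
// outside the current mip, the already-loaded edge texel (edgeValue) is used instead of the
// shared-memory load in targetValue.
// Parameters shared by all mipmap generation kernels:
// - srcLevel: mip level of srcTexture to read from.
// - numMipLevelsToGen: number of mip levels (1-4) to generate below srcLevel in this dispatch.
// - sRGB: whether sRGB <-> linear conversions are applied around the filtering.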
struct GenMipParams
{
uint srcLevel;
uint numMipLevelsToGen;
bool sRGB;
};
// NOTE(hqle): For numMipLevelsToGen > 1, this function assumes the texture's dimensions are
// powers of two. If they are not, the quality of the generated mips will suffer.
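// Each thread handles one texel of the first destination mip (srcLevel + 1): it samples the
// source level once, writes mip 1, then the threadgroup cooperatively box-filters the results
// through shared memory to produce up to three further mip levels in a single dispatch.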
kernel void generate3DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
ushort3 gIndices [[thread_position_in_grid]],
texture3d<float> srcTexture [[texture(0)]],
texture3d<float, access::write> dstMip1 [[texture(1)]],
texture3d<float, access::write> dstMip2 [[texture(2)]],
texture3d<float, access::write> dstMip3 [[texture(3)]],
texture3d<float, access::write> dstMip4 [[texture(4)]],
constant GenMipParams &options [[buffer(0)]])
{
ushort3 mipSize = ushort3(dstMip1.get_width(), dstMip1.get_height(), dstMip1.get_depth());
bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y && gIndices.z < mipSize.z;
constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
// NOTE(hqle): Use simd-group functions where available; that would avoid the threadgroup barriers.
// Use a struct-of-arrays layout to avoid shared-memory bank conflicts.
threadgroup float sR[kThreadGroupXYZ];
threadgroup float sG[kThreadGroupXYZ];
threadgroup float sB[kThreadGroupXYZ];
threadgroup float sA[kThreadGroupXYZ];
// ----- First mip level -------
float4 texel1;
if (validThread)
{
float3 texCoords = (float3(gIndices) + float3(0.5, 0.5, 0.5)) / float3(mipSize);
texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
// Write to texture
dstMip1.write(texel1, gIndices);
}
else
{
// This will invalidate all subsequent checks
lIndex = 0xffffffff;
}
if (options.numMipLevelsToGen == 1)
{
return;
}
// ---- Second mip level --------
// Write to shared memory
if (options.sRGB)
{
texel1 = linearToSRGB(texel1);
}
TEXEL_STORE(lIndex, texel1);
threadgroup_barrier(mem_flags::mem_threadgroup);
// Only threads whose local x, y and z are all even continue: each one averages its 2x2x2
// neighborhood from shared memory into a single texel of the second mip.
if ((lIndex & 0x49) == 0) // (lIndex & b1001001) == 0
{
bool3 atEdge = gIndices == (mipSize - ushort3(1));
// (x+1, y, z)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x);
// (x, y+1, z)
float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y);
// (x, y, z+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupXY), atEdge.z);
// (x+1, y+1, z)
float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)),
atEdge.x | atEdge.y);
// (x+1, y, z+1)
float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupXY + 1)),
atEdge.x | atEdge.z);
// (x, y+1, z+1)
float4 texel7 = OUT_OF_BOUND_CHECK(
texel3, TEXEL_LOAD(lIndex + (kThreadGroupXY + kThreadGroupX)), atEdge.y | atEdge.z);
// (x+1, y+1, z+1)
float4 texel8 =
OUT_OF_BOUND_CHECK(texel5, TEXEL_LOAD(lIndex + (kThreadGroupXY + kThreadGroupX + 1)),
atEdge.x | atEdge.y | atEdge.z);
texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
dstMip2.write(TO_LINEAR(texel1), gIndices >> 1);
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
}
if (options.numMipLevelsToGen == 2)
{
return;
}
// ---- 3rd mip level --------
threadgroup_barrier(mem_flags::mem_threadgroup);
// Only threads whose local x, y and z are all multiples of 4 continue; the mip-2 texels written
// back above are 2 slots apart in shared memory.
if ((lIndex & 0xdb) == 0) // (lIndex & b11011011) == 0
{
mipSize = max(mipSize >> 1, ushort3(1));
bool3 atEdge = (gIndices >> 1) == (mipSize - ushort3(1));
// (x+1, y, z)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x);
// (x, y+1, z)
float4 texel3 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (2 * kThreadGroupX)), atEdge.y);
// (x, y, z+1)
float4 texel4 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY)), atEdge.z);
// (x+1, y+1, z)
float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)),
atEdge.x | atEdge.y);
// (x+1, y, z+1)
float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2)),
atEdge.x | atEdge.z);
// (x, y+1, z+1)
float4 texel7 = OUT_OF_BOUND_CHECK(
texel3, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2 * kThreadGroupX)),
atEdge.y | atEdge.z);
// (x+1, y+1, z+1)
float4 texel8 = OUT_OF_BOUND_CHECK(
texel5, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2 * kThreadGroupX + 2)),
atEdge.x | atEdge.y | atEdge.z);
texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
dstMip3.write(TO_LINEAR(texel1), gIndices >> 2);
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
}
if (options.numMipLevelsToGen == 3)
{
return;
}
// ---- 4th mip level --------
threadgroup_barrier(mem_flags::mem_threadgroup);
// Only threads whose local x, y and z are all multiples of 8 continue; the mip-3 texels written
// back above are 4 slots apart in shared memory.
if ((lIndex & 0x1ff) == 0) // (lIndex & b111111111) == 0
{
mipSize = max(mipSize >> 1, ushort3(1));
bool3 atEdge = (gIndices >> 2) == (mipSize - ushort3(1));
// (x+1, y, z)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x);
// (x, y+1, z)
float4 texel3 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (4 * kThreadGroupX)), atEdge.y);
// (x, y, z+1)
float4 texel4 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY)), atEdge.z);
// (x+1, y+1, z)
float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)),
atEdge.x | atEdge.y);
// (x+1, y, z+1)
float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4)),
atEdge.x | atEdge.z);
// (x, y+1, z+1)
float4 texel7 = OUT_OF_BOUND_CHECK(
texel3, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4 * kThreadGroupX)),
atEdge.y | atEdge.z);
// (x+1, y+1, z+1)
float4 texel8 = OUT_OF_BOUND_CHECK(
texel5, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4 * kThreadGroupX + 4)),
atEdge.x | atEdge.y | atEdge.z);
texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
dstMip4.write(TO_LINEAR(texel1), gIndices >> 3);
}
}
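// 2D variant: same strategy as generate3DMipmaps, but each reduction step averages a 2x2 block
// and the first destination mip's size is taken from srcTexture at level srcLevel + 1.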
kernel void generate2DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
ushort2 gIndices [[thread_position_in_grid]],
texture2d<float> srcTexture [[texture(0)]],
texture2d<float, access::write> dstMip1 [[texture(1)]],
texture2d<float, access::write> dstMip2 [[texture(2)]],
texture2d<float, access::write> dstMip3 [[texture(3)]],
texture2d<float, access::write> dstMip4 [[texture(4)]],
constant GenMipParams &options [[buffer(0)]])
{
uint firstMipLevel = options.srcLevel + 1;
ushort2 mipSize =
ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y;
constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
// NOTE(hqle): Use simd-group functions where available; that would avoid the threadgroup barriers.
// Use a struct-of-arrays layout to avoid shared-memory bank conflicts.
threadgroup float sR[kThreadGroupXY];
threadgroup float sG[kThreadGroupXY];
threadgroup float sB[kThreadGroupXY];
threadgroup float sA[kThreadGroupXY];
// ----- First mip level -------
float4 texel1;
if (validThread)
{
float2 texCoords = (float2(gIndices) + float2(0.5, 0.5)) / float2(mipSize);
texel1 = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
// Write to texture
dstMip1.write(TO_LINEAR(texel1), gIndices);
}
else
{
// This will invalidate all subsequent checks
lIndex = 0xffffffff;
}
if (options.numMipLevelsToGen == 1)
{
return;
}
// ---- Second mip level --------
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
threadgroup_barrier(mem_flags::mem_threadgroup);
// Only threads whose local x and y are both even continue: each one averages its 2x2
// neighborhood from shared memory into a single texel of the second mip.
if ((lIndex & 0x09) == 0) // (lIndex & b001001) == 0
{
bool2 atEdge = gIndices == (mipSize - ushort2(1));
// (x+1, y)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x);
// (x, y+1)
float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y);
// (x+1, y+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)),
atEdge.x | atEdge.y);
texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
dstMip2.write(TO_LINEAR(texel1), gIndices >> 1);
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
}
if (options.numMipLevelsToGen == 2)
{
return;
}
// ---- 3rd mip level --------
threadgroup_barrier(mem_flags::mem_threadgroup);
// Only threads whose local x and y are both multiples of 4 continue; the mip-2 texels written
// back above are 2 slots apart in shared memory.
if ((lIndex & 0x1b) == 0) // (lIndex & b011011) == 0
{
mipSize = max(mipSize >> 1, ushort2(1));
bool2 atEdge = (gIndices >> 1) == (mipSize - ushort2(1));
// (x+1, y)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x);
// (x, y+1)
float4 texel3 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2 * kThreadGroupX), atEdge.y);
// (x+1, y+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)),
atEdge.x | atEdge.y);
texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
dstMip3.write(TO_LINEAR(texel1), gIndices >> 2);
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
}
if (options.numMipLevelsToGen == 3)
{
return;
}
// ---- 4th mip level --------
threadgroup_barrier(mem_flags::mem_threadgroup);
// Only threads whose local x and y are both multiples of 8 continue; the mip-3 texels written
// back above are 4 slots apart in shared memory.
if ((lIndex & 0x3f) == 0) // (lIndex & b111111) == 0
{
mipSize = max(mipSize >> 1, ushort2(1));
bool2 atEdge = (gIndices >> 2) == (mipSize - ushort2(1));
// (x+1, y)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x);
// (x, y+1)
float4 texel3 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4 * kThreadGroupX), atEdge.y);
// (x+1, y+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)),
atEdge.x | atEdge.y);
texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
dstMip4.write(TO_LINEAR(texel1), gIndices >> 3);
}
}
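// Shared tail for the cube and 2D array kernels below: given the mip 1 texel already computed
// and written by the caller, it cooperatively generates mips 2-4 through the caller-provided
// threadgroup arrays. It is templated on the texture types (and force-inlined) so both kernels
// can reuse it.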
template <typename TextureTypeR, typename TextureTypeW>
static __attribute__((always_inline)) void generateCubeOr2DArray2ndAndMoreMipmaps(
uint lIndex,
ushort3 gIndices,
TextureTypeR srcTexture,
TextureTypeW dstMip2,
TextureTypeW dstMip3,
TextureTypeW dstMip4,
ushort2 mip1Size,
float4 mip1Texel,
threadgroup float *sR,
threadgroup float *sG,
threadgroup float *sB,
threadgroup float *sA,
constant GenMipParams &options)
{
ushort2 mipSize = mip1Size;
float4 texel1 = mip1Texel;
// ---- Second mip level --------
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
threadgroup_barrier(mem_flags::mem_threadgroup);
// Local x and y must both be even (see the matching step in generate2DMipmaps).
if ((lIndex & 0x09) == 0) // (lIndex & b001001) == 0
{
bool2 atEdge = gIndices.xy == (mipSize - ushort2(1));
// (x+1, y)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x);
// (x, y+1)
float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y);
// (x+1, y+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)),
atEdge.x | atEdge.y);
texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
dstMip2.write(TO_LINEAR(texel1), gIndices.xy >> 1, gIndices.z);
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
}
if (options.numMipLevelsToGen == 2)
{
return;
}
// ---- 3rd mip level --------
threadgroup_barrier(mem_flags::mem_threadgroup);
// Local x and y must both be multiples of 4.
if ((lIndex & 0x1b) == 0) // (lIndex & b011011) == 0
{
mipSize = max(mipSize >> 1, ushort2(1));
bool2 atEdge = (gIndices.xy >> 1) == (mipSize - ushort2(1));
// (x+1, y)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x);
// (x, y+1)
float4 texel3 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2 * kThreadGroupX), atEdge.y);
// (x+1, y+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)),
atEdge.x | atEdge.y);
texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
dstMip3.write(TO_LINEAR(texel1), gIndices.xy >> 2, gIndices.z);
// Write to shared memory
TEXEL_STORE(lIndex, texel1);
}
if (options.numMipLevelsToGen == 3)
{
return;
}
// ---- 4th mip level --------
threadgroup_barrier(mem_flags::mem_threadgroup);
// Local x and y must both be multiples of 8.
if ((lIndex & 0x3f) == 0) // (lIndex & b111111) == 0
{
mipSize = max(mipSize >> 1, ushort2(1));
bool2 atEdge = (gIndices.xy >> 2) == (mipSize - ushort2(1));
// (x+1, y)
// If x is at the edge (e.g. when the mip width is 1), texel2 falls back to texel1:
float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x);
// (x, y+1)
float4 texel3 =
OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4 * kThreadGroupX), atEdge.y);
// (x+1, y+1)
float4 texel4 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)),
atEdge.x | atEdge.y);
texel1 = (texel1 + texel2 + texel3 + texel4) / 4.0;
dstMip4.write(TO_LINEAR(texel1), gIndices.xy >> 3, gIndices.z);
}
}
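// Cube map variant: gIndices.z selects the face, and cubeTexcoords() maps the 2D coordinates
// plus the face index to a sampling direction for the source read.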
kernel void generateCubeMipmaps(uint lIndex [[thread_index_in_threadgroup]],
ushort3 gIndices [[thread_position_in_grid]],
texturecube<float> srcTexture [[texture(0)]],
texturecube<float, access::write> dstMip1 [[texture(1)]],
texturecube<float, access::write> dstMip2 [[texture(2)]],
texturecube<float, access::write> dstMip3 [[texture(3)]],
texturecube<float, access::write> dstMip4 [[texture(4)]],
constant GenMipParams &options [[buffer(0)]])
{
uint firstMipLevel = options.srcLevel + 1;
ushort2 mip1Size =
ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y;
constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
// ----- First mip level -------
float4 mip1Texel;
if (validThread)
{
float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size);
mip1Texel = srcTexture.sample(textureSampler, cubeTexcoords(texCoords, int(gIndices.z)),
level(options.srcLevel));
// Write to texture
dstMip1.write(TO_LINEAR(mip1Texel), gIndices.xy, gIndices.z);
}
else
{
// This will invalidate all subsequent checks
lIndex = 0xffffffff;
}
if (options.numMipLevelsToGen == 1)
{
return;
}
// Use a struct-of-arrays layout to avoid shared-memory bank conflicts.
threadgroup float sR[kThreadGroupXY];
threadgroup float sG[kThreadGroupXY];
threadgroup float sB[kThreadGroupXY];
threadgroup float sA[kThreadGroupXY];
generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4,
mip1Size, mip1Texel, sR, sG, sB, sA, options);
}
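// 2D array variant: gIndices.z selects the array slice; otherwise it mirrors the cube path,
// except the source is sampled directly with 2D coordinates and the slice index.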
kernel void generate2DArrayMipmaps(uint lIndex [[thread_index_in_threadgroup]],
ushort3 gIndices [[thread_position_in_grid]],
texture2d_array<float> srcTexture [[texture(0)]],
texture2d_array<float, access::write> dstMip1 [[texture(1)]],
texture2d_array<float, access::write> dstMip2 [[texture(2)]],
texture2d_array<float, access::write> dstMip3 [[texture(3)]],
texture2d_array<float, access::write> dstMip4 [[texture(4)]],
constant GenMipParams &options [[buffer(0)]])
{
uint firstMipLevel = options.srcLevel + 1;
ushort2 mip1Size =
ushort2(srcTexture.get_width(firstMipLevel), srcTexture.get_height(firstMipLevel));
bool validThread = gIndices.x < mip1Size.x && gIndices.y < mip1Size.y;
constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
// ----- First mip level -------
float4 mip1Texel;
if (validThread)
{
float2 texCoords = (float2(gIndices.xy) + float2(0.5, 0.5)) / float2(mip1Size);
mip1Texel =
srcTexture.sample(textureSampler, texCoords, gIndices.z, level(options.srcLevel));
// Write to texture
dstMip1.write(TO_LINEAR(mip1Texel), gIndices.xy, gIndices.z);
}
else
{
// This will invalidate all subsequent checks
lIndex = 0xffffffff;
}
if (options.numMipLevelsToGen == 1)
{
return;
}
// Use a struct-of-arrays layout to avoid shared-memory bank conflicts.
threadgroup float sR[kThreadGroupXY];
threadgroup float sG[kThreadGroupXY];
threadgroup float sB[kThreadGroupXY];
threadgroup float sA[kThreadGroupXY];
generateCubeOr2DArray2ndAndMoreMipmaps(lIndex, gIndices, srcTexture, dstMip2, dstMip3, dstMip4,
mip1Size, mip1Texel, sR, sG, sB, sA, options);
}