[WHLSL] Matrix memory layout should match HLSL by laying out columns linearly
https://bugs.webkit.org/show_bug.cgi?id=199215

Reviewed by Myles C. Maxfield.

Source/WebCore:

This patch makes it so that we lay out matrices in memory in the same
way HLSL does. This is by laying out columns linearly in memory. So a float4x4
composed by this series of floats in memory:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

composes this logical matrix:
[[ 0,  4,  8, 12]
 [ 1,  5,  9, 13]
 [ 2,  6, 10, 14]
 [ 3,  7, 11, 15]]

To implement this, we switch to using an array to represent the linear
memory layout of a matrix's contents. So the matrix float4x3 will now
be an array<float, 12> in Metal. Then, we change the indexed getter and
setter methods for matrices to load and store from and to the correct
memory locations. The memory layout of matrices is observable to WHLSL
when using a matrix as an input/output to a shader.

Test: webgpu/whlsl-matrix-memory-layout.html

* Modules/webgpu/WHLSL/Metal/WHLSLNativeFunctionWriter.cpp:
(WebCore::WHLSL::Metal::writeNativeFunction):
* Modules/webgpu/WHLSL/Metal/WHLSLNativeTypeWriter.cpp:
(WebCore::WHLSL::Metal::writeNativeType):

LayoutTests:

* webgpu/whlsl-matrix-memory-layout-expected.txt: Added.
* webgpu/whlsl-matrix-memory-layout.html: Added.
* webgpu/whlsl-test-harness-test.html:


git-svn-id: http://svn.webkit.org/repository/webkit/trunk@247468 268f45cc-cd09-0410-ab3c-d52691b4dbfc
diff --git a/LayoutTests/ChangeLog b/LayoutTests/ChangeLog
index 9f4e7db..365a177 100644
--- a/LayoutTests/ChangeLog
+++ b/LayoutTests/ChangeLog
@@ -1,3 +1,14 @@
+2019-07-15  Saam Barati  <sbarati@apple.com>
+
+        [WHLSL] Matrix memory layout should match HLSL by laying out columns linearly
+        https://bugs.webkit.org/show_bug.cgi?id=199215
+
+        Reviewed by Myles C. Maxfield.
+
+        * webgpu/whlsl-matrix-memory-layout-expected.txt: Added.
+        * webgpu/whlsl-matrix-memory-layout.html: Added.
+        * webgpu/whlsl-test-harness-test.html:
+
 2019-07-15  Wenson Hsieh  <wenson_hsieh@apple.com>
 
         [Text autosizing] [iPadOS] Product label text is clipped in portrait mode on the front page of sephora.com
diff --git a/LayoutTests/webgpu/whlsl-matrix-memory-layout-expected.txt b/LayoutTests/webgpu/whlsl-matrix-memory-layout-expected.txt
new file mode 100644
index 0000000..4882f90
--- /dev/null
+++ b/LayoutTests/webgpu/whlsl-matrix-memory-layout-expected.txt
@@ -0,0 +1,5 @@
+PASS 
+PASS successfullyParsed is true
+
+TEST COMPLETE
+
diff --git a/LayoutTests/webgpu/whlsl-matrix-memory-layout.html b/LayoutTests/webgpu/whlsl-matrix-memory-layout.html
new file mode 100644
index 0000000..eb08148
--- /dev/null
+++ b/LayoutTests/webgpu/whlsl-matrix-memory-layout.html
@@ -0,0 +1,139 @@
+<!DOCTYPE html>
+<html>
+<head>
+<script src="js/webgpu-functions.js"></script>
+<script src="../resources/js-test-pre.js"></script>
+</head>
+<body>
+<script>
+const shaderSource = `
+[numthreads(1, 1, 1)]
+compute void computeShader(device float4x4[] buffer : register(u0), float3 threadID : SV_DispatchThreadID) {
+    float4x4 result;
+    result[0] = float4(1, 2, 3, 4);
+    result[1] = float4(5, 6, 7, 8);
+    result[2] = float4(9, 10, 11, 12);
+    result[3] = float4(13, 14, 15, 16);
+
+    float4x4 mat = buffer[0];
+
+    if (mat[0][0] == 0
+        && mat[0][1] == 1
+        && mat[0][2] == 2
+        && mat[0][3] == 3
+
+        && mat[1][0] == 4
+        && mat[1][1] == 5
+        && mat[1][2] == 6
+        && mat[1][3] == 7
+
+        && mat[2][0] == 8
+        && mat[2][1] == 9
+        && mat[2][2] == 10
+        && mat[2][3] == 11
+
+        && mat[3][0] == 12
+        && mat[3][1] == 13
+        && mat[3][2] == 14
+        && mat[3][3] == 15) {
+        buffer[0] = result;
+    }
+}
+`;
+
+async function start(device) { // Feeds one float4x4 through computeShader and checks the matrix it writes back.
+    const shaderModule = device.createShaderModule({code: shaderSource, isWHLSL: true});
+    const computeStage = {module: shaderModule, entryPoint: "computeShader"};
+    // The layout exposes one storage buffer at binding 0, which the shader declares as register(u0).
+    const bindGroupLayoutDescriptor = {bindings: [{binding: 0, visibility: 7, type: "storage-buffer"}]};
+    const bindGroupLayout = device.createBindGroupLayout(bindGroupLayoutDescriptor);
+    const pipelineLayoutDescriptor = {bindGroupLayouts: [bindGroupLayout]};
+    const pipelineLayout = device.createPipelineLayout(pipelineLayoutDescriptor);
+    // Compute pipeline for the "computeShader" entry point.
+    const computePipelineDescriptor = {computeStage, layout: pipelineLayout};
+    const computePipeline = device.createComputePipeline(computePipelineDescriptor);
+    // One float4x4 occupies 16 floats.
+    const size = Float32Array.BYTES_PER_ELEMENT * 16;
+    // Staging buffer with the input matrix: each run of four floats is one column, so the shader sees row 0 as (0, 1, 2, 3).
+    const bufferDescriptor = {size, usage: GPUBufferUsage.MAP_WRITE | GPUBufferUsage.TRANSFER_SRC};
+    const buffer = device.createBuffer(bufferDescriptor);
+    const bufferArrayBuffer = await buffer.mapWriteAsync();
+    const bufferFloat32Array = new Float32Array(bufferArrayBuffer);
+    bufferFloat32Array[0] = 0;
+    bufferFloat32Array[1] = 4;
+    bufferFloat32Array[2] = 8;
+    bufferFloat32Array[3] = 12;
+    bufferFloat32Array[4] = 1;
+    bufferFloat32Array[5] = 5;
+    bufferFloat32Array[6] = 9;
+    bufferFloat32Array[7] = 13;
+    bufferFloat32Array[8] = 2;
+    bufferFloat32Array[9] = 6;
+    bufferFloat32Array[10] = 10;
+    bufferFloat32Array[11] = 14;
+    bufferFloat32Array[12] = 3;
+    bufferFloat32Array[13] = 7;
+    bufferFloat32Array[14] = 11;
+    bufferFloat32Array[15] = 15;
+    buffer.unmap();
+    // Storage buffer the shader reads and overwrites; it is mapped for reading once the work is submitted.
+    const resultsBufferDescriptor = {size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.TRANSFER_DST | GPUBufferUsage.MAP_READ};
+    const resultsBuffer = device.createBuffer(resultsBufferDescriptor);
+    // Bind the results buffer at binding 0.
+    const bufferBinding = {buffer: resultsBuffer, size};
+    const bindGroupBinding = {binding: 0, resource: bufferBinding};
+    const bindGroupDescriptor = {layout: bindGroupLayout, bindings: [bindGroupBinding]};
+    const bindGroup = device.createBindGroup(bindGroupDescriptor);
+    // Copy the input into the results buffer, then run the shader over it.
+    const commandEncoder = device.createCommandEncoder(); // {}
+    commandEncoder.copyBufferToBuffer(buffer, 0, resultsBuffer, 0, size);
+    const computePassEncoder = commandEncoder.beginComputePass();
+    computePassEncoder.setPipeline(computePipeline);
+    computePassEncoder.setBindGroup(0, bindGroup);
+    computePassEncoder.dispatch(1, 1, 1); // One invocation only: the shader reads and writes just buffer[0], so more would race on it.
+    computePassEncoder.endPass();
+    const commandBuffer = commandEncoder.finish();
+    device.getQueue().submit([commandBuffer]);
+    // The shader stores rows (1, 2, 3, 4), (5, 6, 7, 8), ...; in memory each run of four floats is one column of that matrix.
+    const resultsArrayBuffer = await resultsBuffer.mapReadAsync();
+    const resultsFloat32Array = new Float32Array(resultsArrayBuffer);
+    if (resultsFloat32Array[0] === 1
+        && resultsFloat32Array[1] === 5
+        && resultsFloat32Array[2] === 9
+        && resultsFloat32Array[3] === 13
+
+        && resultsFloat32Array[4] === 2
+        && resultsFloat32Array[5] === 6
+        && resultsFloat32Array[6] === 10
+        && resultsFloat32Array[7] === 14
+
+        && resultsFloat32Array[8] === 3
+        && resultsFloat32Array[9] === 7
+        && resultsFloat32Array[10] === 11
+        && resultsFloat32Array[11] === 15
+
+        && resultsFloat32Array[12] === 4
+        && resultsFloat32Array[13] === 8
+        && resultsFloat32Array[14] === 12
+        && resultsFloat32Array[15] === 16)
+        testPassed("");
+    else
+        testFailed("");
+    resultsBuffer.unmap();
+}
+window.jsTestIsAsync = true; // The test finishes only when finishJSTest() is called below.
+getBasicDevice().then((device) => {
+    start(device).then(() => {
+        finishJSTest();
+    }, () => {
+        testFailed(""); // start() rejected before it could report a result.
+        finishJSTest();
+    });
+}, () => {
+    testPassed(""); // NOTE(review): presumably no WebGPU device is available here, which is reported as a pass; confirm.
+    finishJSTest();
+});
+</script>
+<script src="../resources/js-test-post.js"></script>
+</body>
+</html>
diff --git a/LayoutTests/webgpu/whlsl-test-harness-test.html b/LayoutTests/webgpu/whlsl-test-harness-test.html
index cd1ac1f..4529146 100644
--- a/LayoutTests/webgpu/whlsl-test-harness-test.html
+++ b/LayoutTests/webgpu/whlsl-test-harness-test.html
@@ -42,8 +42,9 @@
     "float4x4": makeFloat4x4
 };
 
-const float4x4expected = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
-const float4expected = float4x4expected.slice(0, 4);
+const float4x4expected = [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15];
+const float4x4ColumnExpected = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+const float4expected = [0, 1, 2, 3];
 
 let whlslTests = {};
 
@@ -172,7 +173,7 @@
         arg[i] = i;
         multiple4x4args.push(arg);
     }
-    checkArrays("float4x4", "return in0 + in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8 + in9 + in10 + in11 + in12 + in13 + in14 + in15;", multiple4x4args, float4x4expected);
+    checkArrays("float4x4", "return in0 + in1 + in2 + in3 + in4 + in5 + in6 + in7 + in8 + in9 + in10 + in11 + in12 + in13 + in14 + in15;", multiple4x4args, float4x4ColumnExpected);
     checkArrays("float4x4", "return in0[0];", [[float4x4expected]], float4x4expected);
 };
 
diff --git a/Source/WebCore/ChangeLog b/Source/WebCore/ChangeLog
index 3851112..72ec529 100644
--- a/Source/WebCore/ChangeLog
+++ b/Source/WebCore/ChangeLog
@@ -1,3 +1,35 @@
+2019-07-15  Saam Barati  <sbarati@apple.com>
+
+        [WHLSL] Matrix memory layout should match HLSL by laying out columns linearly
+        https://bugs.webkit.org/show_bug.cgi?id=199215
+
+        Reviewed by Myles C. Maxfield.
+
+        This patch makes it so that we lay out matrices in memory in the same
+        way HLSL does. This is by laying out columns linearly in memory. So a float4x4
+        composed by this series of floats in memory:
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+        
+        composes this logical matrix:
+        [[ 0,  4,  8, 12]
+         [ 1,  5,  9, 13]
+         [ 2,  6, 10, 14]
+         [ 3,  7, 11, 15]]
+        
+        To implement this, we switch to using an array to represent the linear
+        memory layout of a matrix's contents. So the matrix float4x3 will now
+        be an array<float, 12> in Metal. Then, we change the indexed getter and
+        setter methods for matrices to load and store from and to the correct
+        memory locations. The memory layout of matrices is observable to WHLSL
+        when using a matrix as an input/output to a shader.
+
+        Test: webgpu/whlsl-matrix-memory-layout.html
+
+        * Modules/webgpu/WHLSL/Metal/WHLSLNativeFunctionWriter.cpp:
+        (WebCore::WHLSL::Metal::writeNativeFunction):
+        * Modules/webgpu/WHLSL/Metal/WHLSLNativeTypeWriter.cpp:
+        (WebCore::WHLSL::Metal::writeNativeType):
+
 2019-07-15  Wenson Hsieh  <wenson_hsieh@apple.com>
 
         [Text autosizing] [iPadOS] Product label text is clipped in portrait mode on the front page of sephora.com
diff --git a/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeFunctionWriter.cpp b/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeFunctionWriter.cpp
index f250faa..40b3ecf 100644
--- a/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeFunctionWriter.cpp
+++ b/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeFunctionWriter.cpp
@@ -250,12 +250,18 @@
         return stringBuilder.toString();
     }
 
-    auto numberOfMatrixRows = [&] {
+    auto matrixDimension = [&] (unsigned typeArgumentIndex) -> unsigned { // Integer value of type argument `typeArgumentIndex` of the matrix type of parameter 0.
         auto& typeReference = downcast<AST::TypeReference>(*nativeFunctionDeclaration.parameters()[0]->type());
         auto& matrixType = downcast<AST::NativeTypeDeclaration>(downcast<AST::TypeReference>(downcast<AST::TypeDefinition>(typeReference.resolvedType()).type()).resolvedType());
         ASSERT(matrixType.name() == "matrix");
         ASSERT(matrixType.typeArguments().size() == 3);
-        return String::number(WTF::get<AST::ConstantExpression>(matrixType.typeArguments()[1]).integerLiteral().value());
+        return WTF::get<AST::ConstantExpression>(matrixType.typeArguments()[typeArgumentIndex]).integerLiteral().value();
+    };
+    auto numberOfMatrixRows = [&] {
+        return matrixDimension(1); // Type argument 1 is read as the row count.
+    };
+    auto numberOfMatrixColumns = [&] {
+        return matrixDimension(2); // Type argument 2 is read as the column count.
     };
 
     if (nativeFunctionDeclaration.name() == "operator[]") {
@@ -263,9 +269,20 @@
         auto metalParameter1Name = typeNamer.mangledNameForType(*nativeFunctionDeclaration.parameters()[0]->type());
         auto metalParameter2Name = typeNamer.mangledNameForType(*nativeFunctionDeclaration.parameters()[1]->type());
         auto metalReturnName = typeNamer.mangledNameForType(nativeFunctionDeclaration.type());
+
+        unsigned numberOfRows = numberOfMatrixRows();
+        unsigned numberOfColumns = numberOfMatrixColumns();
+
         stringBuilder.append(makeString(metalReturnName, ' ', outputFunctionName, '(', metalParameter1Name, " m, ", metalParameter2Name, " i) {\n"));
-        stringBuilder.append(makeString("    if (i < ", numberOfMatrixRows(), ") return m[i];\n"));
-        stringBuilder.append(makeString("    return ", metalReturnName, "(0);\n"));
+        stringBuilder.append(makeString("    if (i >= ", numberOfRows, ") return ", metalReturnName, "(0);\n"));
+        stringBuilder.append(makeString("    ", metalReturnName, " result;\n"));
+        stringBuilder.append("    result[0] = m[i];\n");
+        stringBuilder.append(makeString("    result[1] = m[i + ", numberOfRows, "];\n"));
+        if (numberOfColumns >= 3)
+            stringBuilder.append(makeString("    result[2] = m[i + ", numberOfRows * 2, "];\n"));
+        if (numberOfColumns >= 4)
+            stringBuilder.append(makeString("    result[3] = m[i + ", numberOfRows * 3, "];\n"));
+        stringBuilder.append("    return result;\n");
         stringBuilder.append("}\n");
         return stringBuilder.toString();
     }
@@ -276,9 +293,19 @@
         auto metalParameter2Name = typeNamer.mangledNameForType(*nativeFunctionDeclaration.parameters()[1]->type());
         auto metalParameter3Name = typeNamer.mangledNameForType(*nativeFunctionDeclaration.parameters()[2]->type());
         auto metalReturnName = typeNamer.mangledNameForType(nativeFunctionDeclaration.type());
+
+        unsigned numberOfRows = numberOfMatrixRows();
+        unsigned numberOfColumns = numberOfMatrixColumns();
+
         stringBuilder.append(makeString(metalReturnName, ' ', outputFunctionName, '(', metalParameter1Name, " m, ", metalParameter2Name, " i, ", metalParameter3Name, " v) {\n"));
-        stringBuilder.append(makeString("    if (i < ", numberOfMatrixRows(), ") m[i] = v;\n"));
-        stringBuilder.append("    return m;\n");
+        stringBuilder.append(makeString("    if (i >= ", numberOfRows, ") return m;\n")); // Out-of-range row index: hand the matrix back unchanged.
+        stringBuilder.append("    m[i] = v[0];\n"); // Row i, column c is stored at m[i + c * numberOfRows].
+        stringBuilder.append(makeString("    m[i + ", numberOfRows, "] = v[1];\n"));
+        if (numberOfColumns >= 3)
+            stringBuilder.append(makeString("    m[i + ", numberOfRows * 2, "] = v[2];\n"));
+        if (numberOfColumns >= 4)
+            stringBuilder.append(makeString("    m[i + ", numberOfRows * 3, "] = v[3];\n"));
+        stringBuilder.append("    return m;\n"); // Trailing newline keeps the emitted '}' on its own line.
         stringBuilder.append("}\n");
         return stringBuilder.toString();
     }
diff --git a/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeTypeWriter.cpp b/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeTypeWriter.cpp
index ebfe61d..9be42c3 100644
--- a/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeTypeWriter.cpp
+++ b/Source/WebCore/Modules/webgpu/WHLSL/Metal/WHLSLNativeTypeWriter.cpp
@@ -122,35 +122,19 @@
             ASSERT(parameterType.name() == "float");
             return "float";
         })();
+
         ASSERT(WTF::holds_alternative<AST::ConstantExpression>(nativeTypeDeclaration.typeArguments()[1]));
         auto& constantExpression1 = WTF::get<AST::ConstantExpression>(nativeTypeDeclaration.typeArguments()[1]);
         auto& integerLiteral1 = constantExpression1.integerLiteral();
-        auto middle = ([&]() -> String {
-            switch (integerLiteral1.value()) {
-            case 2:
-                return "2"_str;
-            case 3:
-                return "3"_str;
-            default:
-                ASSERT(integerLiteral1.value() == 4);
-                return "4"_str;
-            }
-        })();
+        unsigned rows = integerLiteral1.value();
+        ASSERT(rows == 2 || rows == 3 || rows == 4);
+
         ASSERT(WTF::holds_alternative<AST::ConstantExpression>(nativeTypeDeclaration.typeArguments()[2]));
         auto& constantExpression2 = WTF::get<AST::ConstantExpression>(nativeTypeDeclaration.typeArguments()[2]);
         auto& integerLiteral2 = constantExpression2.integerLiteral();
-        auto suffix = ([&]() -> String {
-            switch (integerLiteral2.value()) {
-            case 2:
-                return "2"_str;
-            case 3:
-                return "3"_str;
-            default:
-                ASSERT(integerLiteral2.value() == 4);
-                return "4"_str;
-            }
-        })();
-        return makeString(prefix, middle, 'x', suffix);
+        unsigned columns = integerLiteral2.value();
+        ASSERT(columns == 2 || columns == 3 || columns == 4);
+        return makeString("array<", prefix, ", ", columns * rows, ">");
     }
     ASSERT(nativeTypeDeclaration.typeArguments().size() == 1);
     ASSERT(WTF::holds_alternative<UniqueRef<AST::TypeReference>>(nativeTypeDeclaration.typeArguments()[0]));