Source/WebCore/platform/graphics/cpu/arm/filters/FELightingNEON.cpp - WebKit - Git at Google

 /*
  * Copyright (C) 2011 University of Szeged
  * Copyright (C) 2011 Zoltan Herczeg
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #include "config.h"
 #include "FELightingNEON.h"

 #if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC_COMPATIBLE)

 namespace WebCore {

 // Constant table for neonDrawLighting. The assembly reads its constants with
 // two vld1.s16 loads into the following SIMD registers, in this order:
 //   ALPHAX_Q ALPHAY_Q REMAPX_D REMAPY_D
 // so the order and the number of the entries below must not change.
 // NOTE(review): presumably this table is what the painting-data field at
 // PAINTING_CONSTANTS_OFFSET points to - confirm against the caller.


 static alignas(16) short s_FELightingConstantsForNeon[] = {
     // Alpha coefficients: sixteen 16-bit multipliers, the first eight are
     // loaded into ALPHAX_Q and the last eight into ALPHAY_Q.
     -2, 1, 0, -1, 2, 1, 0, -1,
     0, -1, -2, -1, 0, 1, 2, 1,
     // Remapping indices: every 16-bit entry packs the two byte indices that
     // vtbl.8 uses to fetch one 16-bit alpha value from ALL_ROWS_D. The first
     // four entries are loaded into REMAPX_D, the last four into REMAPY_D.
     0x0f0e, 0x0302, 0x0504, 0x0706,
     0x0b0a, 0x1312, 0x1514, 0x1716,
 };

 // Returns the address of the first entry of the NEON constant table
 // (the alpha coefficients, followed by the remapping indices).
 short* feLightingConstantsForNeon()
 {
     return &s_FELightingConstantsForNeon[0];
 }

 // Worker entry point: forwards the painting data unchanged to
 // neonDrawLighting, the routine defined by the asm statement in this file.
 void FELighting::platformApplyNeonWorker(FELightingPaintingDataForNeon* parameters)
 {
     neonDrawLighting(parameters);
 }

 // ASSTRING stringifies its argument as written. TOSTRING expands macros in
 // the argument first and stringifies the result, which allows numeric
 // constants and flag macros to be spliced into the assembly text.
 #define ASSTRING(str) #str
 #define TOSTRING(value) ASSTRING(value)

 // Byte offsets of the fields that neonDrawLighting loads from the structure
 // whose address is passed in r0.
 // NOTE(review): presumably these mirror the member layout of
 // FELightingPaintingDataForNeon with 4-byte members - confirm against the header.
 #define PIXELS_OFFSET TOSTRING(0)
 #define YSTART_OFFSET TOSTRING(4)
 #define WIDTH_OFFSET TOSTRING(8)
 #define HEIGHT_OFFSET TOSTRING(12)
 #define FLAGS_OFFSET TOSTRING(16)
 #define SPECULAR_EXPONENT_OFFSET TOSTRING(20)
 #define CONE_EXPONENT_OFFSET TOSTRING(24)
 #define FLOAT_ARGUMENTS_OFFSET TOSTRING(28)
 #define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
 // Terminates one assembly statement inside the concatenated string.
 #define NL "\n"

 // Register allocation
 // Core registers with a fixed role. r0-r3 are used as scratch registers
 // by the assembly code.
 #define PAINTING_DATA_R       "r11"
 // Shares r11 with PAINTING_DATA_R: after all fields have been loaded, the
 // register is reused to keep the width for restarting each scanline.
 #define RESET_WIDTH_R         PAINTING_DATA_R
 #define PIXELS_R              "r4"
 #define WIDTH_R               "r5"
 #define HEIGHT_R              "r6"
 #define FLAGS_R               "r7"
 #define SPECULAR_EXPONENT_R   "r8"
 #define CONE_EXPONENT_R       "r10"
 #define SCANLINE_R            "r12"

 // Three temporary quad registers. Each one has aliases for its two double
 // halves (_D0, _D1) and its four singles (_S0.._S3), so the helper macros can
 // build the register names by token pasting (TMP1 -> TMP1_Q, TMP1_S3, ...).
 #define TMP1_Q                "q0"
 #define TMP1_D0               "d0"
 #define TMP1_S0               "s0"
 #define TMP1_S1               "s1"
 #define TMP1_D1               "d1"
 #define TMP1_S2               "s2"
 #define TMP1_S3               "s3"
 #define TMP2_Q                "q1"
 #define TMP2_D0               "d2"
 #define TMP2_S0               "s4"
 #define TMP2_S1               "s5"
 #define TMP2_D1               "d3"
 #define TMP2_S2               "s6"
 #define TMP2_S3               "s7"
 #define TMP3_Q                "q2"
 #define TMP3_D0               "d4"
 #define TMP3_S0               "s8"
 #define TMP3_S1               "s9"
 #define TMP3_D1               "d5"
 #define TMP3_S2               "s10"
 #define TMP3_S3               "s11"

 // Scalars of the spot light and powf computations, and the copy of COLOR_Q
 // from which COLOR_Q is recomputed for every spot light pixel.
 #define COSINE_OF_ANGLE       "s12"
 #define POWF_INT_S            "s13"
 #define POWF_FRAC_S           "s14"
 #define SPOT_COLOR_Q          "q4"

 // VMIN and VMAX are applied to whole double registers, and the value to be
 // clamped is kept in the upper single of its double register (TMPn_S1).
 // Therefore CONST_ZERO_S and CONST_ONE_S must be the upper singles of their
 // double registers (CONST_ZERO_HI_D and CONST_ONE_HI_D) as well.

 // Current pixel position
 #define POSITION_Q            "q5"
 #define POSITION_X_S          "s20"
 #define POSITION_Y_S          "s21"
 #define POSITION_Z_S          "s22"
 #define CONST_ZERO_HI_D       "d11"
 #define CONST_ZERO_S          "s23"

 // -------------------------------
 //     Variable arguments
 // Register ranges filled by the three vld1.f32 loads of the float arguments.
 // READ1 covers s24-s31, READ2 covers LIGHT_Q and DIRECTION_Q, READ3 covers COLOR_Q.
 #define READ1_RANGE           "d12-d15"
 #define READ2_RANGE           "d16-d19"
 #define READ3_RANGE           "d20-d21"

 // Scalar arguments (loaded as part of READ1_RANGE).
 #define SCALE_S               "s24"
 #define SCALE_DIV4_S          "s25"
 #define DIFFUSE_CONST_S       "s26"

 // Spot light cone arguments and the constant 1.0 (also part of READ1_RANGE).
 #define CONE_CUT_OFF_S        "s28"
 #define CONE_FULL_LIGHT_S     "s29"
 #define CONE_CUT_OFF_RANGE_S  "s30"
 #define CONST_ONE_HI_D        "d15"
 #define CONST_ONE_S           "s31"

 // LIGHT_Q is subtracted from by the pixel position for point and spot lights,
 // and is used directly as the light vector for distant lights. DIRECTION_Q is
 // only used for the spot light cosine, COLOR_Q is the color to be scaled.
 #define LIGHT_Q               "q8"
 #define DIRECTION_Q           "q9"
 #define COLOR_Q               "q10"
 // -------------------------------
 //    Constant coefficients
 // Register ranges filled by the two vld1.s16 loads of the constant table.
 #define READ4_RANGE           "d22-d25"
 #define READ5_RANGE           "d26-d27"

 #define ALPHAX_Q              "q11"
 #define ALPHAY_Q              "q12"
 #define REMAPX_D              "d26"
 #define REMAPY_D              "d27"
 // -------------------------------

 // The alpha values of the last three pixels of three neighbouring scanlines,
 // stored as 16-bit lanes. ALL_ROWS_D is the register list given to vtbl.8.
 #define ALL_ROWS_D            "{d28,d29,d30}"
 #define TOP_ROW_D             "d28"
 #define MIDDLE_ROW_D          "d29"
 #define BOTTOM_ROW_D          "d30"

 // Stores the length of the vector formed by the first three components of
 // source##_Q into its fourth component (source##_S3):
 //   source_S3 = sqrt(source_S0^2 + source_S1^2 + source_S2^2)
 // temp##_Q is overwritten with the component-wise squares.
 #define GET_LENGTH(source, temp) \
     "vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
     "vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
     "vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
     "vsqrt.f32 " source##_S3 ", " source##_S3 NL

 // Stores the dot product of the first three components of source1##_Q and
 // source2##_Q into destination##_S0.
 // destination##_S3 receives source1_S3 * source2_S3, which is the product of
 // the two lengths when both sources were prepared by GET_LENGTH.
 #define DOT_PRODUCT(destination, source1, source2) \
     "vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
     "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
     "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL

 // Writes the light strength into TMP2_S1:
 //   FLAG_DIFFUSE_CONST_IS_1 clear: DIFFUSE_CONST_S * first / second
 //   FLAG_DIFFUSE_CONST_IS_1 set:   first / second
 // Despite the parameter names, the callers in this file pass the dividend
 // (a dot product or the z component of the light vector) as the first
 // argument and the divisor (a length or a product of lengths) as the second.
 // The condition flags are overwritten.
 #define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
     "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
     "vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
     "vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
     "vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL

 // One step of the integer part of POWF: multiplies value by POWF_INT_S when
 // the 'current' exponent bit is set, then squares POWF_INT_S when any of the
 // 'remaining' (higher) bits is set, preparing the next step.
 #define POWF_SQR(value, exponent, current, remaining) \
     "tst " exponent ", #" ASSTRING(current) NL \
     "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
     "tst " exponent ", #" ASSTRING(remaining) NL \
     "vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL

 // One step of the fraction part of POWF: replaces POWF_FRAC_S by its square
 // root when any of the 'remaining' bits (which include 'current') is set,
 // then multiplies value by POWF_FRAC_S when the 'current' bit is set.
 #define POWF_SQRT(value, exponent, current, remaining) \
     "tst " exponent ", #" ASSTRING(remaining) NL \
     "vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
     "tst " exponent ", #" ASSTRING(current) NL \
     "vmulne.f32 " value ", " value ", " POWF_FRAC_S NL

 // This simplified powf function is sufficiently accurate.
 // Raises the single register 'value' to the power encoded in the core
 // register 'exponent' and leaves the result in 'value'. The exponent is a
 // 12-bit fixed-point number: bits 0xfc0 are the integer part (bit 0x040 has
 // weight 1, bit 0x800 has weight 32) and bits 0x03f are the fraction part
 // (bit 0x20 has weight 1/2, bit 0x01 has weight 1/64).
 // POWF_INT_S, POWF_FRAC_S and the condition flags are overwritten.
 #define POWF(value, exponent) \
     "tst " exponent ", #0xfc0" NL \
     "vmovne.f32 " POWF_INT_S ", " value NL \
     "tst " exponent ", #0x03f" NL \
     "vmovne.f32 " POWF_FRAC_S ", " value NL \
     "vmov.f32 " value ", " CONST_ONE_S NL \
     \
     POWF_SQR(value, exponent, 0x040, 0xf80) \
     POWF_SQR(value, exponent, 0x080, 0xf00) \
     POWF_SQR(value, exponent, 0x100, 0xe00) \
     POWF_SQR(value, exponent, 0x200, 0xc00) \
     POWF_SQR(value, exponent, 0x400, 0x800) \
     "tst " exponent ", #0x800" NL \
     "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
     \
     POWF_SQRT(value, exponent, 0x20, 0x3f) \
     POWF_SQRT(value, exponent, 0x10, 0x1f) \
     POWF_SQRT(value, exponent, 0x08, 0x0f) \
     POWF_SQRT(value, exponent, 0x04, 0x07) \
     POWF_SQRT(value, exponent, 0x02, 0x03) \
     POWF_SQRT(value, exponent, 0x01, 0x01)

 // The following algorithm is an ARM-NEON optimized version of
 // the main loop found in FELighting.cpp. Since the whole code
 // was redesigned to be as efficient as possible (ARM specific
 // thinking), it is four times faster than its C++ counterpart.
 //
 // neonDrawLighting is defined entirely by this top-level asm statement.
 // It receives the address of the painting data in r0, walks the pixel
 // buffer scanline by scanline and overwrites the three color bytes of
 // every visited pixel with the computed light color.

 asm ( // NOLINT
 ".globl " TOSTRING(neonDrawLighting) NL
 TOSTRING(neonDrawLighting) ":" NL
     // Because of the clever register allocation, nothing is stored on the stack
     // except the saved registers.
     // Stack must be aligned to 8 bytes: eight core registers (32 bytes) and
     // eight double registers (64 bytes) are pushed.
     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
     "vstmdb sp!, {d8-d15}" NL
     "mov " PAINTING_DATA_R ", r0" NL

     // The following two arguments are pointers whose contents are loaded to
     // SIMD registers below (r0: float arguments, r1: constant table).
     "ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
     "ldr r1, [" PAINTING_DATA_R ", #" PAINTING_CONSTANTS_OFFSET "]" NL
     // The remaining fields are kept in core registers, except the starting
     // y coordinate, which goes directly into the position vector.
     "ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
     "vldr.f32 " POSITION_Y_S ", [" PAINTING_DATA_R ", #" YSTART_OFFSET "]"  NL
     "ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
     "ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
     "ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
     "ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
     "ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL

     // Load all data to the SIMD registers with the least number of instructions.
     // 32 + 32 + 16 bytes of float arguments, then 32 + 16 bytes of constants.
     "vld1.f32 { " READ1_RANGE " }, [r0]!" NL
     "vld1.f32 { " READ2_RANGE " }, [r0]!" NL
     "vld1.f32 { " READ3_RANGE " }, [r0]!" NL
     "vld1.s16 {" READ4_RANGE "}, [r1]!" NL
     "vld1.s16 {" READ5_RANGE "}, [r1]!" NL

     // Initializing local variables.
     // SCANLINE_R = width * 4 + 8, the byte distance between two scanlines
     // (four bytes per pixel, plus the two border pixels mentioned below).
     // PIXELS_R is advanced by one scanline and three bytes, so it addresses
     // the fourth byte (the alpha channel read by the loop) of the first
     // pixel of the second scanline.
     "mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
     "add " SCANLINE_R ", " SCANLINE_R ", #8" NL
     "add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
     "add " PIXELS_R ", " PIXELS_R ", #3" NL
     "mov r0, #0" NL
     "vmov.f32 " CONST_ZERO_S ", r0" NL
     // NOTE(review): the flags set by this tst are never read, the following
     // vmov is unconditional and the next flag consumer follows a subs.
     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
     "vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
     // From here on r11 holds the width instead of the painting data pointer.
     "mov " RESET_WIDTH_R ", " WIDTH_R NL

 ".mainLoop:" NL
     // Start of a scanline: three alpha columns must be loaded before the
     // first pixel can be processed, and the x position restarts at 1.
     "mov r3, #3" NL
     "vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL

 ".scanline:" NL
     // The ROW registers are storing the alpha channel of the last three pixels.
     // The alpha channel is stored as signed short (sint16) values. The fourth value
     // is garbage. The following instructions are shifting out the unnecessary alpha
     // values and load the next ones.
     // After the rotation and insertion, lane 1 holds the newest alpha value,
     // lane 2 the previous one and lane 3 the oldest one.
     "ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
     "ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
     "ldrb r2, [" PIXELS_R "], #4" NL
     "vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
     "vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
     "vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
     "vmov.s16 " TOP_ROW_D "[1], r0" NL
     "vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
     "vmov.s16 " BOTTOM_ROW_D "[1], r1" NL

     // The two border pixels (rightmost and leftmost) are skipped when
     // the next scanline is reached. It also jumps, when the algorithm
     // is started, and the first free alpha values are loaded to each row.
     // r3 counts the alpha columns that still have to be loaded.
     "subs r3, r3, #1" NL
     "bne .scanline" NL

     // The light vector goes to TMP1_Q. It is constant in case of distant light.
     // The fourth value contains the length of the light vector.
     "tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT | FLAG_SPOT_LIGHT) NL
     "beq .distantLight" NL

     // Point and spot lights: the z position is the alpha value of the
     // current pixel (lane 2 of the middle row) multiplied by SCALE_S.
     "vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
     "vmov.f32 " POSITION_Z_S ", r3" NL
     "vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
     "vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL

     // Light vector = LIGHT_Q - pixel position, its length goes to TMP1_S3.
     "vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
     GET_LENGTH(TMP1, TMP2)

     // Spot lights first check whether the pixel is inside the cone.
     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
     "bne .cosineOfAngle" NL
 ".visiblePixel:" NL

     //     | -1  0  1 |      | -1 -2 -1 |
     // X = | -2  0  2 |  Y = |  0  0  0 |
     //     | -1  0  1 |      |  1  2  1 |

     // Multiply the alpha values by the X and Y matrices.

     // Moving the 8 alpha values to TMP3.
     "vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
     "vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL

     // Eight products, then three pairwise additions leave their sum in lane 0.
     "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
     "vmov.s16 r0, " TMP2_D0 "[0]" NL

     "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
     "vmov.s16 r1, " TMP2_D0 "[0]" NL

     // r0 and r1 contain the X and Y coordinates of the
     // normal vector, respectively.

     // Calculating the spot light strength.
     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
     "beq .endLight" NL

     // The strength starts as the negated cosine and is raised to the cone
     // exponent unless FLAG_CONE_EXPONENT_IS_1 is set.
     "vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
     "tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
     "beq .coneExpPowf" NL
 ".coneExpPowfFinished:" NL

     // Smoothing the cone edge if necessary.
     "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
     "fmstat" NL
     "bhi .cutOff" NL
 ".cutOffFinished:" NL

     // Limit the strength (upper single of TMP3_D0) to 1 and scale the
     // original light color by it.
     "vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
     "vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL

 ".endLight:" NL
     // Summarize:
     // r0 and r1 contain the normalVector.
     // TMP1_Q contains the light vector and its length.
     // COLOR_Q contains the color of the light vector.

     // Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
     "orrs r2, r0, r1" NL
     "bne .normalVectorIsNonZero" NL

     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
     "bne .specularLight1" NL

     // Calculate diffuse light strength: z component of the light vector
     // divided by its length.
     MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
     "b .lightStrengthCalculated" NL

 ".specularLight1:" NL
     // Calculating specular light strength.
     // The length of the light vector is added to its z component and the
     // length of the resulting vector is recomputed.
     "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
     GET_LENGTH(TMP1, TMP2)

     // When the exponent is 1, we don't need to call an expensive powf function.
     // Otherwise the base of the power is computed here and .specularExpPowf
     // applies the exponent and the constant.
     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
     "vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
     "beq .specularExpPowf" NL

     MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
     "b .lightStrengthCalculated" NL

 ".normalVectorIsNonZero:" NL
     // Normal vector goes to TMP2, and its length is calculated as well.
     // TMP2 = (r0 * SCALE_DIV4_S, r1 * SCALE_DIV4_S, 1).
     "vmov.s32 " TMP2_S0 ", r0" NL
     "vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
     "vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
     "vmov.s32 " TMP2_S1 ", r1" NL
     "vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
     "vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
     "vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
     GET_LENGTH(TMP2, TMP3)

     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
     "bne .specularLight2" NL

     // Calculating diffuse light strength: dot product of the normal and
     // light vectors (TMP3_S0) divided by the product of their lengths (TMP3_S3).
     DOT_PRODUCT(TMP3, TMP2, TMP1)
     MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
     "b .lightStrengthCalculated" NL

 ".specularLight2:" NL
     // Calculating specular light strength.
     // Same adjustment of the light vector as in .specularLight1, followed by
     // the dot product with the normal vector.
     "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
     GET_LENGTH(TMP1, TMP3)
     DOT_PRODUCT(TMP3, TMP2, TMP1)

     // When the exponent is 1, we don't need to call an expensive powf function.
     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
     "vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
     "beq .specularExpPowf" NL
     MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)

 ".lightStrengthCalculated:" NL
     // TMP2_S1 contains the light strength. Clamp it to [0, 1]
     "vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
     "vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
     // Scale the light color by the strength and convert it to integers.
     "vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
     "vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
     "vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
     // The color values are stored in-place.
     // PIXELS_R already addresses the alpha byte two pixels ahead of the
     // current pixel, hence the offsets -11, -10 and -9 for its color bytes.
     "strb r2, [" PIXELS_R ", #-11]" NL
     "strb r3, [" PIXELS_R ", #-10]" NL
     "vmov.u32 r2, " TMP3_S2 NL
     "strb r2, [" PIXELS_R ", #-9]" NL

     // Continue to the next pixel.
     // Only one new alpha column (r3 = 1) is needed for the next pixel.
 ".blackPixel:" NL
     "vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
     "mov r3, #1" NL
     "subs " WIDTH_R ", " WIDTH_R ", #1" NL
     "bne .scanline" NL

     // If the end of the scanline is reached, we continue
     // to the next scanline.
     "vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
     "mov " WIDTH_R ", " RESET_WIDTH_R NL
     "subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
     "bne .mainLoop" NL

     // Return. The registers are restored in the reverse order of the prologue.
     "vldmia sp!, {d8-d15}" NL
     "ldmia sp!, {r4-r8, r10, r11, pc}" NL

 ".distantLight:" NL
     // In case of distant light, the light vector is constant,
     // we simply copy it.
     "vmov.f32 " TMP1_Q ", " LIGHT_Q NL
     "b .visiblePixel" NL

 ".cosineOfAngle:" NL
     // If the pixel is outside of the cone angle, it is simply a black pixel.
     // COSINE_OF_ANGLE = dot(light vector, DIRECTION_Q) / length of the light
     // vector. The pixel is processed when the value is lower than or equal
     // to CONE_CUT_OFF_S, otherwise its three color bytes are set to zero.
     DOT_PRODUCT(TMP3, TMP1, DIRECTION)
     "vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
     "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
     "fmstat" NL
     "bls .visiblePixel" NL
     "mov r0, #0" NL
     "strh r0, [" PIXELS_R ", #-11]" NL
     "strb r0, [" PIXELS_R ", #-9]" NL
     "b .blackPixel" NL

 ".cutOff:" NL
     // Smoothing the light strength on the cone edge.
     // strength *= (CONE_CUT_OFF_S - COSINE_OF_ANGLE) / CONE_CUT_OFF_RANGE_S
     "vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
     "vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
     "vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
     "b .cutOffFinished" NL

 ".coneExpPowf:" NL
     // Raise the spot light strength to the cone exponent.
     POWF(TMP3_S1, CONE_EXPONENT_R)
     "b .coneExpPowfFinished" NL

 ".specularExpPowf:" NL
     // Raise the specular strength to the specular exponent, then multiply it
     // by DIFFUSE_CONST_S unless FLAG_DIFFUSE_CONST_IS_1 is set.
     POWF(TMP2_S1, SPECULAR_EXPONENT_R)
     "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
     "vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
     "b .lightStrengthCalculated" NL
 ); // NOLINT

 int FELighting::getPowerCoefficients(float exponent)
 {
     // The assembly code cannot call powf cheaply, because a lot of NEON
     // registers would have to be saved and reloaded around the call. The
     // base is in range [0..1] and only 8 bit precision is required, so the
     // assembly uses its own approximation, driven by the bit pattern
     // computed here.
     //
     // The exponent is limited to slightly below 64 and is converted to a
     // 12-bit fixed-point number: bit 11 stands for 32, bit 6 for 1, bit 5
     // for 1/2 and bit 0 for 1/64. Since a^x = (a^y)*(a^z) where x = y+z, the
     // integer bits are evaluated by repeated squaring and the fraction bits
     // by repeated square roots on the assembly side.
     const double largestExponent = 63.99;

     if (exponent < 0)
         exponent = 1 / (-exponent);
     if (exponent > largestExponent)
         exponent = largestExponent;

     // Normalize to [0, 1) and peel off one binary digit per iteration,
     // starting with the most significant bit.
     float scaled = exponent / 64;
     int coefficients = 0;
     for (int bit = 1 << 11; bit; bit >>= 1) {
         scaled *= 2;
         if (scaled >= 1) {
             coefficients |= bit;
             scaled -= 1;
         }
     }
     return coefficients;
 }

 } // namespace WebCore

 #endif // CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC_COMPATIBLE)
	/*
	* Copyright (C) 2011 University of Szeged
	* Copyright (C) 2011 Zoltan Herczeg
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in the
	* documentation and/or other materials provided with the distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	#include "config.h"
	#include "FELightingNEON.h"

	#if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC_COMPATIBLE)

	namespace WebCore {

	// Constant table for neonDrawLighting. The assembly reads its constants with
	// two vld1.s16 loads into the following SIMD registers, in this order:
	// ALPHAX_Q ALPHAY_Q REMAPX_D REMAPY_D
	// so the order and the number of the entries below must not change.
	// NOTE(review): presumably this table is what the painting-data field at
	// PAINTING_CONSTANTS_OFFSET points to - confirm against the caller.


	static alignas(16) short s_FELightingConstantsForNeon[] = {
	// Alpha coefficients: sixteen 16-bit multipliers, the first eight are
	// loaded into ALPHAX_Q and the last eight into ALPHAY_Q.
	-2, 1, 0, -1, 2, 1, 0, -1,
	0, -1, -2, -1, 0, 1, 2, 1,
	// Remapping indices: every 16-bit entry packs the two byte indices that
	// vtbl.8 uses to fetch one 16-bit alpha value from ALL_ROWS_D. The first
	// four entries are loaded into REMAPX_D, the last four into REMAPY_D.
	0x0f0e, 0x0302, 0x0504, 0x0706,
	0x0b0a, 0x1312, 0x1514, 0x1716,
	};

	// Returns the address of the first entry of the NEON constant table
	// (the alpha coefficients, followed by the remapping indices).
	short* feLightingConstantsForNeon()
	{
		return &s_FELightingConstantsForNeon[0];
	}

	// Worker entry point: forwards the painting data unchanged to
	// neonDrawLighting, the routine defined by the asm statement in this file.
	void FELighting::platformApplyNeonWorker(FELightingPaintingDataForNeon* parameters)
	{
	neonDrawLighting(parameters);
	}

	// ASSTRING stringifies its argument as written. TOSTRING expands macros in
	// the argument first and stringifies the result, which allows numeric
	// constants and flag macros to be spliced into the assembly text.
	#define ASSTRING(str) #str
	#define TOSTRING(value) ASSTRING(value)

	// Byte offsets of the fields that neonDrawLighting loads from the structure
	// whose address is passed in r0.
	// NOTE(review): presumably these mirror the member layout of
	// FELightingPaintingDataForNeon with 4-byte members - confirm against the header.
	#define PIXELS_OFFSET TOSTRING(0)
	#define YSTART_OFFSET TOSTRING(4)
	#define WIDTH_OFFSET TOSTRING(8)
	#define HEIGHT_OFFSET TOSTRING(12)
	#define FLAGS_OFFSET TOSTRING(16)
	#define SPECULAR_EXPONENT_OFFSET TOSTRING(20)
	#define CONE_EXPONENT_OFFSET TOSTRING(24)
	#define FLOAT_ARGUMENTS_OFFSET TOSTRING(28)
	#define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
	// Terminates one assembly statement inside the concatenated string.
	#define NL "\n"

	// Register allocation
	// Core registers with a fixed role. r0-r3 are used as scratch registers
	// by the assembly code.
	#define PAINTING_DATA_R "r11"
	// Shares r11 with PAINTING_DATA_R: after all fields have been loaded, the
	// register is reused to keep the width for restarting each scanline.
	#define RESET_WIDTH_R PAINTING_DATA_R
	#define PIXELS_R "r4"
	#define WIDTH_R "r5"
	#define HEIGHT_R "r6"
	#define FLAGS_R "r7"
	#define SPECULAR_EXPONENT_R "r8"
	#define CONE_EXPONENT_R "r10"
	#define SCANLINE_R "r12"

	// Three temporary quad registers. Each one has aliases for its two double
	// halves (_D0, _D1) and its four singles (_S0.._S3), so the helper macros can
	// build the register names by token pasting (TMP1 -> TMP1_Q, TMP1_S3, ...).
	#define TMP1_Q "q0"
	#define TMP1_D0 "d0"
	#define TMP1_S0 "s0"
	#define TMP1_S1 "s1"
	#define TMP1_D1 "d1"
	#define TMP1_S2 "s2"
	#define TMP1_S3 "s3"
	#define TMP2_Q "q1"
	#define TMP2_D0 "d2"
	#define TMP2_S0 "s4"
	#define TMP2_S1 "s5"
	#define TMP2_D1 "d3"
	#define TMP2_S2 "s6"
	#define TMP2_S3 "s7"
	#define TMP3_Q "q2"
	#define TMP3_D0 "d4"
	#define TMP3_S0 "s8"
	#define TMP3_S1 "s9"
	#define TMP3_D1 "d5"
	#define TMP3_S2 "s10"
	#define TMP3_S3 "s11"

	// Scalars of the spot light and powf computations, and the copy of COLOR_Q
	// from which COLOR_Q is recomputed for every spot light pixel.
	#define COSINE_OF_ANGLE "s12"
	#define POWF_INT_S "s13"
	#define POWF_FRAC_S "s14"
	#define SPOT_COLOR_Q "q4"

	// VMIN and VMAX are applied to whole double registers, and the value to be
	// clamped is kept in the upper single of its double register (TMPn_S1).
	// Therefore CONST_ZERO_S and CONST_ONE_S must be the upper singles of their
	// double registers (CONST_ZERO_HI_D and CONST_ONE_HI_D) as well.

	// Current pixel position
	#define POSITION_Q "q5"
	#define POSITION_X_S "s20"
	#define POSITION_Y_S "s21"
	#define POSITION_Z_S "s22"
	#define CONST_ZERO_HI_D "d11"
	#define CONST_ZERO_S "s23"

	// -------------------------------
	// Variable arguments
	// Register ranges filled by the three vld1.f32 loads of the float arguments.
	// READ1 covers s24-s31, READ2 covers LIGHT_Q and DIRECTION_Q, READ3 covers COLOR_Q.
	#define READ1_RANGE "d12-d15"
	#define READ2_RANGE "d16-d19"
	#define READ3_RANGE "d20-d21"

	// Scalar arguments (loaded as part of READ1_RANGE).
	#define SCALE_S "s24"
	#define SCALE_DIV4_S "s25"
	#define DIFFUSE_CONST_S "s26"

	// Spot light cone arguments and the constant 1.0 (also part of READ1_RANGE).
	#define CONE_CUT_OFF_S "s28"
	#define CONE_FULL_LIGHT_S "s29"
	#define CONE_CUT_OFF_RANGE_S "s30"
	#define CONST_ONE_HI_D "d15"
	#define CONST_ONE_S "s31"

	// LIGHT_Q is subtracted from by the pixel position for point and spot lights,
	// and is used directly as the light vector for distant lights. DIRECTION_Q is
	// only used for the spot light cosine, COLOR_Q is the color to be scaled.
	#define LIGHT_Q "q8"
	#define DIRECTION_Q "q9"
	#define COLOR_Q "q10"
	// -------------------------------
	// Constant coefficients
	// Register ranges filled by the two vld1.s16 loads of the constant table.
	#define READ4_RANGE "d22-d25"
	#define READ5_RANGE "d26-d27"

	#define ALPHAX_Q "q11"
	#define ALPHAY_Q "q12"
	#define REMAPX_D "d26"
	#define REMAPY_D "d27"
	// -------------------------------

	// The alpha values of the last three pixels of three neighbouring scanlines,
	// stored as 16-bit lanes. ALL_ROWS_D is the register list given to vtbl.8.
	#define ALL_ROWS_D "{d28,d29,d30}"
	#define TOP_ROW_D "d28"
	#define MIDDLE_ROW_D "d29"
	#define BOTTOM_ROW_D "d30"

	// Stores the length of the vector formed by the first three components of
	// source##_Q into its fourth component (source##_S3):
	// source_S3 = sqrt(source_S0^2 + source_S1^2 + source_S2^2)
	// temp##_Q is overwritten with the component-wise squares.
	#define GET_LENGTH(source, temp) \
	"vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
	"vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
	"vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
	"vsqrt.f32 " source##_S3 ", " source##_S3 NL

	// Stores the dot product of the first three components of source1##_Q and
	// source2##_Q into destination##_S0.
	// destination##_S3 receives source1_S3 * source2_S3, which is the product of
	// the two lengths when both sources were prepared by GET_LENGTH.
	#define DOT_PRODUCT(destination, source1, source2) \
	"vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
	"vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
	"vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL

	// Writes the light strength into TMP2_S1:
	// FLAG_DIFFUSE_CONST_IS_1 clear: DIFFUSE_CONST_S * first / second
	// FLAG_DIFFUSE_CONST_IS_1 set: first / second
	// Despite the parameter names, the first argument is used as the dividend
	// and the second argument as the divisor.
	// The condition flags are overwritten.
	#define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
	"tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
	"vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
	"vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
	"vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL

	// One step of the integer part of POWF: multiplies value by POWF_INT_S when
	// the 'current' exponent bit is set, then squares POWF_INT_S when any of the
	// 'remaining' (higher) bits is set, preparing the next step.
	#define POWF_SQR(value, exponent, current, remaining) \
	"tst " exponent ", #" ASSTRING(current) NL \
	"vmulne.f32 " value ", " value ", " POWF_INT_S NL \
	"tst " exponent ", #" ASSTRING(remaining) NL \
	"vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL

	// One step of the fraction part of POWF: replaces POWF_FRAC_S by its square
	// root when any of the 'remaining' bits (which include 'current') is set,
	// then multiplies value by POWF_FRAC_S when the 'current' bit is set.
	#define POWF_SQRT(value, exponent, current, remaining) \
	"tst " exponent ", #" ASSTRING(remaining) NL \
	"vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
	"tst " exponent ", #" ASSTRING(current) NL \
	"vmulne.f32 " value ", " value ", " POWF_FRAC_S NL

	// This simplified powf function is sufficiently accurate.
	// Raises the single register 'value' to the power encoded in the core
	// register 'exponent' and leaves the result in 'value'. The exponent is a
	// 12-bit fixed-point number: bits 0xfc0 are the integer part (bit 0x040 has
	// weight 1, bit 0x800 has weight 32) and bits 0x03f are the fraction part
	// (bit 0x20 has weight 1/2, bit 0x01 has weight 1/64).
	// POWF_INT_S, POWF_FRAC_S and the condition flags are overwritten.
	#define POWF(value, exponent) \
	"tst " exponent ", #0xfc0" NL \
	"vmovne.f32 " POWF_INT_S ", " value NL \
	"tst " exponent ", #0x03f" NL \
	"vmovne.f32 " POWF_FRAC_S ", " value NL \
	"vmov.f32 " value ", " CONST_ONE_S NL \
	\
	POWF_SQR(value, exponent, 0x040, 0xf80) \
	POWF_SQR(value, exponent, 0x080, 0xf00) \
	POWF_SQR(value, exponent, 0x100, 0xe00) \
	POWF_SQR(value, exponent, 0x200, 0xc00) \
	POWF_SQR(value, exponent, 0x400, 0x800) \
	"tst " exponent ", #0x800" NL \
	"vmulne.f32 " value ", " value ", " POWF_INT_S NL \
	\
	POWF_SQRT(value, exponent, 0x20, 0x3f) \
	POWF_SQRT(value, exponent, 0x10, 0x1f) \
	POWF_SQRT(value, exponent, 0x08, 0x0f) \
	POWF_SQRT(value, exponent, 0x04, 0x07) \
	POWF_SQRT(value, exponent, 0x02, 0x03) \
	POWF_SQRT(value, exponent, 0x01, 0x01)

	// The following algorithm is an ARM-NEON optimized version of
	// the main loop found in FELighting.cpp. Since the whole code
	// is redesigned to be as effective as possible (ARM specific
	// thinking), it is four times faster than its C++ counterpart.

	asm ( // NOLINT
	".globl " TOSTRING(neonDrawLighting) NL
	TOSTRING(neonDrawLighting) ":" NL
	// Because of the clever register allocation, nothing is stored on the stack
	// except the saved registers.
	// Stack must be aligned to 8 bytes.
	"stmdb sp!, {r4-r8, r10, r11, lr}" NL
	"vstmdb sp!, {d8-d15}" NL
	"mov " PAINTING_DATA_R ", r0" NL

	// The following two arguments are loaded to SIMD registers.
	"ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
	"ldr r1, [" PAINTING_DATA_R ", #" PAINTING_CONSTANTS_OFFSET "]" NL
	"ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
	"vldr.f32 " POSITION_Y_S ", [" PAINTING_DATA_R ", #" YSTART_OFFSET "]" NL
	"ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
	"ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
	"ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
	"ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
	"ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL

	// Load all data to the SIMD registers with the least number of instructions.
	"vld1.f32 { " READ1_RANGE " }, [r0]!" NL
	"vld1.f32 { " READ2_RANGE " }, [r0]!" NL
	"vld1.f32 { " READ3_RANGE " }, [r0]!" NL
	"vld1.s16 {" READ4_RANGE "}, [r1]!" NL
	"vld1.s16 {" READ5_RANGE "}, [r1]!" NL

	// Initializing local variables.
	"mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
	"add " SCANLINE_R ", " SCANLINE_R ", #8" NL
	"add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
	"add " PIXELS_R ", " PIXELS_R ", #3" NL
	"mov r0, #0" NL
	"vmov.f32 " CONST_ZERO_S ", r0" NL
	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
	"vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
	"mov " RESET_WIDTH_R ", " WIDTH_R NL

	".mainLoop:" NL
	"mov r3, #3" NL
	"vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL

	".scanline:" NL
	// The ROW registers are storing the alpha channel of the last three pixels.
	// The alpha channel is stored as signed short (sint16) values. The fourth value
	// is garbage. The following instructions are shifting out the unnecessary alpha
	// values and load the next ones.
	"ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
	"ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
	"ldrb r2, [" PIXELS_R "], #4" NL
	"vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
	"vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
	"vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
	"vmov.s16 " TOP_ROW_D "[1], r0" NL
	"vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
	"vmov.s16 " BOTTOM_ROW_D "[1], r1" NL

	// The two border pixels (rightmost and leftmost) are skipped when
	// the next scanline is reached. It also jumps, when the algorithm
	// is started, and the first free alpha values are loaded to each row.
	"subs r3, r3, #1" NL
	"bne .scanline" NL

	// The light vector goes to TMP1_Q. It is constant in case of distant light.
	// The fourth value contains the length of the light vector.
	"tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT \| FLAG_SPOT_LIGHT) NL
	"beq .distantLight" NL

	"vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
	"vmov.f32 " POSITION_Z_S ", r3" NL
	"vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
	"vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL

	"vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
	GET_LENGTH(TMP1, TMP2)

	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
	"bne .cosineOfAngle" NL
	".visiblePixel:" NL

	// \| -1 0 1 \| \| -1 -2 -1 \|
	// X = \| -2 0 2 \| Y = \| 0 0 0 \|
	// \| -1 0 1 \| \| 1 2 1 \|

	// Multiply the alpha values by the X and Y matrices.

	// Moving the 8 alpha value to TMP3.
	"vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
	"vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL

	"vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
	"vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
	"vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
	"vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
	"vmov.s16 r0, " TMP2_D0 "[0]" NL

	"vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
	"vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
	"vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
	"vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
	"vmov.s16 r1, " TMP2_D0 "[0]" NL

	// r0 and r1 contains the X and Y coordinates of the
	// normal vector, respectively.

	// Calculating the spot light strength.
	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
	"beq .endLight" NL

	"vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
	"tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
	"beq .coneExpPowf" NL
	".coneExpPowfFinished:" NL

	// Smoothing the cone edge if necessary.
	"vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
	"fmstat" NL
	"bhi .cutOff" NL
	".cutOffFinished:" NL

	"vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
	"vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL

	".endLight:" NL
	// Summarize:
	// r0 and r1 contains the normalVector.
	// TMP1_Q contains the light vector and its length.
	// COLOR_Q contains the color of the light vector.

	// Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
	"orrs r2, r0, r1" NL
	"bne .normalVectorIsNonZero" NL

	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
	"bne .specularLight1" NL

	// Calculate diffuse light strength.
	MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
	"b .lightStrengthCalculated" NL

	".specularLight1:" NL
	// Calculating specular light strength.
	"vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
	GET_LENGTH(TMP1, TMP2)

	// When the exponent is 1, we don't need to call an expensive powf function.
	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
	"vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
	"beq .specularExpPowf" NL

	MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
	"b .lightStrengthCalculated" NL

	".normalVectorIsNonZero:" NL
	// Normal vector goes to TMP2, and its length is calculated as well.
	"vmov.s32 " TMP2_S0 ", r0" NL
	"vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
	"vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
	"vmov.s32 " TMP2_S1 ", r1" NL
	"vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
	"vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
	"vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
	GET_LENGTH(TMP2, TMP3)

	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
	"bne .specularLight2" NL

	// Calculating diffuse light strength.
	DOT_PRODUCT(TMP3, TMP2, TMP1)
	MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
	"b .lightStrengthCalculated" NL

	".specularLight2:" NL
	// Calculating specular light strength.
	"vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
	GET_LENGTH(TMP1, TMP3)
	DOT_PRODUCT(TMP3, TMP2, TMP1)

	// When the exponent is 1, we don't need to call an expensive powf function.
	"tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
	"vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
	"beq .specularExpPowf" NL
	MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)

	".lightStrengthCalculated:" NL
	// TMP2_S1 contains the light strength. Clamp it to [0, 1]
	"vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
	"vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
	"vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
	"vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
	"vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
	// The color values are stored in-place.
	"strb r2, [" PIXELS_R ", #-11]" NL
	"strb r3, [" PIXELS_R ", #-10]" NL
	"vmov.u32 r2, " TMP3_S2 NL
	"strb r2, [" PIXELS_R ", #-9]" NL

	// Continue to the next pixel.
	".blackPixel:" NL
	"vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
	"mov r3, #1" NL
	"subs " WIDTH_R ", " WIDTH_R ", #1" NL
	"bne .scanline" NL

	// If the end of the scanline is reached, we continue
	// to the next scanline.
	"vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
	"mov " WIDTH_R ", " RESET_WIDTH_R NL
	"subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
	"bne .mainLoop" NL

	// Return.
	"vldmia sp!, {d8-d15}" NL
	"ldmia sp!, {r4-r8, r10, r11, pc}" NL

	".distantLight:" NL
	// In case of distant light, the light vector is constant,
	// we simply copy it.
	"vmov.f32 " TMP1_Q ", " LIGHT_Q NL
	"b .visiblePixel" NL

	".cosineOfAngle:" NL
	// If the pixel is outside of the cone angle, it is simply a black pixel.
	DOT_PRODUCT(TMP3, TMP1, DIRECTION)
	"vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
	"vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
	"fmstat" NL
	"bls .visiblePixel" NL
	"mov r0, #0" NL
	"strh r0, [" PIXELS_R ", #-11]" NL
	"strb r0, [" PIXELS_R ", #-9]" NL
	"b .blackPixel" NL

	".cutOff:" NL
	// Smoothing the light strength on the cone edge.
	"vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
	"vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
	"vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
	"b .cutOffFinished" NL

	".coneExpPowf:" NL
	POWF(TMP3_S1, CONE_EXPONENT_R)
	"b .coneExpPowfFinished" NL

	".specularExpPowf:" NL
	POWF(TMP2_S1, SPECULAR_EXPONENT_R)
	"tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
	"vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
	"b .lightStrengthCalculated" NL
	); // NOLINT

	// Converts a power exponent into the 12-bit coefficient returned to the
	// caller for the hand-written powf approximation.
	//
	// The returned integer is the exponent expressed in binary fixed point:
	// bit 11 is set when the exponent contains 32, bit 10 for 16, ... bit 6
	// for 1, bit 5 for 1/2, ... and bit 0 for 1/64. For an exponent in
	// [0, 64) this equals floor(exponent * 64).
	//
	// exponent: the requested power. Negative values are replaced by
	//           1 / (-exponent); values above 63.99 are clamped to 63.99.
	// Returns:  the 12-bit coefficient mask described above.
	int FELighting::getPowerCoefficients(float exponent)
	{
	    // Calling a powf function from the assembly code would require saving
	    // and reloading a lot of NEON registers. Since the base is in the
	    // range [0..1] and only 8 bit precision is required, we use our own
	    // powf function. This is probably not the best, but it uses only a
	    // few registers and gives us enough precision (modifying the exponent
	    // field directly would also be possible).

	    // First, we limit the exponent to a maximum of 64, which gives us
	    // enough precision. We split the exponent into an integer and a
	    // fraction part, since a^x = (a^y)*(a^z) where x = y+z. The integer
	    // exponent of the power is estimated by squaring, and the fraction
	    // exponent of the power is estimated by square root assembly
	    // instructions.
	    if (exponent < 0)
	        exponent = 1 / (-exponent);

	    if (exponent > 63.99)
	        exponent = 63.99;

	    // Normalize to [0, 1) so that each doubling below exposes the next
	    // binary digit, most significant first.
	    exponent /= 64;

	    int result = 0;
	    for (int i = 11; i >= 0; --i) {
	        exponent *= 2;
	        if (exponent >= 1) {
	            result |= 1 << i;
	            exponent -= 1;
	        }
	    }
	    return result;
	}

	} // namespace WebCore

	#endif // CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC_COMPATIBLE)