/*
* Copyright (C) 2010, Google Inc. All rights reserved.
* Copyright (C) 2020, Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#if ENABLE(WEB_AUDIO)
#include "AudioUtilities.h"
#include "VectorMath.h"
#if USE(ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif
#if CPU(X86_SSE2)
#include <emmintrin.h>
#endif
#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif
#include <algorithm>
#include <math.h>
#include <string.h>
namespace WebCore {
namespace VectorMath {
#if USE(ACCELERATE)
// On Apple platforms we use the highly optimized routines in Accelerate.framework.
void multiplyByScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vsmul(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
}
void add(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vadd(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}
void substract(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
// vDSP_vsub computes C[n] = A[n] - B[n], where B is the *first* vector argument, so inputVector2
// must be passed first to get outputVector[k] = inputVector1[k] - inputVector2[k], matching the
// non-Accelerate implementation below.
vDSP_vsub(inputVector2, 1, inputVector1, 1, outputVector, 1, numberOfElementsToProcess);
}
void addScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vsadd(inputVector, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
}
void multiply(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vmul(inputVector1, 1, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}
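// vDSP_vintb performs linear interpolation: outputVector[k] = inputVector1[k] + interpolationFactor * (inputVector2[k] - inputVector1[k]).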
void interpolate(const float* inputVector1, float* inputVector2, float interpolationFactor, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vintb(inputVector1, 1, inputVector2, 1, &interpolationFactor, outputVector, 1, numberOfElementsToProcess);
}
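// Element-wise complex multiplication; the trailing 1 passed to vDSP_zvmul selects ordinary (non-conjugate) multiplication.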
void multiplyComplex(const float* realVector1, const float* imagVector1, const float* realVector2, const float* imag2P, float* realOutputVector, float* imagDestP, size_t numberOfElementsToProcess)
{
DSPSplitComplex sc1;
DSPSplitComplex sc2;
DSPSplitComplex dest;
sc1.realp = const_cast<float*>(realVector1);
sc1.imagp = const_cast<float*>(imagVector1);
sc2.realp = const_cast<float*>(realVector2);
sc2.imagp = const_cast<float*>(imag2P);
dest.realp = realOutputVector;
dest.imagp = imagDestP;
vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, numberOfElementsToProcess, 1);
}
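// vDSP_vsma computes outputVector[k] += scalar * inputVector[k] (the destination is also the addend).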
void multiplyByScalarThenAddToOutput(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vsma(inputVector, 1, &scalar, outputVector, 1, outputVector, 1, numberOfElementsToProcess);
}
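// outputVector[k] = scalar * inputVector1[k] + inputVector2[k].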
void multiplyByScalarThenAddToVector(const float* inputVector1, float scalar, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vsma(inputVector1, 1, &scalar, inputVector2, 1, outputVector, 1, numberOfElementsToProcess);
}
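// vDSP_vasm computes outputVector[k] = (inputVector1[k] + inputVector2[k]) * scalar.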
void addVectorsThenMultiplyByScalar(const float* inputVector1, const float* inputVector2, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vasm(inputVector1, 1, inputVector2, 1, &scalar, outputVector, 1, numberOfElementsToProcess);
}
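// vDSP_maxmgv returns the maximum magnitude (absolute value) over the vector.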
float maximumMagnitude(const float* inputVector, size_t numberOfElementsToProcess)
{
float maximumValue = 0;
vDSP_maxmgv(inputVector, 1, &maximumValue, numberOfElementsToProcess);
return maximumValue;
}
float sumOfSquares(const float* inputVector, size_t numberOfElementsToProcess)
{
float sum = 0;
vDSP_svesq(const_cast<float*>(inputVector), 1, &sum, numberOfElementsToProcess);
return sum;
}
void clamp(const float* inputVector, float minimum, float maximum, float* outputVector, size_t numberOfElementsToProcess)
{
vDSP_vclip(const_cast<float*>(inputVector), 1, &minimum, &maximum, outputVector, 1, numberOfElementsToProcess);
}
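// vDSP_vdbcon with flag 1 converts amplitude to decibels: outputVector[k] = 20 * log10(inputVector[k] / reference).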
void linearToDecibels(const float* inputVector, float* outputVector, size_t numberOfElementsToProcess)
{
float reference = 1;
vDSP_vdbcon(inputVector, 1, &reference, outputVector, 1, numberOfElementsToProcess, 1);
}
#else
static inline bool is16ByteAligned(const float* vector)
{
return !(reinterpret_cast<uintptr_t>(vector) & 0x0F);
}
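// The SIMD implementations below follow a common pattern: process leading frames one at a time until the
// primary input pointer is 16-byte aligned, handle groups of four floats with SSE or NEON, then finish the
// remaining tail frames with scalar code.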
void multiplyByScalarThenAddToVector(const float* inputVector1, float scalar, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
multiplyByScalar(inputVector1, scalar, outputVector, numberOfElementsToProcess);
add(outputVector, inputVector2, outputVector, numberOfElementsToProcess);
}
void multiplyByScalarThenAddToOutput(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
#if CPU(X86_SSE2)
// If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector) && n) {
*outputVector += scalar * *inputVector;
inputVector++;
outputVector++;
n--;
}
// Now that inputVector is aligned, use SSE.
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
__m128 pSource;
__m128 dest;
__m128 temp;
__m128 mScale = _mm_set_ps1(scalar);
bool destAligned = is16ByteAligned(outputVector);
#define SSE2_MULT_ADD(loadInstr, storeInstr) \
while (outputVector < endP) \
{ \
pSource = _mm_load_ps(inputVector); \
temp = _mm_mul_ps(pSource, mScale); \
dest = _mm_##loadInstr##_ps(outputVector); \
dest = _mm_add_ps(dest, temp); \
_mm_##storeInstr##_ps(outputVector, dest); \
inputVector += 4; \
outputVector += 4; \
}
if (destAligned)
SSE2_MULT_ADD(load, store)
else
SSE2_MULT_ADD(loadu, storeu)
n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
float32x4_t k = vdupq_n_f32(scalar);
while (outputVector < endP) {
float32x4_t source = vld1q_f32(inputVector);
float32x4_t dest = vld1q_f32(outputVector);
dest = vmlaq_f32(dest, source, k);
vst1q_f32(outputVector, dest);
inputVector += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector += *inputVector * scalar;
++inputVector;
++outputVector;
}
}
void multiplyByScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
#if CPU(X86_SSE2)
// If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector) && n) {
*outputVector = scalar * *inputVector;
inputVector++;
outputVector++;
n--;
}
// Now that the inputVector address is aligned, use SSE.
size_t group = n / 4;
__m128 mScale = _mm_set_ps1(scalar);
__m128* pSource;
__m128* pDest;
__m128 dest;
if (!is16ByteAligned(outputVector)) {
while (group--) {
pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
dest = _mm_mul_ps(*pSource, mScale);
_mm_storeu_ps(outputVector, dest);
inputVector += 4;
outputVector += 4;
}
} else {
while (group--) {
pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
pDest = reinterpret_cast<__m128*>(outputVector);
*pDest = _mm_mul_ps(*pSource, mScale);
inputVector += 4;
outputVector += 4;
}
}
// Handle the remaining frames (fewer than four) without SSE.
n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
while (outputVector < endP) {
float32x4_t source = vld1q_f32(inputVector);
vst1q_f32(outputVector, vmulq_n_f32(source, scalar));
inputVector += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector = scalar * *inputVector;
++inputVector;
++outputVector;
}
}
void addScalar(const float* inputVector, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
#if CPU(X86_SSE2)
// If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector) && n) {
*outputVector = *inputVector + scalar;
inputVector++;
outputVector++;
n--;
}
// Now that the inputVector address is aligned, use SSE.
size_t group = n / 4;
__m128 mScalar = _mm_set_ps1(scalar);
__m128* pSource;
__m128* pDest;
__m128 dest;
bool destAligned = is16ByteAligned(outputVector);
if (destAligned) { // all aligned
while (group--) {
pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
pDest = reinterpret_cast<__m128*>(outputVector);
*pDest = _mm_add_ps(*pSource, mScalar);
inputVector += 4;
outputVector += 4;
}
} else {
while (group--) {
pSource = reinterpret_cast<__m128*>(const_cast<float*>(inputVector));
dest = _mm_add_ps(*pSource, mScalar);
_mm_storeu_ps(outputVector, dest);
inputVector += 4;
outputVector += 4;
}
}
// Handle the remaining frames (fewer than four) without SSE.
n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
float32x4_t scalarVector = vdupq_n_f32(scalar);
while (outputVector < endP) {
float32x4_t source = vld1q_f32(inputVector);
vst1q_f32(outputVector, vaddq_f32(source, scalarVector));
inputVector += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector = *inputVector + scalar;
++inputVector;
++outputVector;
}
}
void add(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
#if CPU(X86_SSE2)
// If the inputVector1 address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector1) && n) {
*outputVector = *inputVector1 + *inputVector2;
inputVector1++;
inputVector2++;
outputVector++;
n--;
}
// Now that the inputVector1 address is aligned, use SSE.
size_t group = n / 4;
__m128* pSource1;
__m128* pSource2;
__m128* pDest;
__m128 source2;
__m128 dest;
bool source2Aligned = is16ByteAligned(inputVector2);
bool destAligned = is16ByteAligned(outputVector);
if (source2Aligned && destAligned) { // all aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
pDest = reinterpret_cast<__m128*>(outputVector);
*pDest = _mm_add_ps(*pSource1, *pSource2);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
} else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
dest = _mm_add_ps(*pSource1, *pSource2);
_mm_storeu_ps(outputVector, dest);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
} else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
source2 = _mm_loadu_ps(inputVector2);
pDest = reinterpret_cast<__m128*>(outputVector);
*pDest = _mm_add_ps(*pSource1, source2);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
} else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
source2 = _mm_loadu_ps(inputVector2);
dest = _mm_add_ps(*pSource1, source2);
_mm_storeu_ps(outputVector, dest);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
}
// Handle the remaining frames (fewer than four) without SSE.
n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
while (outputVector < endP) {
float32x4_t source1 = vld1q_f32(inputVector1);
float32x4_t source2 = vld1q_f32(inputVector2);
vst1q_f32(outputVector, vaddq_f32(source1, source2));
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector = *inputVector1 + *inputVector2;
++inputVector1;
++inputVector2;
++outputVector;
}
}
void substract(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
#if CPU(X86_SSE2)
// If the inputVector1 address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector1) && n) {
*outputVector = *inputVector1 - *inputVector2;
inputVector1++;
inputVector2++;
outputVector++;
n--;
}
// Now that the inputVector1 address is aligned, use SSE.
size_t group = n / 4;
__m128* pSource1;
__m128* pSource2;
__m128* pDest;
__m128 source2;
__m128 dest;
bool source2Aligned = is16ByteAligned(inputVector2);
bool destAligned = is16ByteAligned(outputVector);
if (source2Aligned && destAligned) { // all aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
pDest = reinterpret_cast<__m128*>(outputVector);
*pDest = _mm_sub_ps(*pSource1, *pSource2);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
} else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector2));
dest = _mm_sub_ps(*pSource1, *pSource2);
_mm_storeu_ps(outputVector, dest);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
} else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
source2 = _mm_loadu_ps(inputVector2);
pDest = reinterpret_cast<__m128*>(outputVector);
*pDest = _mm_sub_ps(*pSource1, source2);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
} else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
while (group--) {
pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(inputVector1));
source2 = _mm_loadu_ps(inputVector2);
dest = _mm_sub_ps(*pSource1, source2);
_mm_storeu_ps(outputVector, dest);
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
}
// Handle the remaining frames (fewer than four) without SSE.
n %= 4;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
while (outputVector < endP) {
float32x4_t source1 = vld1q_f32(inputVector1);
float32x4_t source2 = vld1q_f32(inputVector2);
vst1q_f32(outputVector, vsubq_f32(source1, source2));
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector = *inputVector1 - *inputVector2;
++inputVector1;
++inputVector2;
++outputVector;
}
}
void interpolate(const float* inputVector1, float* inputVector2, float interpolationFactor, float* outputVector, size_t numberOfElementsToProcess)
{
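// Note: inputVector2 is used as scratch space and is overwritten below, which is why it is passed as a non-const pointer.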
if (inputVector1 != outputVector)
memcpy(outputVector, inputVector1, numberOfElementsToProcess * sizeof(float));
// inputVector2[k] = inputVector2[k] - inputVector1[k]
substract(inputVector2, inputVector1, inputVector2, numberOfElementsToProcess);
// outputVector[k] = outputVector[k] + interpolationFactor * inputVector2[k]
// = inputVector1[k] + interpolationFactor * (inputVector2[k] - inputVector1[k]);
multiplyByScalarThenAddToOutput(inputVector2, interpolationFactor, outputVector, numberOfElementsToProcess);
}
void multiply(const float* inputVector1, const float* inputVector2, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
#if CPU(X86_SSE2)
// If the inputVector1 address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector1) && n) {
*outputVector = *inputVector1 * *inputVector2;
inputVector1++;
inputVector2++;
outputVector++;
n--;
}
// Now that the inputVector1 address is aligned, use SSE.
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
__m128 pSource1;
__m128 pSource2;
__m128 dest;
bool source2Aligned = is16ByteAligned(inputVector2);
bool destAligned = is16ByteAligned(outputVector);
#define SSE2_MULT(loadInstr, storeInstr) \
while (outputVector < endP) \
{ \
pSource1 = _mm_load_ps(inputVector1); \
pSource2 = _mm_##loadInstr##_ps(inputVector2); \
dest = _mm_mul_ps(pSource1, pSource2); \
_mm_##storeInstr##_ps(outputVector, dest); \
inputVector1 += 4; \
inputVector2 += 4; \
outputVector += 4; \
}
if (source2Aligned && destAligned) // Both aligned.
SSE2_MULT(load, store)
else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
SSE2_MULT(load, storeu)
else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
SSE2_MULT(loadu, store)
else // Neither aligned.
SSE2_MULT(loadu, storeu)
n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
while (outputVector < endP) {
float32x4_t source1 = vld1q_f32(inputVector1);
float32x4_t source2 = vld1q_f32(inputVector2);
vst1q_f32(outputVector, vmulq_f32(source1, source2));
inputVector1 += 4;
inputVector2 += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector = *inputVector1 * *inputVector2;
++inputVector1;
++inputVector2;
++outputVector;
}
}
void multiplyComplex(const float* realVector1, const float* imagVector1, const float* realVector2, const float* imag2P, float* realOutputVector, float* imagDestP, size_t numberOfElementsToProcess)
{
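// Element-wise complex multiply: (r1 + i*i1) * (r2 + i*i2) = (r1*r2 - i1*i2) + i*(r1*i2 + i1*r2).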
unsigned i = 0;
#if CPU(X86_SSE2)
// Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
// Otherwise, fall through to the scalar code below.
if (is16ByteAligned(realVector1) && is16ByteAligned(imagVector1) && is16ByteAligned(realVector2) && is16ByteAligned(imag2P) && is16ByteAligned(realOutputVector) && is16ByteAligned(imagDestP)) {
unsigned endSize = numberOfElementsToProcess - numberOfElementsToProcess % 4;
while (i < endSize) {
__m128 real1 = _mm_load_ps(realVector1 + i);
__m128 real2 = _mm_load_ps(realVector2 + i);
__m128 imag1 = _mm_load_ps(imagVector1 + i);
__m128 imag2 = _mm_load_ps(imag2P + i);
__m128 real = _mm_mul_ps(real1, real2);
real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
__m128 imag = _mm_mul_ps(real1, imag2);
imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
_mm_store_ps(realOutputVector + i, real);
_mm_store_ps(imagDestP + i, imag);
i += 4;
}
}
#elif HAVE(ARM_NEON_INTRINSICS)
unsigned endSize = numberOfElementsToProcess - numberOfElementsToProcess % 4;
while (i < endSize) {
float32x4_t real1 = vld1q_f32(realVector1 + i);
float32x4_t real2 = vld1q_f32(realVector2 + i);
float32x4_t imag1 = vld1q_f32(imagVector1 + i);
float32x4_t imag2 = vld1q_f32(imag2P + i);
float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
vst1q_f32(realOutputVector + i, realResult);
vst1q_f32(imagDestP + i, imagResult);
i += 4;
}
#endif
for (; i < numberOfElementsToProcess; ++i) {
// Read the inputs and compute the results before storing them, in case the
// destination is the same as one of the sources.
realOutputVector[i] = realVector1[i] * realVector2[i] - imagVector1[i] * imag2P[i];
imagDestP[i] = realVector1[i] * imag2P[i] + imagVector1[i] * realVector2[i];
}
}
float sumOfSquares(const float* inputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
float sum = 0;
#if CPU(X86_SSE2)
// If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector) && n) {
float sample = *inputVector;
sum += sample * sample;
inputVector++;
n--;
}
// Now that inputVector is aligned, use SSE.
size_t tailFrames = n % 4;
const float* endP = inputVector + n - tailFrames;
__m128 source;
__m128 mSum = _mm_setzero_ps();
while (inputVector < endP) {
source = _mm_load_ps(inputVector);
source = _mm_mul_ps(source, source);
mSum = _mm_add_ps(mSum, source);
inputVector += 4;
}
// Sum the four lanes of the SSE accumulator.
const float* groupSumP = reinterpret_cast<float*>(&mSum);
sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = inputVector + n - tailFrames;
float32x4_t fourSum = vdupq_n_f32(0);
while (inputVector < endP) {
float32x4_t source = vld1q_f32(inputVector);
fourSum = vmlaq_f32(fourSum, source, source);
inputVector += 4;
}
float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
float groupSum[2];
vst1_f32(groupSum, twoSum);
sum += groupSum[0] + groupSum[1];
n = tailFrames;
#endif
while (n--) {
float sample = *inputVector;
sum += sample * sample;
++inputVector;
}
return sum;
}
float maximumMagnitude(const float* inputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
float max = 0;
#if CPU(X86_SSE2)
// If the inputVector address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
while (!is16ByteAligned(inputVector) && n) {
max = std::max(max, std::abs(*inputVector));
inputVector++;
n--;
}
// Now that inputVector is aligned, use SSE.
size_t tailFrames = n % 4;
const float* endP = inputVector + n - tailFrames;
__m128 source;
__m128 mMax = _mm_setzero_ps();
// Build a mask that clears the sign bit so absolute values can be computed without type-punning an int through a float*.
__m128 mMask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
while (inputVector < endP) {
source = _mm_load_ps(inputVector);
// Compute the absolute value by ANDing source with the mask, which clears the sign bit.
source = _mm_and_ps(source, mMask);
mMax = _mm_max_ps(mMax, source);
inputVector += 4;
}
// Get max from the SSE results.
const float* groupMaxP = reinterpret_cast<float*>(&mMax);
max = std::max(max, groupMaxP[0]);
max = std::max(max, groupMaxP[1]);
max = std::max(max, groupMaxP[2]);
max = std::max(max, groupMaxP[3]);
n = tailFrames;
#elif HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = inputVector + n - tailFrames;
float32x4_t fourMax = vdupq_n_f32(0);
while (inputVector < endP) {
float32x4_t source = vld1q_f32(inputVector);
fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
inputVector += 4;
}
float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));
float groupMax[2];
vst1_f32(groupMax, twoMax);
max = std::max(groupMax[0], groupMax[1]);
n = tailFrames;
#endif
while (n--) {
max = std::max(max, std::abs(*inputVector));
++inputVector;
}
return max;
}
void clamp(const float* inputVector, float minimum, float maximum, float* outputVector, size_t numberOfElementsToProcess)
{
size_t n = numberOfElementsToProcess;
// FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
size_t tailFrames = n % 4;
const float* endP = outputVector + n - tailFrames;
float32x4_t low = vdupq_n_f32(minimum);
float32x4_t high = vdupq_n_f32(maximum);
while (outputVector < endP) {
float32x4_t source = vld1q_f32(inputVector);
vst1q_f32(outputVector, vmaxq_f32(vminq_f32(source, high), low));
inputVector += 4;
outputVector += 4;
}
n = tailFrames;
#endif
while (n--) {
*outputVector = std::clamp(*inputVector, minimum, maximum);
++inputVector;
++outputVector;
}
}
void linearToDecibels(const float* inputVector, float* outputVector, size_t numberOfElementsToProcess)
{
for (size_t i = 0; i < numberOfElementsToProcess; ++i)
outputVector[i] = AudioUtilities::linearToDecibels(inputVector[i]);
}
void addVectorsThenMultiplyByScalar(const float* inputVector1, const float* inputVector2, float scalar, float* outputVector, size_t numberOfElementsToProcess)
{
add(inputVector1, inputVector2, outputVector, numberOfElementsToProcess);
multiplyByScalar(outputVector, scalar, outputVector, numberOfElementsToProcess);
}
#endif // USE(ACCELERATE)
} // namespace VectorMath
} // namespace WebCore
#endif // ENABLE(WEB_AUDIO)