/*
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Apple Inc. ("Apple") nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#if ENABLE(WEB_AUDIO)
#include "SincResampler.h"
#include "AudioBus.h"
#include "AudioUtilities.h"
#include <wtf/MathExtras.h>
#if USE(ACCELERATE)
#include <Accelerate/Accelerate.h>
#elif CPU(X86_SSE2)
#include <xmmintrin.h>
#elif HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif
// Initial input buffer layout, dividing into regions r0 to r4 (note: r0, r3
// and r4 will move after the first load):
//
// |----------------|----------------------------------------------------------------|----------------|
//
//                                            m_requestFrames
//                   <-------------------------------------------------------------------------------->
//                                         r0 (during first load)
//
//  kernelSize / 2     kernelSize / 2                                kernelSize / 2      kernelSize / 2
// <---------------> <--------------->                             <---------------> <--------------->
//        r1                 r2                                            r3                 r4
//
//                        m_blockSize == r4 - r2
//                   <--------------------------------------->
//
//                                                m_requestFrames
//                                     <------------------ ... ----------------->
//                                           r0 (during second load)
//
// On the second request r0 slides to the right by kernelSize / 2 and r3, r4
// and m_blockSize are reinitialized via step (3) in the algorithm below.
//
// These new regions remain constant until a Flush() occurs. While complicated,
// this allows us to reduce jitter by always requesting the same amount from the
// provided callback.
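//
// As a concrete illustration (these numbers are assumptions for the example,
// not values fixed by this file: m_requestFrames = 128, a typical render
// quantum, with kernelSize = 32):
//
//   m_inputBuffer holds 128 + 32 = 160 frames
//   r1 = m_inputBuffer                  (frames [0, 16))
//   r2 = r0 = m_inputBuffer + 16        (r0 spans frames [16, 144) on the first load)
//   r3 = r0 + 128 - 32 = m_inputBuffer + 112
//   r4 = r0 + 128 - 16 = m_inputBuffer + 128
//   m_blockSize = r4 - r2 = 112 frames
//
// After the first wrap, r0 slides to m_inputBuffer + 32, r4 moves to
// m_inputBuffer + 144 (its half-kernel ending at frame 160), and m_blockSize
// grows to 128 frames.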
// The Algorithm:
//
// 1) Allocate m_inputBuffer of size m_requestFrames + kernelSize; this ensures
// there's enough room to read m_requestFrames from the callback into region
// r0 (which will move between the first and subsequent passes).
//
// 2) Let r1, r2 each represent half the kernel centered around r0:
//
// r0 = m_inputBuffer + kernelSize / 2
// r1 = m_inputBuffer
// r2 = r0
//
// r0 is always m_requestFrames in size. r1, r2 are kernelSize / 2 in
// size. r1 must be zero initialized to avoid convolution with garbage (see
// step (5) for why).
//
// 3) Let r3, r4 each represent half the kernel right aligned with the end of
// r0 and choose m_blockSize as the distance in frames between r4 and r2:
//
// r3 = r0 + m_requestFrames - kernelSize
// r4 = r0 + m_requestFrames - kernelSize / 2
// m_blockSize = r4 - r2 = m_requestFrames - kernelSize / 2
//
// 4) Consume m_requestFrames frames into r0.
//
// 5) Position kernel centered at start of r2 and generate output frames until
// the kernel is centered at the start of r4 or we've finished generating
// all the output frames.
//
// 6) Wrap the leftover data from r3 to r1 and from r4 to r2.
//
// 7) If we're on the second load, in order to avoid overwriting the frames we
// just wrapped from r4 we need to slide r0 to the right by the size of
// r4, which is kernelSize / 2:
//
// r0 = r0 + kernelSize / 2 = m_inputBuffer + kernelSize
//
// r3, r4, and m_blockSize then need to be reinitialized, so goto (3).
//
// 8) Else, if we're not on the second load, goto (4).
//
// Note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
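//
// With this layout, each refill of r0 with m_requestFrames frames lets roughly
// m_blockSize / m_scaleFactor output frames be produced before the next refill;
// this is the m_chunkSize computed in updateRegions() below. For illustration
// (assumed values, as in the example above), with m_blockSize = 112 and
// m_scaleFactor = 44100.0 / 48000.0, about 121 output frames are produced per
// 128-frame input request.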
namespace WebCore {
constexpr unsigned kernelSize { 32 };
constexpr unsigned numberOfKernelOffsets { 32 };
constexpr unsigned kernelStorageSize { kernelSize * (numberOfKernelOffsets + 1) };
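// Kernels are stored for numberOfKernelOffsets + 1 sub-sample offsets so that
// convolve() can always read the "next" kernel (k2 = k1 + kernelSize) when
// interpolating between adjacent offsets. With the values above the storage is
// 32 * 33 = 1056 floats.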
static size_t calculateChunkSize(unsigned blockSize, double scaleFactor)
{
return blockSize / scaleFactor;
}
SincResampler::SincResampler(double scaleFactor, unsigned requestFrames, Function<void(float* buffer, size_t framesToProcess)>&& provideInput)
: m_scaleFactor(scaleFactor)
, m_kernelStorage(kernelStorageSize)
, m_requestFrames(requestFrames)
, m_provideInput(WTFMove(provideInput))
, m_inputBuffer(m_requestFrames + kernelSize) // See input buffer layout above.
, m_r1(m_inputBuffer.data())
, m_r2(m_inputBuffer.data() + kernelSize / 2)
{
ASSERT(m_provideInput);
ASSERT(m_requestFrames > 0);
updateRegions(false);
ASSERT(m_blockSize > kernelSize);
initializeKernel();
}
void SincResampler::updateRegions(bool isSecondLoad)
{
// Setup various region pointers in the buffer (see diagram above). If we're
// on the second load we need to slide m_r0 to the right by kernelSize / 2.
m_r0 = m_inputBuffer.data() + (isSecondLoad ? kernelSize : kernelSize / 2);
m_r3 = m_r0 + m_requestFrames - kernelSize;
m_r4 = m_r0 + m_requestFrames - kernelSize / 2;
m_blockSize = m_r4 - m_r2;
m_chunkSize = calculateChunkSize(m_blockSize, m_scaleFactor);
// m_r1 at the beginning of the buffer.
ASSERT(m_r1 == m_inputBuffer.data());
// m_r1 left of m_r2, m_r3 left of m_r4, and the two regions have equal size.
ASSERT((m_r2 - m_r1) == (m_r4 - m_r3));
// m_r2 left of m_r3.
ASSERT(m_r2 <= m_r3);
}
void SincResampler::initializeKernel()
{
// Blackman window parameters.
double alpha = 0.16;
double a0 = 0.5 * (1.0 - alpha);
double a1 = 0.5;
double a2 = 0.5 * alpha;
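    // With alpha = 0.16 these are the standard Blackman coefficients
    // a0 = 0.42, a1 = 0.5, a2 = 0.08, giving the window
    // w(x) = 0.42 - 0.5 * cos(2 * pi * x) + 0.08 * cos(4 * pi * x).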
// sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
// The sinc function is an idealized brick-wall filter, but since we're windowing it the
// transition from pass to stop does not happen right away. So we should adjust the
// lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
// FIXME: this value is empirical and to be more exact should vary depending on kernelSize.
sincScaleFactor *= 0.9;
int n = kernelSize;
int halfSize = n / 2;
// Generates a set of windowed sinc() kernels.
// We generate a range of sub-sample offsets from 0.0 to 1.0.
for (unsigned offsetIndex = 0; offsetIndex <= numberOfKernelOffsets; ++offsetIndex) {
double subsampleOffset = static_cast<double>(offsetIndex) / numberOfKernelOffsets;
for (int i = 0; i < n; ++i) {
// Compute the sinc() with offset.
double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
double sinc = !s ? 1.0 : sin(s) / s;
sinc *= sincScaleFactor;
// Compute Blackman window, matching the offset of the sinc().
double x = (i - subsampleOffset) / n;
double window = a0 - a1 * cos(2.0 * piDouble * x) + a2 * cos(4.0 * piDouble * x);
// Window the sinc() function and store at the correct offset.
m_kernelStorage[i + offsetIndex * kernelSize] = sinc * window;
}
}
}
void SincResampler::processBuffer(const float* source, float* destination, unsigned numberOfSourceFrames, double scaleFactor)
{
SincResampler resampler(scaleFactor, AudioUtilities::renderQuantumSize, [source, numberOfSourceFrames](float* buffer, size_t framesToProcess) mutable {
// Clamp to number of frames available and zero-pad.
size_t framesToCopy = std::min<size_t>(numberOfSourceFrames, framesToProcess);
memcpy(buffer, source, sizeof(float) * framesToCopy);
// Zero-pad if necessary.
if (framesToCopy < framesToProcess)
memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
numberOfSourceFrames -= framesToCopy;
source += framesToCopy;
});
unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / scaleFactor);
unsigned remaining = numberOfDestinationFrames;
while (remaining) {
unsigned framesThisTime = std::min<unsigned>(remaining, AudioUtilities::renderQuantumSize);
resampler.process(destination, framesThisTime);
destination += framesThisTime;
remaining -= framesThisTime;
}
}
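// Illustrative one-shot usage of the helper above (a sketch, assuming the
// declaration in SincResampler.h; |source| and |sourceFrames| are placeholders
// and buffer sizing is the caller's responsibility):
//
//     // Resample a mono 44.1 kHz buffer to 48 kHz.
//     double scaleFactor = 44100.0 / 48000.0;
//     Vector<float> destination(static_cast<size_t>(sourceFrames / scaleFactor));
//     SincResampler::processBuffer(source, destination.data(), sourceFrames, scaleFactor);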
void SincResampler::process(float* destination, size_t framesToProcess)
{
unsigned numberOfDestinationFrames = framesToProcess;
// Step (1)
// Prime the input buffer at the start of the input stream.
if (!m_isBufferPrimed) {
m_provideInput(m_r0, m_requestFrames);
m_isBufferPrimed = true;
}
// Step (2)
while (numberOfDestinationFrames) {
while (m_virtualSourceIndex < m_blockSize) {
// m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
double virtualOffsetIndex = subsampleRemainder * numberOfKernelOffsets;
int offsetIndex = static_cast<int>(virtualOffsetIndex);
float* k1 = m_kernelStorage.data() + offsetIndex * kernelSize;
float* k2 = k1 + kernelSize;
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be true so long as kernelSize is a multiple of 16.
ASSERT(!(reinterpret_cast<uintptr_t>(k1) & 0x0F));
ASSERT(!(reinterpret_cast<uintptr_t>(k2) & 0x0F));
// Initialize input pointer based on quantized m_virtualSourceIndex.
float* inputP = m_r1 + sourceIndexI;
// Figure out how much to weight each kernel's "convolution".
double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
*destination++ = convolve(inputP, k1, k2, kernelInterpolationFactor);
// Advance the virtual index.
m_virtualSourceIndex += m_scaleFactor;
--numberOfDestinationFrames;
if (!numberOfDestinationFrames)
return;
}
// Wrap back around to the start.
ASSERT(m_virtualSourceIndex >= m_blockSize);
m_virtualSourceIndex -= m_blockSize;
// Step (3) Copy r3, r4 to r1, r2.
// This wraps the last input frames back to the start of the buffer.
memcpy(m_r1, m_r3, sizeof(float) * kernelSize);
// Step (4) -- Reinitialize regions if necessary.
if (m_r0 == m_r2)
updateRegions(true);
// Step (5)
// Refresh the buffer with more input.
m_provideInput(m_r0, m_requestFrames);
}
}
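// Illustrative streaming usage (a sketch; the callback body and frame counts
// are placeholders): construct the resampler with a callback that supplies
// input on demand, then pull output one render quantum at a time.
//
//     SincResampler resampler(44100.0 / 48000.0, 128, [](float* buffer, size_t framesToProcess) {
//         // Fill |buffer| with the next |framesToProcess| source frames (zero-pad at end of stream).
//     });
//     float output[128];
//     resampler.process(output, 128); // Call once per 128-frame output quantum.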
float SincResampler::convolve(const float* inputP, const float* k1, const float* k2, float kernelInterpolationFactor)
{
#if USE(ACCELERATE)
float sum1;
float sum2;
vDSP_dotpr(inputP, 1, k1, 1, &sum1, kernelSize);
vDSP_dotpr(inputP, 1, k2, 1, &sum2, kernelSize);
// Linearly interpolate the two "convolutions".
return (1.0f - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
#elif CPU(X86_SSE2)
__m128 m_input;
__m128 m_sums1 = _mm_setzero_ps();
__m128 m_sums2 = _mm_setzero_ps();
// Based on |inputP| alignment, we need to use loadu or load.
if (reinterpret_cast<uintptr_t>(inputP) & 0x0F) {
for (unsigned i = 0; i < kernelSize; i += 4) {
m_input = _mm_loadu_ps(inputP + i);
m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
}
} else {
for (unsigned i = 0; i < kernelSize; i += 4) {
m_input = _mm_load_ps(inputP + i);
m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));
m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));
}
}
// Linearly interpolate the two "convolutions".
m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0f - kernelInterpolationFactor));
m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernelInterpolationFactor));
m_sums1 = _mm_add_ps(m_sums1, m_sums2);
// Sum components together.
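    // _mm_movehl_ps duplicates the upper two lanes into the lower two; after the add the low
    // lanes hold two partial sums, and the final shuffle + add_ss collapses them into lane 0.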
float result;
m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);
_mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(m_sums2, m_sums2, 1)));
return result;
#elif HAVE(ARM_NEON_INTRINSICS)
float32x4_t m_input;
float32x4_t m_sums1 = vmovq_n_f32(0);
float32x4_t m_sums2 = vmovq_n_f32(0);
const float* upper = inputP + kernelSize;
for (; inputP < upper; ) {
m_input = vld1q_f32(inputP);
inputP += 4;
m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1));
k1 += 4;
m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2));
k2 += 4;
}
// Linearly interpolate the two "convolutions".
m_sums1 = vmlaq_f32(vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernelInterpolationFactor)), m_sums2, vmovq_n_f32(kernelInterpolationFactor));
// Sum components together.
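    // vget_high/vget_low split the vector into two 2-lane halves; vadd_f32 adds them
    // pairwise and vpadd_f32 folds the remaining two lanes so lane 0 holds the total.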
float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));
return vget_lane_f32(vpadd_f32(m_half, m_half), 0);
#else
float sum1 = 0;
float sum2 = 0;
// Generate a single output sample.
int n = kernelSize;
while (n--) {
sum1 += *inputP * *k1++;
sum2 += *inputP++ * *k2++;
}
// Linearly interpolate the two "convolutions".
return (1.0f - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
#endif
}
} // namespace WebCore
#endif // ENABLE(WEB_AUDIO)