Source/WebCore/platform/graphics/filters/FEGaussianBlur.cpp - WebKit - Git at Google

 /*
  * Copyright (C) 2004, 2005, 2006, 2007 Nikolas Zimmermann <zimmermann@kde.org>
  * Copyright (C) 2004, 2005 Rob Buis <buis@kde.org>
  * Copyright (C) 2005 Eric Seidel <eric@webkit.org>
  * Copyright (C) 2009 Dirk Schulze <krit@webkit.org>
  * Copyright (C) 2010 Igalia, S.L.
  * Copyright (C) Research In Motion Limited 2010. All rights reserved.
  * Copyright (C) 2015-2016 Apple, Inc. All rights reserved.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Library General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Library General Public License for more details.
  *
  * You should have received a copy of the GNU Library General Public License
  * along with this library; see the file COPYING.LIB.  If not, write to
  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  * Boston, MA 02110-1301, USA.
  */

 #include "config.h"
 #include "FEGaussianBlur.h"

 #include "FEGaussianBlurNEON.h"
 #include "Filter.h"
 #include "GraphicsContext.h"
 #include <wtf/text/TextStream.h>

 #if USE(ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #endif

 #include <JavaScriptCore/JSCInlines.h>
 #include <JavaScriptCore/TypedArrayInlines.h>
 #include <JavaScriptCore/Uint8ClampedArray.h>
 #include <wtf/MathExtras.h>
 #include <wtf/ParallelJobs.h>

 // Factor that turns a standard deviation into a box-blur kernel size: 3 * sqrt(2 * pi) / 4.
 // clampedToKernelSize() multiplies the deviation by this value before rounding.
 static inline float gaussianKernelFactor()
 {
     const float rootOfTwoPi = sqrtf(2 * piFloat);
     return 3 / 4.f * rootOfTwoPi;
 }

 // Upper bound applied to every computed kernel size (see clampedToKernelSize()).
 static const int gMaxKernelSize = 500;

 namespace WebCore {

 // Computes the left/right extents of the box-blur window for pass |blurIteration| (0, 1 or 2).
 // Check http://www.w3.org/TR/SVG/filters.html#feGaussianBlurElement for details.
 //
 // Pass 0 initializes both extents from |radius|. For an odd radius the later passes change nothing.
 // For an even radius the window starts one pixel short on the left, is shifted by one pixel in
 // pass 1, and grows by one pixel on the right in pass 2 (which also increments |radius|).
 // Any other value of |blurIteration| leaves all arguments untouched.
 inline void kernelPosition(int blurIteration, unsigned& radius, int& deltaLeft, int& deltaRight)
 {
     const bool radiusIsEven = !(radius % 2);

     if (!blurIteration) {
         deltaLeft = radiusIsEven ? radius / 2 - 1 : radius / 2;
         deltaRight = radius - deltaLeft;
         return;
     }

     // Only even radii need adjusting after the first pass.
     if (!radiusIsEven)
         return;

     if (blurIteration == 1) {
         ++deltaLeft;
         --deltaRight;
     } else if (blurIteration == 2) {
         ++deltaRight;
         ++radius;
     }
 }

 FEGaussianBlur::FEGaussianBlur(Filter& filter, float x, float y, EdgeModeType edgeMode)
     : FilterEffect(filter)
     , m_stdX(x)
     , m_stdY(y)
     , m_edgeMode(edgeMode)
 {
 }

 // Factory: heap-allocates a new effect and hands it to the caller wrapped in a Ref via adoptRef().
 Ref<FEGaussianBlur> FEGaussianBlur::create(Filter& filter, float x, float y, EdgeModeType edgeMode)
 {
     auto* effect = new FEGaussianBlur(filter, x, y, edgeMode);
     return adoptRef(*effect);
 }

 // Stores the standard deviation for the X axis; it is read back when the kernel width is computed.
 void FEGaussianBlur::setStdDeviationX(float x)
 {
     m_stdX = x;
 }

 // Stores the standard deviation for the Y axis; it is read back when the kernel height is computed.
 void FEGaussianBlur::setStdDeviationY(float y)
 {
     m_stdY = y;
 }

 // Stores the edge mode consulted by the blur passes and by determineAbsolutePaintRect().
 void FEGaussianBlur::setEdgeMode(EdgeModeType edgeMode)
 {
     m_edgeMode = edgeMode;
 }

 // Single box-blur pass that reads and writes only the alpha byte of each RGBA pixel.
 // |stride| is the byte step between neighboring pixels along the blur axis, |strideLine| the byte
 // step between successive lines, and |maxKernelSize| the number of leading pixels used to seed the
 // running sum. Pixels outside the line contribute nothing to the sum; every sum is divided by |dx|.
 inline void boxBlurAlphaOnly(const Uint8ClampedArray& srcPixelArray, Uint8ClampedArray& dstPixelArray,
     unsigned dx, int& dxLeft, int& dxRight, int& stride, int& strideLine, int& effectWidth, int& effectHeight, const int& maxKernelSize)
 {
     // Memory alignment is: RGBA, zero-index based, so alpha is the fourth byte of a pixel.
     const int alphaIndex = 3;
     const uint8_t* const source = srcPixelArray.data();
     uint8_t* const destination = dstPixelArray.data();

     for (int row = 0; row < effectHeight; ++row) {
         const int rowStart = row * strideLine;

         // Seed the running sum with the first |maxKernelSize| alpha values of this line.
         int alphaSum = 0;
         for (int k = 0; k < maxKernelSize; ++k) {
             const unsigned pixelStart = rowStart + k * stride;
             alphaSum += *(source + pixelStart + alphaIndex);
         }

         for (int column = 0; column < effectWidth; ++column) {
             const unsigned alphaOffset = rowStart + column * stride + alphaIndex;
             destination[alphaOffset] = static_cast<uint8_t>(alphaSum / dx);

             // Slide the window: drop the pixel leaving on the left, if it lies inside the line...
             const bool leavingPixelIsInside = column >= dxLeft;
             if (leavingPixelIsInside)
                 alphaSum -= source[alphaOffset - dxLeft * stride];

             // ...and take in the pixel entering on the right, if it lies inside the line.
             const bool enteringPixelIsInside = column + dxRight < effectWidth;
             if (enteringPixelIsInside)
                 alphaSum += source[alphaOffset + dxRight * stride];
         }
     }
 }

 // Performs one box-blur pass over RGBA pixel data along a single axis.
 //   dx                 - kernel size; every window sum is divided by it.
 //   dxLeft / dxRight   - the window for pixel x covers [x - dxLeft, x + dxRight).
 //   stride             - byte distance between neighboring pixels along the blur axis.
 //   strideLine         - byte distance between successive lines.
 //   effectWidth/Height - pixel count along the blur axis / number of lines.
 //   alphaImage         - when true the work is delegated to boxBlurAlphaOnly().
 //   edgeMode           - EDGEMODE_NONE: pixels outside the line add nothing to the sum;
 //                        any other mode: the first/last pixel of the line is repeated.
 inline void boxBlur(const Uint8ClampedArray& srcPixelArray, Uint8ClampedArray& dstPixelArray,
     unsigned dx, int dxLeft, int dxRight, int stride, int strideLine, int effectWidth, int effectHeight, bool alphaImage, EdgeModeType edgeMode)
 {
     // Nothing to blur. Bailing out here also guarantees that the edge-pixel pointers computed
     // below always refer to a real pixel of the line.
     if (effectWidth <= 0 || effectHeight <= 0)
         return;

     const int maxKernelSize = std::min(dxRight, effectWidth);
     if (alphaImage)
         return boxBlurAlphaOnly(srcPixelArray, dstPixelArray, dx, dxLeft, dxRight, stride, strideLine, effectWidth, effectHeight, maxKernelSize);

     const uint8_t* srcData = srcPixelArray.data();
     uint8_t* dstData = dstPixelArray.data();

     // Concerning the array width/length: it is Element size + Margin + Border. The number of pixels will be
     // P = width * height * channels.
     for (int y = 0; y < effectHeight; ++y) {
         int line = y * strideLine;
         int sumR = 0, sumG = 0, sumB = 0, sumA = 0;

         // Helpers operating on the four running channel sums of the current line.
         auto addPixel = [&](const uint8_t* pixel) {
             sumR += pixel[0];
             sumG += pixel[1];
             sumB += pixel[2];
             sumA += pixel[3];
         };
         auto removePixel = [&](const uint8_t* pixel) {
             sumR -= pixel[0];
             sumG -= pixel[1];
             sumB -= pixel[2];
             sumA -= pixel[3];
         };
         auto writeAverage = [&](uint8_t* pixel) {
             pixel[0] = static_cast<uint8_t>(sumR / dx);
             pixel[1] = static_cast<uint8_t>(sumG / dx);
             pixel[2] = static_cast<uint8_t>(sumB / dx);
             pixel[3] = static_cast<uint8_t>(sumA / dx);
         };

         if (edgeMode == EDGEMODE_NONE) {
             // Fill the kernel.
             for (int i = 0; i < maxKernelSize; ++i) {
                 unsigned offset = line + i * stride;
                 addPixel(srcData + offset);
             }

             // Blurring.
             for (int x = 0; x < effectWidth; ++x) {
                 unsigned pixelByteOffset = line + x * stride;
                 writeAverage(dstData + pixelByteOffset);

                 // Shift kernel.
                 if (x >= dxLeft) {
                     unsigned leftOffset = pixelByteOffset - dxLeft * stride;
                     removePixel(srcData + leftOffset);
                 }

                 if (x + dxRight < effectWidth) {
                     unsigned rightOffset = pixelByteOffset + dxRight * stride;
                     addPixel(srcData + rightOffset);
                 }
             }
         } else {
             // FIXME: Add support for 'wrap' here.
             // Get edge values for edgeMode 'duplicate'.
             const uint8_t* edgeValueLeft = srcData + line;
             const uint8_t* edgeValueRight = srcData + (line + (effectWidth - 1) * stride);

             // Fill the kernel.
             for (int i = -dxLeft; i < dxRight; ++i) {
                 if (i < 0) {
                     addPixel(edgeValueLeft);
                 } else if (i >= effectWidth) {
                     addPixel(edgeValueRight);
                 } else {
                     // The source pointer is formed only for positions inside the line. Computing it
                     // for out-of-line positions (as an unsigned offset that wraps for negative i)
                     // would create a pointer outside the buffer.
                     unsigned offset = line + i * stride;
                     addPixel(srcData + offset);
                 }
             }

             // Blurring.
             for (int x = 0; x < effectWidth; ++x) {
                 unsigned pixelByteOffset = line + x * stride;
                 writeAverage(dstData + pixelByteOffset);

                 // Shift kernel.
                 if (x < dxLeft) {
                     removePixel(edgeValueLeft);
                 } else {
                     unsigned leftOffset = pixelByteOffset - dxLeft * stride;
                     removePixel(srcData + leftOffset);
                 }

                 if (x + dxRight >= effectWidth) {
                     addPixel(edgeValueRight);
                 } else {
                     unsigned rightOffset = pixelByteOffset + dxRight * stride;
                     addPixel(srcData + rightOffset);
                 }
             }
         }
     }
 }

 #if USE(ACCELERATE)
 // Blurs |ioBuffer| in place with three vImage box convolutions using a square kernel of |kernelSize|
 // (bumped to the next odd value when even). |tempBuffer| receives the intermediate and final pass and
 // is copied back into |ioBuffer| at the end. |stride| is the number of bytes per row.
 // Returns without touching the buffers when the arguments are invalid or vImage reports an error
 // while being asked for its scratch-buffer size.
 inline void accelerateBoxBlur(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tempBuffer, unsigned kernelSize, int stride, int effectWidth, int effectHeight)
 {
     if (!ioBuffer.data() || !tempBuffer.data()) {
         ASSERT_NOT_REACHED();
         return;
     }

     if (effectWidth <= 0 || effectHeight <= 0 || stride <= 0) {
         ASSERT_NOT_REACHED();
         return;
     }

     // We must always use an odd radius.
     if (kernelSize % 2 != 1)
         kernelSize += 1;

     vImage_Buffer effectInBuffer;
     effectInBuffer.data = static_cast<void*>(ioBuffer.data());
     effectInBuffer.width = effectWidth;
     effectInBuffer.height = effectHeight;
     effectInBuffer.rowBytes = stride;

     vImage_Buffer effectOutBuffer;
     effectOutBuffer.data = tempBuffer.data();
     effectOutBuffer.width = effectWidth;
     effectOutBuffer.height = effectHeight;
     effectOutBuffer.rowBytes = stride;

     // Determine the size of a temporary buffer by calling the function first with a special flag. vImage will return
     // the size needed, or an error (which are all negative). The result is kept in the signed vImage_Error type:
     // stored in a size_t, a negative error code would look like an enormous size and slip past the check below.
     const vImage_Error tmpBufferSize = vImageBoxConvolve_ARGB8888(&effectInBuffer, &effectOutBuffer, 0, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend | kvImageGetTempBufferSize);
     if (tmpBufferSize <= 0)
         return;

     void* tmpBuffer = fastMalloc(static_cast<size_t>(tmpBufferSize));
     // Three passes: in -> out, out -> in, in -> out.
     vImageBoxConvolve_ARGB8888(&effectInBuffer, &effectOutBuffer, tmpBuffer, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend);
     vImageBoxConvolve_ARGB8888(&effectOutBuffer, &effectInBuffer, tmpBuffer, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend);
     vImageBoxConvolve_ARGB8888(&effectInBuffer, &effectOutBuffer, tmpBuffer, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend);
     WTF::fastFree(tmpBuffer);

     // The final result should be stored in ioBuffer.
     ASSERT(ioBuffer.length() == tempBuffer.length());
     memcpy(ioBuffer.data(), tempBuffer.data(), ioBuffer.length());
 }
 #endif

 // Runs three rounds of box blurring. Each round performs a horizontal pass when |kernelSizeX| is
 // non-zero and a vertical pass when |kernelSizeY| is non-zero, alternating between |ioBuffer| and
 // |tempBuffer| as source and destination. The final pixels always end up in |ioBuffer|.
 inline void standardBoxBlur(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tempBuffer, unsigned kernelSizeX, unsigned kernelSizeY, int stride, IntSize& paintSize, bool isAlphaImage, EdgeModeType edgeMode)
 {
     const int bytesPerPixel = 4;

     int leftX = 0;
     int rightX = 0;
     int leftY = 0;
     int rightY = 0;

     Uint8ClampedArray* source = &ioBuffer;
     Uint8ClampedArray* destination = &tempBuffer;

     for (int pass = 0; pass < 3; ++pass) {
         // Horizontal: neighbors are one pixel apart, lines are one row apart.
         if (kernelSizeX) {
             kernelPosition(pass, kernelSizeX, leftX, rightX);
 #if HAVE(ARM_NEON_INTRINSICS)
             if (isAlphaImage)
                 boxBlur(*source, *destination, kernelSizeX, leftX, rightX, bytesPerPixel, stride, paintSize.width(), paintSize.height(), true, edgeMode);
             else
                 boxBlurNEON(*source, *destination, kernelSizeX, leftX, rightX, bytesPerPixel, stride, paintSize.width(), paintSize.height());
 #else
             boxBlur(*source, *destination, kernelSizeX, leftX, rightX, bytesPerPixel, stride, paintSize.width(), paintSize.height(), isAlphaImage, edgeMode);
 #endif
             std::swap(source, destination);
         }

         // Vertical: the same routine with the two strides and the two dimensions exchanged.
         if (kernelSizeY) {
             kernelPosition(pass, kernelSizeY, leftY, rightY);
 #if HAVE(ARM_NEON_INTRINSICS)
             if (isAlphaImage)
                 boxBlur(*source, *destination, kernelSizeY, leftY, rightY, stride, bytesPerPixel, paintSize.height(), paintSize.width(), true, edgeMode);
             else
                 boxBlurNEON(*source, *destination, kernelSizeY, leftY, rightY, stride, bytesPerPixel, paintSize.height(), paintSize.width());
 #else
             boxBlur(*source, *destination, kernelSizeY, leftY, rightY, stride, bytesPerPixel, paintSize.height(), paintSize.width(), isAlphaImage, edgeMode);
 #endif
             std::swap(source, destination);
         }
     }

     // After the last swap |source| names the buffer holding the result; nothing to do if that is ioBuffer.
     if (source == &ioBuffer)
         return;

     ASSERT(ioBuffer.length() == source->length());
     memcpy(ioBuffer.data(), source->data(), ioBuffer.length());
 }

 // Blurs |ioBuffer| in place, using |tmpPixelArray| as scratch space. With Accelerate available, equal
 // kernel sizes combined with edge mode 'none' or 'duplicate' take the vImage path; everything else
 // goes through standardBoxBlur().
 inline void FEGaussianBlur::platformApplyGeneric(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tmpPixelArray, unsigned kernelSizeX, unsigned kernelSizeY, IntSize& paintSize)
 {
     const int rowBytes = 4 * paintSize.width();

 #if USE(ACCELERATE)
     const bool kernelIsSquare = kernelSizeX == kernelSizeY;
     if (kernelIsSquare && (m_edgeMode == EDGEMODE_NONE || m_edgeMode == EDGEMODE_DUPLICATE)) {
         accelerateBoxBlur(ioBuffer, tmpPixelArray, kernelSizeX, rowBytes, paintSize.width(), paintSize.height());
         return;
     }
 #endif

     standardBoxBlur(ioBuffer, tmpPixelArray, kernelSizeX, kernelSizeY, rowBytes, paintSize, isAlphaImage(), m_edgeMode);
 }

 // Entry point handed to ParallelJobs: unpacks one job's parameters and blurs that job's block.
 void FEGaussianBlur::platformApplyWorker(PlatformApplyParameters* parameters)
 {
     IntSize blockSize(parameters->width, parameters->height);
     auto& pixels = *parameters->ioPixelArray;
     auto& scratch = *parameters->tmpPixelArray;
     parameters->filter->platformApplyGeneric(pixels, scratch, parameters->kernelSizeX, parameters->kernelSizeY, blockSize);
 }

 // Blurs |ioBuffer| in place, using |tmpPixelArray| as scratch space.
 // Without Accelerate, the image may be split into horizontal bands that are blurred by parallel
 // jobs. Every band except the first works on its own copy of the pixels, extended by |extraHeight|
 // rows above (and, except for the last band, below) so the vertical blur sees its neighbors; the
 // bands' own rows are copied back into |ioBuffer| afterwards. If fewer than two jobs are available
 // the whole image is handled by platformApplyGeneric() on the calling thread.
 inline void FEGaussianBlur::platformApply(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tmpPixelArray, unsigned kernelSizeX, unsigned kernelSizeY, IntSize& paintSize)
 {
 #if !USE(ACCELERATE)
     int scanline = 4 * paintSize.width();
     int extraHeight = 3 * kernelSizeY * 0.5f;
     int optimalThreadNumber = (paintSize.width() * paintSize.height()) / (s_minimalRectDimension + extraHeight * paintSize.width());

     if (optimalThreadNumber > 1) {
         WTF::ParallelJobs<PlatformApplyParameters> parallelJobs(&platformApplyWorker, optimalThreadNumber);

         int jobs = parallelJobs.numberOfJobs();
         if (jobs > 1) {
             // Split the job into "blockHeight"-sized jobs but there a few jobs that need to be slightly larger since
             // blockHeight * jobs < total size. These extras are handled by the remainder "jobsWithExtra".
             const int blockHeight = paintSize.height() / jobs;
             const int jobsWithExtra = paintSize.height() % jobs;

             int currentY = 0;
             for (int job = 0; job < jobs; job++) {
                 PlatformApplyParameters& params = parallelJobs.parameter(job);
                 params.filter = this;

                 // currentY is the first row owned by this job; the block handed to the worker is
                 // padded with extraHeight rows of context on each interior side.
                 int startY = !job ? 0 : currentY - extraHeight;
                 currentY += job < jobsWithExtra ? blockHeight + 1 : blockHeight;
                 int endY = job == jobs - 1 ? currentY : currentY + extraHeight;

                 int blockSize = (endY - startY) * scanline;
                 if (!job) {
                     // The first job works directly on the caller's buffers.
                     params.ioPixelArray = &ioBuffer;
                     params.tmpPixelArray = &tmpPixelArray;
                 } else {
                     params.ioPixelArray = Uint8ClampedArray::createUninitialized(blockSize);
                     params.tmpPixelArray = Uint8ClampedArray::createUninitialized(blockSize);
                     memcpy(params.ioPixelArray->data(), ioBuffer.data() + startY * scanline, blockSize);
                 }

                 params.width = paintSize.width();
                 params.height = endY - startY;
                 params.kernelSizeX = kernelSizeX;
                 params.kernelSizeY = kernelSizeY;
             }

             parallelJobs.execute();

             // Copy together the parts of the image. Job 0 already wrote into ioBuffer, so start at the
             // first row owned by job 1, i.e. directly after job 0's rows. The destination row of a job
             // is the sum of the heights of all jobs before it; advancing currentY only after the copy
             // keeps it in step with the row ranges assigned in the setup loop above.
             currentY = jobsWithExtra > 0 ? blockHeight + 1 : blockHeight;
             for (int job = 1; job < jobs; job++) {
                 PlatformApplyParameters& params = parallelJobs.parameter(job);
                 int adjustedBlockHeight = job < jobsWithExtra ? blockHeight + 1 : blockHeight;

                 // Skip the context rows that precede the job's own rows in its private buffer.
                 int sourceOffset = extraHeight * scanline;
                 int destinationOffset = currentY * scanline;
                 int size = adjustedBlockHeight * scanline;

                 memcpy(ioBuffer.data() + destinationOffset, params.ioPixelArray->data() + sourceOffset, size);
                 currentY += adjustedBlockHeight;
             }
             return;
         }
         // Fallback to single threaded mode.
     }
 #endif

     // The selection here eventually should happen dynamically on some platforms.
     platformApplyGeneric(ioBuffer, tmpPixelArray, kernelSizeX, kernelSizeY, paintSize);
 }

 static int clampedToKernelSize(float value)
 {
     // Limit the kernel size to 500. A bigger radius won't make a big difference for the result image but
     // inflates the absolute paint rect too much. This is compatible with Firefox' behavior.
     unsigned size = std::max<unsigned>(2, static_cast<unsigned>(floorf(value * gaussianKernelFactor() + 0.5f)));
     return clampTo<int>(std::min(size, static_cast<unsigned>(gMaxKernelSize)));
 }

 // Maps each non-zero component of |stdDeviation| to a clamped kernel size. A component that is
 // zero leaves the corresponding dimension of the default-constructed result untouched.
 IntSize FEGaussianBlur::calculateUnscaledKernelSize(FloatSize stdDeviation)
 {
     ASSERT(stdDeviation.width() >= 0 && stdDeviation.height() >= 0);

     IntSize result;
     const bool blursHorizontally = stdDeviation.width();
     const bool blursVertically = stdDeviation.height();

     if (blursHorizontally)
         result.setWidth(clampedToKernelSize(stdDeviation.width()));
     if (blursVertically)
         result.setHeight(clampedToKernelSize(stdDeviation.height()));

     return result;
 }

 // Scales |stdDeviation| by the filter resolution of |filter| and converts the result to a kernel size.
 IntSize FEGaussianBlur::calculateKernelSize(const Filter& filter, FloatSize stdDeviation)
 {
     auto scaledDeviation = filter.scaledByFilterResolution(stdDeviation);
     return calculateUnscaledKernelSize(scaledDeviation);
 }

 void FEGaussianBlur::determineAbsolutePaintRect()
 {
     IntSize kernelSize = calculateKernelSize(filter(), { m_stdX, m_stdY });

     FloatRect absolutePaintRect = inputEffect(0)->absolutePaintRect();
     // Edge modes other than 'none' do not inflate the affected paint rect.
     if (m_edgeMode != EDGEMODE_NONE) {
         setAbsolutePaintRect(enclosingIntRect(absolutePaintRect));
         return;
     }

     // We take the half kernel size and multiply it with three, because we run box blur three times.
     absolutePaintRect.inflateX(3 * kernelSize.width() * 0.5f);
     absolutePaintRect.inflateY(3 * kernelSize.height() * 0.5f);

     if (clipsToBounds())
         absolutePaintRect.intersect(maxEffectRect());
     else
         absolutePaintRect.unite(maxEffectRect());

     setAbsolutePaintRect(enclosingIntRect(absolutePaintRect));
 }

 void FEGaussianBlur::platformApplySoftware()
 {
     FilterEffect* in = inputEffect(0);

     Uint8ClampedArray* resultPixelArray = createPremultipliedImageResult();
     if (!resultPixelArray)
         return;

     setIsAlphaImage(in->isAlphaImage());

     IntRect effectDrawingRect = requestedRegionOfInputImageData(in->absolutePaintRect());
     in->copyPremultipliedResult(*resultPixelArray, effectDrawingRect);

     if (!m_stdX && !m_stdY)
         return;

     IntSize kernelSize = calculateKernelSize(filter(), { m_stdX, m_stdY });
     kernelSize.scale(filter().filterScale());

     IntSize paintSize = absolutePaintRect().size();
     paintSize.scale(filter().filterScale());
     auto tmpImageData = Uint8ClampedArray::tryCreateUninitialized((paintSize.area() * 4).unsafeGet());
     if (!tmpImageData)
         return;

     platformApply(*resultPixelArray, *tmpImageData, kernelSize.width(), kernelSize.height(), paintSize);
 }

 // Writes a textual dump of this effect (including both standard deviations) to |ts|, followed by
 // the dump of its input effect inside an IndentScope.
 TextStream& FEGaussianBlur::externalRepresentation(TextStream& ts, RepresentationType representation) const
 {
     ts << indent;
     ts << "[feGaussianBlur";
     FilterEffect::externalRepresentation(ts, representation);
     ts << " stdDeviation=\"" << m_stdX << ", " << m_stdY << "\"]\n";

     TextStream::IndentScope indentScope(ts);
     inputEffect(0)->externalRepresentation(ts, representation);
     return ts;
 }

 } // namespace WebCore
	/*
	* Copyright (C) 2004, 2005, 2006, 2007 Nikolas Zimmermann <zimmermann@kde.org>
	* Copyright (C) 2004, 2005 Rob Buis <buis@kde.org>
	* Copyright (C) 2005 Eric Seidel <eric@webkit.org>
	* Copyright (C) 2009 Dirk Schulze <krit@webkit.org>
	* Copyright (C) 2010 Igalia, S.L.
	* Copyright (C) Research In Motion Limited 2010. All rights reserved.
	* Copyright (C) 2015-2016 Apple, Inc. All rights reserved.
	*
	* This library is free software; you can redistribute it and/or
	* modify it under the terms of the GNU Library General Public
	* License as published by the Free Software Foundation; either
	* version 2 of the License, or (at your option) any later version.
	*
	* This library is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* Library General Public License for more details.
	*
	* You should have received a copy of the GNU Library General Public License
	* along with this library; see the file COPYING.LIB. If not, write to
	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	* Boston, MA 02110-1301, USA.
	*/

	#include "config.h"
	#include "FEGaussianBlur.h"

	#include "FEGaussianBlurNEON.h"
	#include "Filter.h"
	#include "GraphicsContext.h"
	#include <wtf/text/TextStream.h>

	#if USE(ACCELERATE)
	#include <Accelerate/Accelerate.h>
	#endif

	#include <JavaScriptCore/JSCInlines.h>
	#include <JavaScriptCore/TypedArrayInlines.h>
	#include <JavaScriptCore/Uint8ClampedArray.h>
	#include <wtf/MathExtras.h>
	#include <wtf/ParallelJobs.h>

	// Factor that turns a standard deviation into a box-blur kernel size: 3 * sqrt(2 * pi) / 4.
	static inline float gaussianKernelFactor()
	{
	    const float rootOfTwoPi = sqrtf(2 * piFloat);
	    return 3 / 4.f * rootOfTwoPi;
	}

	// Largest kernel size this file's blur code is meant to work with.
	static const int gMaxKernelSize = 500;

	namespace WebCore {

	// Computes the left/right extents of the box-blur window for pass |blurIteration| (0, 1 or 2).
	// Check http://www.w3.org/TR/SVG/filters.html#feGaussianBlurElement for details.
	//
	// Pass 0 initializes both extents from |radius|. For an odd radius the later passes change nothing.
	// For an even radius the window starts one pixel short on the left, is shifted by one pixel in
	// pass 1, and grows by one pixel on the right in pass 2 (which also increments |radius|).
	inline void kernelPosition(int blurIteration, unsigned& radius, int& deltaLeft, int& deltaRight)
	{
	    const bool radiusIsEven = !(radius % 2);

	    if (!blurIteration) {
	        deltaLeft = radiusIsEven ? radius / 2 - 1 : radius / 2;
	        deltaRight = radius - deltaLeft;
	        return;
	    }

	    // Only even radii need adjusting after the first pass.
	    if (!radiusIsEven)
	        return;

	    if (blurIteration == 1) {
	        ++deltaLeft;
	        --deltaRight;
	    } else if (blurIteration == 2) {
	        ++deltaRight;
	        ++radius;
	    }
	}

	FEGaussianBlur::FEGaussianBlur(Filter& filter, float x, float y, EdgeModeType edgeMode)
	: FilterEffect(filter)
	, m_stdX(x)
	, m_stdY(y)
	, m_edgeMode(edgeMode)
	{
	}

	// Factory: heap-allocates a new effect and hands it to the caller wrapped in a Ref via adoptRef().
	Ref<FEGaussianBlur> FEGaussianBlur::create(Filter& filter, float x, float y, EdgeModeType edgeMode)
	{
	    auto* effect = new FEGaussianBlur(filter, x, y, edgeMode);
	    return adoptRef(*effect);
	}

	// Stores the standard deviation for the X axis.
	void FEGaussianBlur::setStdDeviationX(float x)
	{
	    m_stdX = x;
	}

	// Stores the standard deviation for the Y axis.
	void FEGaussianBlur::setStdDeviationY(float y)
	{
	    m_stdY = y;
	}

	// Stores the edge mode consulted by the blur passes.
	void FEGaussianBlur::setEdgeMode(EdgeModeType edgeMode)
	{
	    m_edgeMode = edgeMode;
	}

	// Single box-blur pass that reads and writes only the alpha byte of each RGBA pixel.
	// |stride| is the byte step between neighboring pixels along the blur axis, |strideLine| the byte
	// step between successive lines, and |maxKernelSize| the number of leading pixels used to seed the
	// running sum. Pixels outside the line contribute nothing to the sum; every sum is divided by |dx|.
	inline void boxBlurAlphaOnly(const Uint8ClampedArray& srcPixelArray, Uint8ClampedArray& dstPixelArray,
	    unsigned dx, int& dxLeft, int& dxRight, int& stride, int& strideLine, int& effectWidth, int& effectHeight, const int& maxKernelSize)
	{
	    // Memory alignment is: RGBA, zero-index based, so alpha is the fourth byte of a pixel.
	    const int alphaIndex = 3;
	    const uint8_t* const source = srcPixelArray.data();
	    uint8_t* const destination = dstPixelArray.data();

	    for (int row = 0; row < effectHeight; ++row) {
	        const int rowStart = row * strideLine;

	        // Seed the running sum with the first |maxKernelSize| alpha values of this line.
	        int alphaSum = 0;
	        for (int k = 0; k < maxKernelSize; ++k) {
	            const unsigned pixelStart = rowStart + k * stride;
	            alphaSum += *(source + pixelStart + alphaIndex);
	        }

	        for (int column = 0; column < effectWidth; ++column) {
	            const unsigned alphaOffset = rowStart + column * stride + alphaIndex;
	            destination[alphaOffset] = static_cast<uint8_t>(alphaSum / dx);

	            // Slide the window: drop the pixel leaving on the left, if it lies inside the line...
	            const bool leavingPixelIsInside = column >= dxLeft;
	            if (leavingPixelIsInside)
	                alphaSum -= source[alphaOffset - dxLeft * stride];

	            // ...and take in the pixel entering on the right, if it lies inside the line.
	            const bool enteringPixelIsInside = column + dxRight < effectWidth;
	            if (enteringPixelIsInside)
	                alphaSum += source[alphaOffset + dxRight * stride];
	        }
	    }
	}

	// Performs one box-blur pass over RGBA pixel data along a single axis.
	//   dx                 - kernel size; every window sum is divided by it.
	//   dxLeft / dxRight   - the window for pixel x covers [x - dxLeft, x + dxRight).
	//   stride             - byte distance between neighboring pixels along the blur axis.
	//   strideLine         - byte distance between successive lines.
	//   effectWidth/Height - pixel count along the blur axis / number of lines.
	//   alphaImage         - when true the work is delegated to boxBlurAlphaOnly().
	//   edgeMode           - EDGEMODE_NONE: pixels outside the line add nothing to the sum;
	//                        any other mode: the first/last pixel of the line is repeated.
	inline void boxBlur(const Uint8ClampedArray& srcPixelArray, Uint8ClampedArray& dstPixelArray,
	    unsigned dx, int dxLeft, int dxRight, int stride, int strideLine, int effectWidth, int effectHeight, bool alphaImage, EdgeModeType edgeMode)
	{
	    // Nothing to blur. Bailing out here also guarantees that the edge-pixel pointers computed
	    // below always refer to a real pixel of the line.
	    if (effectWidth <= 0 || effectHeight <= 0)
	        return;

	    const int maxKernelSize = std::min(dxRight, effectWidth);
	    if (alphaImage)
	        return boxBlurAlphaOnly(srcPixelArray, dstPixelArray, dx, dxLeft, dxRight, stride, strideLine, effectWidth, effectHeight, maxKernelSize);

	    const uint8_t* srcData = srcPixelArray.data();
	    uint8_t* dstData = dstPixelArray.data();

	    // Concerning the array width/length: it is Element size + Margin + Border. The number of pixels will be
	    // P = width * height * channels.
	    for (int y = 0; y < effectHeight; ++y) {
	        int line = y * strideLine;
	        int sumR = 0, sumG = 0, sumB = 0, sumA = 0;

	        // Helpers operating on the four running channel sums of the current line.
	        auto addPixel = [&](const uint8_t* pixel) {
	            sumR += pixel[0];
	            sumG += pixel[1];
	            sumB += pixel[2];
	            sumA += pixel[3];
	        };
	        auto removePixel = [&](const uint8_t* pixel) {
	            sumR -= pixel[0];
	            sumG -= pixel[1];
	            sumB -= pixel[2];
	            sumA -= pixel[3];
	        };
	        auto writeAverage = [&](uint8_t* pixel) {
	            pixel[0] = static_cast<uint8_t>(sumR / dx);
	            pixel[1] = static_cast<uint8_t>(sumG / dx);
	            pixel[2] = static_cast<uint8_t>(sumB / dx);
	            pixel[3] = static_cast<uint8_t>(sumA / dx);
	        };

	        if (edgeMode == EDGEMODE_NONE) {
	            // Fill the kernel.
	            for (int i = 0; i < maxKernelSize; ++i) {
	                unsigned offset = line + i * stride;
	                addPixel(srcData + offset);
	            }

	            // Blurring.
	            for (int x = 0; x < effectWidth; ++x) {
	                unsigned pixelByteOffset = line + x * stride;
	                writeAverage(dstData + pixelByteOffset);

	                // Shift kernel.
	                if (x >= dxLeft) {
	                    unsigned leftOffset = pixelByteOffset - dxLeft * stride;
	                    removePixel(srcData + leftOffset);
	                }

	                if (x + dxRight < effectWidth) {
	                    unsigned rightOffset = pixelByteOffset + dxRight * stride;
	                    addPixel(srcData + rightOffset);
	                }
	            }
	        } else {
	            // FIXME: Add support for 'wrap' here.
	            // Get edge values for edgeMode 'duplicate'.
	            const uint8_t* edgeValueLeft = srcData + line;
	            const uint8_t* edgeValueRight = srcData + (line + (effectWidth - 1) * stride);

	            // Fill the kernel.
	            for (int i = -dxLeft; i < dxRight; ++i) {
	                if (i < 0) {
	                    addPixel(edgeValueLeft);
	                } else if (i >= effectWidth) {
	                    addPixel(edgeValueRight);
	                } else {
	                    // The source pointer is formed only for positions inside the line. Computing it
	                    // for out-of-line positions (as an unsigned offset that wraps for negative i)
	                    // would create a pointer outside the buffer.
	                    unsigned offset = line + i * stride;
	                    addPixel(srcData + offset);
	                }
	            }

	            // Blurring.
	            for (int x = 0; x < effectWidth; ++x) {
	                unsigned pixelByteOffset = line + x * stride;
	                writeAverage(dstData + pixelByteOffset);

	                // Shift kernel.
	                if (x < dxLeft) {
	                    removePixel(edgeValueLeft);
	                } else {
	                    unsigned leftOffset = pixelByteOffset - dxLeft * stride;
	                    removePixel(srcData + leftOffset);
	                }

	                if (x + dxRight >= effectWidth) {
	                    addPixel(edgeValueRight);
	                } else {
	                    unsigned rightOffset = pixelByteOffset + dxRight * stride;
	                    addPixel(srcData + rightOffset);
	                }
	            }
	        }
	    }
	}

	#if USE(ACCELERATE)
	// Blurs |ioBuffer| in place with three vImage box convolutions using a square kernel of |kernelSize|
	// (bumped to the next odd value when even). |tempBuffer| receives the intermediate and final pass and
	// is copied back into |ioBuffer| at the end. |stride| is the number of bytes per row.
	// Returns without touching the buffers when the arguments are invalid or vImage reports an error
	// while being asked for its scratch-buffer size.
	inline void accelerateBoxBlur(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tempBuffer, unsigned kernelSize, int stride, int effectWidth, int effectHeight)
	{
	    if (!ioBuffer.data() || !tempBuffer.data()) {
	        ASSERT_NOT_REACHED();
	        return;
	    }

	    if (effectWidth <= 0 || effectHeight <= 0 || stride <= 0) {
	        ASSERT_NOT_REACHED();
	        return;
	    }

	    // We must always use an odd radius.
	    if (kernelSize % 2 != 1)
	        kernelSize += 1;

	    vImage_Buffer effectInBuffer;
	    effectInBuffer.data = static_cast<void*>(ioBuffer.data());
	    effectInBuffer.width = effectWidth;
	    effectInBuffer.height = effectHeight;
	    effectInBuffer.rowBytes = stride;

	    vImage_Buffer effectOutBuffer;
	    effectOutBuffer.data = tempBuffer.data();
	    effectOutBuffer.width = effectWidth;
	    effectOutBuffer.height = effectHeight;
	    effectOutBuffer.rowBytes = stride;

	    // Determine the size of a temporary buffer by calling the function first with a special flag. vImage will return
	    // the size needed, or an error (which are all negative). The result is kept in the signed vImage_Error type:
	    // stored in a size_t, a negative error code would look like an enormous size and slip past the check below.
	    const vImage_Error tmpBufferSize = vImageBoxConvolve_ARGB8888(&effectInBuffer, &effectOutBuffer, 0, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend | kvImageGetTempBufferSize);
	    if (tmpBufferSize <= 0)
	        return;

	    void* tmpBuffer = fastMalloc(static_cast<size_t>(tmpBufferSize));
	    // Three passes: in -> out, out -> in, in -> out.
	    vImageBoxConvolve_ARGB8888(&effectInBuffer, &effectOutBuffer, tmpBuffer, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend);
	    vImageBoxConvolve_ARGB8888(&effectOutBuffer, &effectInBuffer, tmpBuffer, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend);
	    vImageBoxConvolve_ARGB8888(&effectInBuffer, &effectOutBuffer, tmpBuffer, 0, 0, kernelSize, kernelSize, 0, kvImageEdgeExtend);
	    WTF::fastFree(tmpBuffer);

	    // The final result should be stored in ioBuffer.
	    ASSERT(ioBuffer.length() == tempBuffer.length());
	    memcpy(ioBuffer.data(), tempBuffer.data(), ioBuffer.length());
	}
	#endif

	// Runs three rounds of box blurring. Each round performs a horizontal pass when |kernelSizeX| is
	// non-zero and a vertical pass when |kernelSizeY| is non-zero, alternating between |ioBuffer| and
	// |tempBuffer| as source and destination. The final pixels always end up in |ioBuffer|.
	inline void standardBoxBlur(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tempBuffer, unsigned kernelSizeX, unsigned kernelSizeY, int stride, IntSize& paintSize, bool isAlphaImage, EdgeModeType edgeMode)
	{
	    int dxLeft = 0;
	    int dxRight = 0;
	    int dyLeft = 0;
	    int dyRight = 0;

	    Uint8ClampedArray* fromBuffer = &ioBuffer;
	    Uint8ClampedArray* toBuffer = &tempBuffer;

	    for (int i = 0; i < 3; ++i) {
	        // Horizontal: neighbors are one pixel (4 bytes) apart, lines are one row apart.
	        // The blur routines take references, so the buffer pointers are dereferenced at each call.
	        if (kernelSizeX) {
	            kernelPosition(i, kernelSizeX, dxLeft, dxRight);
	#if HAVE(ARM_NEON_INTRINSICS)
	            if (!isAlphaImage)
	                boxBlurNEON(*fromBuffer, *toBuffer, kernelSizeX, dxLeft, dxRight, 4, stride, paintSize.width(), paintSize.height());
	            else
	                boxBlur(*fromBuffer, *toBuffer, kernelSizeX, dxLeft, dxRight, 4, stride, paintSize.width(), paintSize.height(), true, edgeMode);
	#else
	            boxBlur(*fromBuffer, *toBuffer, kernelSizeX, dxLeft, dxRight, 4, stride, paintSize.width(), paintSize.height(), isAlphaImage, edgeMode);
	#endif
	            std::swap(fromBuffer, toBuffer);
	        }

	        // Vertical: the same routine with the two strides and the two dimensions exchanged.
	        if (kernelSizeY) {
	            kernelPosition(i, kernelSizeY, dyLeft, dyRight);
	#if HAVE(ARM_NEON_INTRINSICS)
	            if (!isAlphaImage)
	                boxBlurNEON(*fromBuffer, *toBuffer, kernelSizeY, dyLeft, dyRight, stride, 4, paintSize.height(), paintSize.width());
	            else
	                boxBlur(*fromBuffer, *toBuffer, kernelSizeY, dyLeft, dyRight, stride, 4, paintSize.height(), paintSize.width(), true, edgeMode);
	#else
	            boxBlur(*fromBuffer, *toBuffer, kernelSizeY, dyLeft, dyRight, stride, 4, paintSize.height(), paintSize.width(), isAlphaImage, edgeMode);
	#endif
	            std::swap(fromBuffer, toBuffer);
	        }
	    }

	    // The final result should be stored in ioBuffer.
	    if (&ioBuffer != fromBuffer) {
	        ASSERT(ioBuffer.length() == fromBuffer->length());
	        memcpy(ioBuffer.data(), fromBuffer->data(), ioBuffer.length());
	    }
	}

	// Blurs the paintSize-sized image in ioBuffer in place, using tmpPixelArray as scratch space.
	// With Accelerate enabled, a square kernel combined with edge mode 'none' or 'duplicate' is
	// handed to accelerateBoxBlur(); every other combination goes through standardBoxBlur().
	inline void FEGaussianBlur::platformApplyGeneric(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tmpPixelArray, unsigned kernelSizeX, unsigned kernelSizeY, IntSize& paintSize)
	{
	    // Bytes per row: four bytes for every pixel.
	    int stride = 4 * paintSize.width();

	#if USE(ACCELERATE)
	    if (kernelSizeX == kernelSizeY && (m_edgeMode == EDGEMODE_NONE || m_edgeMode == EDGEMODE_DUPLICATE)) {
	        accelerateBoxBlur(ioBuffer, tmpPixelArray, kernelSizeX, stride, paintSize.width(), paintSize.height());
	        return;
	    }
	#endif

	    standardBoxBlur(ioBuffer, tmpPixelArray, kernelSizeX, kernelSizeY, stride, paintSize, isAlphaImage(), m_edgeMode);
	}

	// Job entry point handed to WTF::ParallelJobs by platformApply(): blurs the band of pixels
	// described by parameters through the filter stored in it.
	void FEGaussianBlur::platformApplyWorker(PlatformApplyParameters* parameters)
	{
	    IntSize paintSize(parameters->width, parameters->height);
	    // platformApplyGeneric() takes both pixel arrays by reference, while the parameters hold them
	    // through pointers, so they are dereferenced here.
	    parameters->filter->platformApplyGeneric(*parameters->ioPixelArray, *parameters->tmpPixelArray, parameters->kernelSizeX, parameters->kernelSizeY, paintSize);
	}

	// Blurs the paintSize-sized image in ioBuffer in place, using tmpPixelArray as scratch space.
	// Unless Accelerate is in use, an image that is large enough is cut into horizontal bands that
	// are blurred by parallel jobs and then stitched back into ioBuffer. In every other case the
	// whole image is blurred by a single platformApplyGeneric() call.
	inline void FEGaussianBlur::platformApply(Uint8ClampedArray& ioBuffer, Uint8ClampedArray& tmpPixelArray, unsigned kernelSizeX, unsigned kernelSizeY, IntSize& paintSize)
	{
	#if !USE(ACCELERATE)
	    int scanline = 4 * paintSize.width();
	    // Rows of neighboring context added to a band: half the kernel height, times three because
	    // the box blur runs three times.
	    int extraHeight = 3 * kernelSizeY * 0.5f;
	    int optimalThreadNumber = (paintSize.width() * paintSize.height()) / (s_minimalRectDimension + extraHeight * paintSize.width());

	    if (optimalThreadNumber > 1) {
	        WTF::ParallelJobs<PlatformApplyParameters> parallelJobs(&platformApplyWorker, optimalThreadNumber);

	        int jobs = parallelJobs.numberOfJobs();
	        if (jobs > 1) {
	            // Split the job into "blockHeight"-sized jobs, but there are a few jobs that need to be slightly
	            // larger since blockHeight * jobs < total size. These extras are handled by the remainder
	            // "jobsWithExtra": the first jobsWithExtra jobs own one additional row.
	            const int blockHeight = paintSize.height() / jobs;
	            const int jobsWithExtra = paintSize.height() % jobs;

	            int currentY = 0;
	            for (int job = 0; job < jobs; job++) {
	                PlatformApplyParameters& params = parallelJobs.parameter(job);
	                params.filter = this;

	                // currentY is the first row owned by this job. Except at the top and bottom of the
	                // image, the band is extended by extraHeight rows of context on each side.
	                int startY = !job ? 0 : currentY - extraHeight;
	                currentY += job < jobsWithExtra ? blockHeight + 1 : blockHeight;
	                int endY = job == jobs - 1 ? currentY : currentY + extraHeight;

	                int blockSize = (endY - startY) * scanline;
	                if (!job) {
	                    // The first job works directly on the caller's buffers.
	                    params.ioPixelArray = &ioBuffer;
	                    params.tmpPixelArray = &tmpPixelArray;
	                } else {
	                    // Every other job gets private buffers holding a copy of its band.
	                    params.ioPixelArray = Uint8ClampedArray::createUninitialized(blockSize);
	                    params.tmpPixelArray = Uint8ClampedArray::createUninitialized(blockSize);
	                    memcpy(params.ioPixelArray->data(), ioBuffer.data() + startY * scanline, blockSize);
	                }

	                params.width = paintSize.width();
	                params.height = endY - startY;
	                params.kernelSizeX = kernelSizeX;
	                params.kernelSizeY = kernelSizeY;
	            }

	            parallelJobs.execute();

	            // Copy together the parts of the image. The first job wrote into ioBuffer directly, so
	            // only the other jobs are copied. The rows owned by a job start at the sum of the block
	            // heights of all jobs before it, exactly as in the split above, so currentY starts at
	            // the height of the first block and advances only after each copy.
	            currentY = jobsWithExtra > 0 ? blockHeight + 1 : blockHeight;
	            for (int job = 1; job < jobs; job++) {
	                PlatformApplyParameters& params = parallelJobs.parameter(job);
	                int adjustedBlockHeight = job < jobsWithExtra ? blockHeight + 1 : blockHeight;

	                // Skip the extraHeight rows of leading context at the start of the job's buffer.
	                int sourceOffset = extraHeight * scanline;
	                int destinationOffset = currentY * scanline;
	                int size = adjustedBlockHeight * scanline;

	                memcpy(ioBuffer.data() + destinationOffset, params.ioPixelArray->data() + sourceOffset, size);
	                currentY += adjustedBlockHeight;
	            }
	            return;
	        }
	        // Fallback to single threaded mode.
	    }
	#endif

	    // The selection here eventually should happen dynamically on some platforms.
	    platformApplyGeneric(ioBuffer, tmpPixelArray, kernelSizeX, kernelSizeY, paintSize);
	}

	static int clampedToKernelSize(float value)
	{
	// Limit the kernel size to 500. A bigger radius won't make a big difference for the result image but
	// inflates the absolute paint rect too much. This is compatible with Firefox' behavior.
	unsigned size = std::max<unsigned>(2, static_cast<unsigned>(floorf(value * gaussianKernelFactor() + 0.5f)));
	return clampTo<int>(std::min(size, static_cast<unsigned>(gMaxKernelSize)));
	}

	// Maps a per-axis standard deviation to a per-axis kernel size. An axis whose deviation is zero
	// keeps the value of a default-constructed IntSize; any other axis is set through clampedToKernelSize().
	IntSize FEGaussianBlur::calculateUnscaledKernelSize(FloatSize stdDeviation)
	{
	    ASSERT(stdDeviation.width() >= 0 && stdDeviation.height() >= 0);

	    IntSize result;
	    const bool blursHorizontally = stdDeviation.width();
	    if (blursHorizontally)
	        result.setWidth(clampedToKernelSize(stdDeviation.width()));

	    const bool blursVertically = stdDeviation.height();
	    if (blursVertically)
	        result.setHeight(clampedToKernelSize(stdDeviation.height()));

	    return result;
	}

	// Kernel size for stdDeviation after the filter has scaled it by its filter resolution.
	IntSize FEGaussianBlur::calculateKernelSize(const Filter& filter, FloatSize stdDeviation)
	{
	    auto scaledDeviation = filter.scaledByFilterResolution(stdDeviation);
	    return calculateUnscaledKernelSize(scaledDeviation);
	}

	void FEGaussianBlur::determineAbsolutePaintRect()
	{
	IntSize kernelSize = calculateKernelSize(filter(), { m_stdX, m_stdY });

	FloatRect absolutePaintRect = inputEffect(0)->absolutePaintRect();
	// Edge modes other than 'none' do not inflate the affected paint rect.
	if (m_edgeMode != EDGEMODE_NONE) {
	setAbsolutePaintRect(enclosingIntRect(absolutePaintRect));
	return;
	}

	// We take the half kernel size and multiply it with three, because we run box blur three times.
	absolutePaintRect.inflateX(3 * kernelSize.width() * 0.5f);
	absolutePaintRect.inflateY(3 * kernelSize.height() * 0.5f);

	if (clipsToBounds())
	absolutePaintRect.intersect(maxEffectRect());
	else
	absolutePaintRect.unite(maxEffectRect());

	setAbsolutePaintRect(enclosingIntRect(absolutePaintRect));
	}

	// Software path of the effect: copies the premultiplied pixels of the input effect into this
	// effect's result buffer and, unless both standard deviations are zero, blurs them in place.
	void FEGaussianBlur::platformApplySoftware()
	{
	    FilterEffect* in = inputEffect(0);

	    Uint8ClampedArray* resultPixelArray = createPremultipliedImageResult();
	    if (!resultPixelArray)
	        return;

	    setIsAlphaImage(in->isAlphaImage());

	    IntRect effectDrawingRect = requestedRegionOfInputImageData(in->absolutePaintRect());
	    in->copyPremultipliedResult(*resultPixelArray, effectDrawingRect);

	    // Nothing to blur: the copied input already is the result.
	    if (!m_stdX && !m_stdY)
	        return;

	    IntSize kernelSize = calculateKernelSize(filter(), { m_stdX, m_stdY });
	    kernelSize.scale(filter().filterScale());

	    IntSize paintSize = absolutePaintRect().size();
	    paintSize.scale(filter().filterScale());
	    // Scratch buffer with four bytes per pixel of the scaled paint area; bail out if it could
	    // not be created.
	    auto tmpImageData = Uint8ClampedArray::tryCreateUninitialized((paintSize.area() * 4).unsafeGet());
	    if (!tmpImageData)
	        return;

	    // platformApply() takes both buffers by reference; both were null-checked above.
	    platformApply(*resultPixelArray, *tmpImageData, kernelSize.width(), kernelSize.height(), paintSize);
	}

	// Writes a textual description of this effect to ts, followed by the description of its input
	// effect, and returns ts.
	TextStream& FEGaussianBlur::externalRepresentation(TextStream& ts, RepresentationType representation) const
	{
	    // Opening tag, then whatever the base class contributes, then the two standard deviations.
	    ts << indent;
	    ts << "[feGaussianBlur";
	    FilterEffect::externalRepresentation(ts, representation);
	    ts << " stdDeviation=\"";
	    ts << m_stdX;
	    ts << ", ";
	    ts << m_stdY;
	    ts << "\"]\n";

	    // The input effect is written while the indent scope object is alive.
	    TextStream::IndentScope indentScope(ts);
	    inputEffect(0)->externalRepresentation(ts, representation);
	    return ts;
	}

	} // namespace WebCore