| /* |
| * Copyright (C) 2012 Gabor Rapcsanyi (rgabor@inf.u-szeged.hu), University of Szeged |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #pragma once |
| |
| #if HAVE(ARM_NEON_INTRINSICS) |
| |
| #include <arm_neon.h> |
| |
| namespace WebCore { |
| |
| namespace SIMD { |
| |
| ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 4; |
| unsigned tailComponents = componentsPerRow % 16; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| const uint8_t* src = reinterpret_cast<const uint8_t*>(source); |
| |
| for (unsigned i = 0; i < componentsSize; i += 16) { |
| uint8x16x2_t components = vld2q_u8(src + i * 2); |
| vst1q_u8(destination + i, components.val[1]); |
| } |
| |
| source += componentsSize; |
| destination += componentsSize; |
| pixelsPerRow = tailComponents / 4; |
| } |
| |
| ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 3; |
| unsigned tailComponents = componentsPerRow % 24; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| |
| uint8x8_t componentA = vdup_n_u8(0xFF); |
| for (unsigned i = 0; i < componentsSize; i += 24) { |
| uint16x8x3_t RGB16 = vld3q_u16(source + i); |
| uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(RGB16.val[0], 8)); |
| uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(RGB16.val[1], 8)); |
| uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(RGB16.val[2], 8)); |
| uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}}; |
| vst4_u8(destination, RGBA8); |
| destination += 32; |
| } |
| |
| source += componentsSize; |
| pixelsPerRow = tailComponents / 3; |
| } |
| |
| ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 4; |
| unsigned tailComponents = componentsPerRow % 32; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| |
| for (unsigned i = 0; i < componentsSize; i += 32) { |
| uint16x8x4_t ARGB16 = vld4q_u16(source + i); |
| uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8)); |
| uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8)); |
| uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8)); |
| uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8)); |
| uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}}; |
| vst4_u8(destination + i, RGBA8); |
| } |
| |
| source += componentsSize; |
| destination += componentsSize; |
| pixelsPerRow = tailComponents / 4; |
| } |
| |
| ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 4; |
| unsigned tailComponents = componentsPerRow % 32; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| |
| for (unsigned i = 0; i < componentsSize; i += 32) { |
| uint16x8x4_t ARGB16 = vld4q_u16(source + i); |
| uint8x8_t componentB = vqmovn_u16(vshrq_n_u16(ARGB16.val[0], 8)); |
| uint8x8_t componentG = vqmovn_u16(vshrq_n_u16(ARGB16.val[1], 8)); |
| uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(ARGB16.val[2], 8)); |
| uint8x8_t componentA = vqmovn_u16(vshrq_n_u16(ARGB16.val[3], 8)); |
| uint8x8x4_t RGBA8 = {{componentR, componentG, componentB, componentA}}; |
| vst4_u8(destination + i, RGBA8); |
| } |
| |
| source += componentsSize; |
| destination += componentsSize; |
| pixelsPerRow = tailComponents / 4; |
| } |
| |
| ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned tailPixels = pixelsPerRow % 8; |
| unsigned pixelSize = pixelsPerRow - tailPixels; |
| |
| uint16x8_t immediate0x0f = vdupq_n_u16(0x0F); |
| for (unsigned i = 0; i < pixelSize; i += 8) { |
| uint16x8_t eightPixels = vld1q_u16(source + i); |
| |
| uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 12)); |
| uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), immediate0x0f)); |
| uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), immediate0x0f)); |
| uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x0f)); |
| |
| componentR = vorr_u8(vshl_n_u8(componentR, 4), componentR); |
| componentG = vorr_u8(vshl_n_u8(componentG, 4), componentG); |
| componentB = vorr_u8(vshl_n_u8(componentB, 4), componentB); |
| componentA = vorr_u8(vshl_n_u8(componentA, 4), componentA); |
| |
| uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}}; |
| vst4_u8(destination, destComponents); |
| destination += 32; |
| } |
| |
| source += pixelSize; |
| pixelsPerRow = tailPixels; |
| } |
| |
| ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 4; |
| unsigned tailComponents = componentsPerRow % 32; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| |
| uint8_t* dst = reinterpret_cast<uint8_t*>(destination); |
| uint8x8_t immediate0xf0 = vdup_n_u8(0xF0); |
| for (unsigned i = 0; i < componentsSize; i += 32) { |
| uint8x8x4_t RGBA8 = vld4_u8(source + i); |
| |
| uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf0); |
| uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], immediate0xf0), 4); |
| uint8x8_t componentB = vand_u8(RGBA8.val[2], immediate0xf0); |
| uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], immediate0xf0), 4); |
| |
| uint8x8x2_t RGBA4; |
| RGBA4.val[0] = vorr_u8(componentB, componentA); |
| RGBA4.val[1] = vorr_u8(componentR, componentG); |
| vst2_u8(dst, RGBA4); |
| dst += 16; |
| } |
| |
| source += componentsSize; |
| destination += componentsSize / 4; |
| pixelsPerRow = tailComponents / 4; |
| } |
| |
| ALWAYS_INLINE void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned tailPixels = pixelsPerRow % 8; |
| unsigned pixelSize = pixelsPerRow - tailPixels; |
| |
| uint8x8_t immediate0x7 = vdup_n_u8(0x7); |
| uint8x8_t immediate0xff = vdup_n_u8(0xFF); |
| uint16x8_t immediate0x1f = vdupq_n_u16(0x1F); |
| uint16x8_t immediate0x1 = vdupq_n_u16(0x1); |
| |
| for (unsigned i = 0; i < pixelSize; i += 8) { |
| uint16x8_t eightPixels = vld1q_u16(source + i); |
| |
| uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11)); |
| uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 6), immediate0x1f)); |
| uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 1), immediate0x1f)); |
| uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, immediate0x1)); |
| |
| componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7)); |
| componentG = vorr_u8(vshl_n_u8(componentG, 3), vand_u8(componentG, immediate0x7)); |
| componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7)); |
| componentA = vmul_u8(componentA, immediate0xff); |
| |
| uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}}; |
| vst4_u8(destination, destComponents); |
| destination += 32; |
| } |
| |
| source += pixelSize; |
| pixelsPerRow = tailPixels; |
| } |
| |
| ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort5551(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 4; |
| unsigned tailComponents = componentsPerRow % 32; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| |
| uint8_t* dst = reinterpret_cast<uint8_t*>(destination); |
| |
| uint8x8_t immediate0xf8 = vdup_n_u8(0xF8); |
| uint8x8_t immediate0x18 = vdup_n_u8(0x18); |
| for (unsigned i = 0; i < componentsSize; i += 32) { |
| uint8x8x4_t RGBA8 = vld4_u8(source + i); |
| |
| uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8); |
| uint8x8_t componentG3bit = vshr_n_u8(RGBA8.val[1], 5); |
| |
| uint8x8_t componentG2bit = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x18), 3); |
| uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 2); |
| uint8x8_t componentA = vshr_n_u8(RGBA8.val[3], 7); |
| |
| uint8x8x2_t RGBA5551; |
| RGBA5551.val[0] = vorr_u8(vorr_u8(componentG2bit, componentB), componentA); |
| RGBA5551.val[1] = vorr_u8(componentR, componentG3bit); |
| vst2_u8(dst, RGBA5551); |
| dst += 16; |
| } |
| |
| source += componentsSize; |
| destination += componentsSize / 4; |
| pixelsPerRow = tailComponents / 4; |
| } |
| |
| ALWAYS_INLINE void unpackOneRowOfRGB565ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned tailPixels = pixelsPerRow % 8; |
| unsigned pixelSize = pixelsPerRow - tailPixels; |
| |
| uint16x8_t immediate0x3f = vdupq_n_u16(0x3F); |
| uint16x8_t immediate0x1f = vdupq_n_u16(0x1F); |
| uint8x8_t immediate0x3 = vdup_n_u8(0x3); |
| uint8x8_t immediate0x7 = vdup_n_u8(0x7); |
| |
| uint8x8_t componentA = vdup_n_u8(0xFF); |
| |
| for (unsigned i = 0; i < pixelSize; i += 8) { |
| uint16x8_t eightPixels = vld1q_u16(source + i); |
| |
| uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 11)); |
| uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 5), immediate0x3f)); |
| uint8x8_t componentB = vqmovn_u16(vandq_u16(eightPixels, immediate0x1f)); |
| |
| componentR = vorr_u8(vshl_n_u8(componentR, 3), vand_u8(componentR, immediate0x7)); |
| componentG = vorr_u8(vshl_n_u8(componentG, 2), vand_u8(componentG, immediate0x3)); |
| componentB = vorr_u8(vshl_n_u8(componentB, 3), vand_u8(componentB, immediate0x7)); |
| |
| uint8x8x4_t destComponents = {{componentR, componentG, componentB, componentA}}; |
| vst4_u8(destination, destComponents); |
| destination += 32; |
| } |
| |
| source += pixelSize; |
| pixelsPerRow = tailPixels; |
| } |
| |
| ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort565(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow) |
| { |
| unsigned componentsPerRow = pixelsPerRow * 4; |
| unsigned tailComponents = componentsPerRow % 32; |
| unsigned componentsSize = componentsPerRow - tailComponents; |
| uint8_t* dst = reinterpret_cast<uint8_t*>(destination); |
| |
| uint8x8_t immediate0xf8 = vdup_n_u8(0xF8); |
| uint8x8_t immediate0x1c = vdup_n_u8(0x1C); |
| for (unsigned i = 0; i < componentsSize; i += 32) { |
| uint8x8x4_t RGBA8 = vld4_u8(source + i); |
| |
| uint8x8_t componentR = vand_u8(RGBA8.val[0], immediate0xf8); |
| uint8x8_t componentGLeft = vshr_n_u8(RGBA8.val[1], 5); |
| uint8x8_t componentGRight = vshl_n_u8(vand_u8(RGBA8.val[1], immediate0x1c), 3); |
| uint8x8_t componentB = vshr_n_u8(vand_u8(RGBA8.val[2], immediate0xf8), 3); |
| |
| uint8x8x2_t RGB565; |
| RGB565.val[0] = vorr_u8(componentGRight, componentB); |
| RGB565.val[1] = vorr_u8(componentR, componentGLeft); |
| vst2_u8(dst, RGB565); |
| dst += 16; |
| } |
| |
| source += componentsSize; |
| destination += componentsSize / 4; |
| pixelsPerRow = tailComponents / 4; |
| } |
| |
| } // namespace SIMD |
| |
| } // namespace WebCore |
| |
| #endif // HAVE(ARM_NEON_INTRINSICS) |