2 * Copyright (C) 2005-2015 Team Kodi
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/*
 * Copy `size` bytes from `src` to `dst` using SSE2 non-temporal stores.
 *
 * Fast path requires both pointers to be 16-byte aligned; otherwise the
 * call falls back to plain memcpy. Returns `dst`, matching memcpy's
 * contract. `static inline` gives the definition internal linkage so a
 * plain-C build cannot hit an undefined reference to the out-of-line copy.
 */
static inline void* memcpy_aligned(void* dst, const void* src, size_t size)
{
  // if memory is not aligned, use memcpy
  if ((((size_t)(src) | (size_t)(dst)) & 0xF))
    return memcpy(dst, src, size);

  uint8_t* d = (uint8_t*)(dst);
  const uint8_t* s = (const uint8_t*)(src);
  size_t i = 0;

  // Main loop: 64 bytes per iteration. The explicit `size >= 64` guard
  // fixes the size_t underflow of the former `i < size - 63` bound, which
  // overran both buffers for any aligned copy smaller than 64 bytes.
  if (size >= 64)
  {
    for (; i <= size - 64; i += 64)
    {
      __m128i xmm1 = _mm_load_si128((const __m128i*)(s + i + 0));
      __m128i xmm2 = _mm_load_si128((const __m128i*)(s + i + 16));
      __m128i xmm3 = _mm_load_si128((const __m128i*)(s + i + 32));
      __m128i xmm4 = _mm_load_si128((const __m128i*)(s + i + 48));
      _mm_stream_si128((__m128i*)(d + i + 0), xmm1);
      _mm_stream_si128((__m128i*)(d + i + 16), xmm2);
      _mm_stream_si128((__m128i*)(d + i + 32), xmm3);
      _mm_stream_si128((__m128i*)(d + i + 48), xmm4);
    }
  }

  // Remaining full 16-byte chunks. Bounding by `i + 16 <= size` (instead of
  // `i < size`) keeps the last vector access in range when `size` is not a
  // multiple of 16.
  for (; i + 16 <= size; i += 16)
  {
    __m128i xmm1 = _mm_load_si128((const __m128i*)(s + i));
    _mm_stream_si128((__m128i*)(d + i), xmm1);
  }

  // Scalar tail of fewer than 16 bytes.
  if (i < size)
    memcpy(d + i, s + i, size - i);

  // Make the non-temporal stores globally visible before returning.
  _mm_sfence();
  return dst;
}
/*
 * Convert planar YUV 4:2:0 (Y in src[0], U in src[1], V in src[2]) into
 * NV12: the Y plane is copied to dst[0] and the U/V samples are interleaved
 * as UVUV... byte pairs into dst[1]. Strides are in bytes; width/height
 * describe the luma plane. SIMD paths use aligned SSE2 loads and
 * non-temporal stores, so plane pointers must be 16-byte aligned there.
 *
 * NOTE(review): this extract is missing interior lines of the function
 * (brace lines, the scalar loop bodies, the `size_t i;` declaration, and
 * the register copies normally preceding the _mm_unpackhi_epi8 calls) and
 * is truncated before the function's closing brace — the comments below
 * annotate only the visible statements; confirm against the full file.
 */
inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[])
  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;

  // Convert to NV12 - Luma: equal strides mean the plane (including its
  // row padding) is contiguous and can be copied in one shot.
  if (srcStride[0] == dstStride[0])
    memcpy_aligned(dst[0], src[0], srcStride[0] * height);
  // Otherwise copy the luma plane row by row, srcStride[0] bytes per row.
  // NOTE(review): `line < height` compares size_t against int — relies on
  // height being non-negative.
  for (size_t line = 0; line < height; ++line)
    uint8_t * s = src[0] + srcStride[0] * line;
    uint8_t * d = dst[0] + dstStride[0] * line;
    memcpy_aligned(d, s, srcStride[0]);

  // Convert to NV12 - Chroma: U and V are half-resolution in both axes.
  size_t chromaWidth = (width + 1) >> 1;
  size_t chromaHeight = height >> 1;
  for (size_t line = 0; line < chromaHeight; ++line)
    uint8_t * u = src[1] + line * srcStride[1];
    uint8_t * v = src[2] + line * srcStride[2];
    uint8_t * d = dst[1] + line * dstStride[1];
    // if memory is not aligned use memcpy — scalar interleave fallback;
    // the loop body (presumably d[2*i] = u[i]; d[2*i+1] = v[i];) is on
    // lines elided from this extract.
    if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF)
      for (i = 0; i < chromaWidth; ++i)
    // Fast path: 32 chroma bytes per plane per iteration -> 64 output
    // bytes. `chromaWidth - 31` assumes chromaWidth >= 32 (size_t
    // underflow otherwise) — presumably guaranteed by callers; verify.
    for (i = 0; i < (chromaWidth - 31); i += 32)
      xmm0 = _mm_load_si128((__m128i*)(v + i));
      xmm1 = _mm_load_si128((__m128i*)(u + i));
      xmm2 = _mm_load_si128((__m128i*)(v + i + 16));
      xmm3 = _mm_load_si128((__m128i*)(u + i + 16));
      // Interleave U (low byte) with V (high byte) into UVUV... pairs.
      // NOTE(review): xmm4 and (below) xmm1 are read before being written
      // here — register copies such as `xmm4 = xmm0;` / `xmm1 = xmm2;`
      // appear to be on lines elided from this extract.
      xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
      xmm4 = _mm_unpackhi_epi8(xmm1, xmm4);
      xmm2 = _mm_unpacklo_epi8(xmm3, xmm2);
      xmm1 = _mm_unpackhi_epi8(xmm3, xmm1);
      // Output offset is (i << 1): each input chroma byte pairs with one
      // from the other plane, doubling the destination step.
      _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
      _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4);
      _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2);
      _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1);
    // Remainder handling after the 32-byte main loop.
    // NOTE(review): the mask tests chromaWidth % 16 while the main loop
    // advances by 32; a 16..31-byte remainder is presumably handled by the
    // single-vector block below (likely in an elided else branch) — verify.
    if (((size_t)chromaWidth) & 0xF)
      // Scalar tail, one U/V pair per iteration (body elided in extract).
      for (; i < chromaWidth; ++i)
      // 16-byte remainder: one aligned load per plane, two interleaved
      // non-temporal stores.
      xmm0 = _mm_load_si128((__m128i*)(v + i));
      xmm1 = _mm_load_si128((__m128i*)(u + i));
      // NOTE(review): xmm2 is read before being written here as well — an
      // `xmm2 = xmm0;` copy is likely on an elided line; confirm.
      xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
      xmm2 = _mm_unpackhi_epi8(xmm1, xmm2);
      _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
      _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2);