code.vuplus.com Git - vuplus_xbmc/blob - xbmc/utils/win32/memcpy_sse2.h

   1 /*
   2 *      Copyright (C) 2005-2015 Team Kodi
   3 *      http://kodi.tv
   4 *
   5 *  This library is free software; you can redistribute it and/or
   6 *  modify it under the terms of the GNU Lesser General Public
   7 *  License as published by the Free Software Foundation; either
   8 *  version 2.1 of the License, or (at your option) any later version.
   9 *
  10 *  This library is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 *  Lesser General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU Lesser General Public
  16 *  License along with this library; if not, write to the Free Software
  17 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  18 *
  19 */
  20
  21 #include <emmintrin.h>
  22
  23 inline void* memcpy_aligned(void* dst, const void* src, size_t size)
  24 {
  25   size_t i;
  26   __m128i xmm1, xmm2, xmm3, xmm4;
  27
  28   // if memory is not aligned, use memcpy
  29   if ((((size_t)(src) | (size_t)(dst)) & 0xF))
  30     return memcpy(dst, src, size);
  31
  32   uint8_t* d = (uint8_t*)(dst);
  33   uint8_t* s = (uint8_t*)(src);
  34
  35   for (i = 0; i < size - 63; i += 64)
  36   {
  37     xmm1 = _mm_load_si128((__m128i*)(s + i +  0));
  38     xmm2 = _mm_load_si128((__m128i*)(s + i + 16));
  39     xmm3 = _mm_load_si128((__m128i*)(s + i + 32));
  40     xmm4 = _mm_load_si128((__m128i*)(s + i + 48));
  41     _mm_stream_si128((__m128i*)(d + i +  0), xmm1);
  42     _mm_stream_si128((__m128i*)(d + i + 16), xmm2);
  43     _mm_stream_si128((__m128i*)(d + i + 32), xmm3);
  44     _mm_stream_si128((__m128i*)(d + i + 48), xmm4);
  45   }
  46   for (; i < size; i += 16)
  47   {
  48     xmm1 = _mm_load_si128((__m128i*)(s + i));
  49     _mm_stream_si128((__m128i*)(d + i), xmm1);
  50   }
  51   return dst;
  52 }
  53
  54 inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[])
  55 {
  56   __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
  57   _mm_sfence();
  58
  59   // Convert to NV12 - Luma
  60   if (srcStride[0] == dstStride[0])
  61     memcpy_aligned(dst[0], src[0], srcStride[0] * height);
  62   else
  63   {
  64     for (size_t line = 0; line < height; ++line)
  65     {
  66       uint8_t * s = src[0] + srcStride[0] * line;
  67       uint8_t * d = dst[0] + dstStride[0] * line;
  68       memcpy_aligned(d, s, srcStride[0]);
  69     }
  70   }
  71   // Convert to NV12 - Chroma
  72   size_t chromaWidth = (width + 1) >> 1;
  73   size_t chromaHeight = height >> 1;
  74   for (size_t line = 0; line < chromaHeight; ++line)
  75   {
  76     size_t i;
  77     uint8_t * u = src[1] + line * srcStride[1];
  78     uint8_t * v = src[2] + line * srcStride[2];
  79     uint8_t * d = dst[1] + line * dstStride[1];
  80     // if memory is not aligned use memcpy
  81     if (((size_t)(u) | (size_t)(v) | (size_t)(d)) & 0xF)
  82     {
  83       for (i = 0; i < chromaWidth; ++i)
  84       {
  85         *d++ = *u++;
  86         *d++ = *v++;
  87       }
  88     }
  89     else
  90     {
  91       for (i = 0; i < (chromaWidth - 31); i += 32)
  92       {
  93         xmm0 = _mm_load_si128((__m128i*)(v + i));
  94         xmm1 = _mm_load_si128((__m128i*)(u + i));
  95         xmm2 = _mm_load_si128((__m128i*)(v + i + 16));
  96         xmm3 = _mm_load_si128((__m128i*)(u + i + 16));
  97
  98         xmm4 = xmm0;
  99         xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
 100         xmm4 = _mm_unpackhi_epi8(xmm1, xmm4);
 101
 102         xmm1 = xmm2;
 103         xmm2 = _mm_unpacklo_epi8(xmm3, xmm2);
 104         xmm1 = _mm_unpackhi_epi8(xmm3, xmm1);
 105
 106         _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
 107         _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm4);
 108         _mm_stream_si128((__m128i *)(d + (i << 1) + 32), xmm2);
 109         _mm_stream_si128((__m128i *)(d + (i << 1) + 48), xmm1);
 110       }
 111       if (((size_t)chromaWidth) & 0xF)
 112       {
 113         d += (i << 1);
 114         u += i; v += i;
 115         for (; i < chromaWidth; ++i)
 116         {
 117           *d++ = *u++;
 118           *d++ = *v++;
 119         }
 120       }
 121       else if (i < chromaWidth)
 122       {
 123         xmm0 = _mm_load_si128((__m128i*)(v + i));
 124         xmm1 = _mm_load_si128((__m128i*)(u + i));
 125
 126         xmm2 = xmm0;
 127         xmm0 = _mm_unpacklo_epi8(xmm1, xmm0);
 128         xmm2 = _mm_unpackhi_epi8(xmm1, xmm2);
 129
 130         _mm_stream_si128((__m128i *)(d + (i << 1) + 0), xmm0);
 131         _mm_stream_si128((__m128i *)(d + (i << 1) + 16), xmm2);
 132       }
 133     }
 134   }
 135 }