[dxva] CProcessorHD::Convert - Optimize method with sse2 instructions.
author Anton Fedchin <afedchin@ruswizards.com>
Sat, 18 Jul 2015 10:28:14 +0000 (13:28 +0300)
committer Anton Fedchin <afedchin@ruswizards.com>
Fri, 21 Aug 2015 06:49:38 +0000 (09:49 +0300)
project/VS2010Express/XBMC.vcxproj
project/VS2010Express/XBMC.vcxproj.filters
xbmc/cores/VideoRenderers/DXVAHD.cpp
xbmc/utils/win32/memcpy_sse2.h [new file with mode: 0644]

index 764d85c..199bd8d 100644 (file)
     <ClInclude Include="..\..\xbmc\utils\uXstrings.h" />
     <ClInclude Include="..\..\xbmc\utils\Vector.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\gpu_memcpy_sse4.h" />
+    <ClInclude Include="..\..\xbmc\utils\win32\memcpy_sse2.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\Win32InterfaceForCLog.h" />
     <ClInclude Include="..\..\xbmc\utils\win32\Win32Log.h" />
     <ClInclude Include="..\..\xbmc\utils\XSLTUtils.h" />
index 66dafd3..4a8cab1 100644 (file)
     <ClInclude Include="..\..\xbmc\utils\win32\gpu_memcpy_sse4.h">
       <Filter>utils\win32</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\xbmc\utils\win32\memcpy_sse2.h">
+      <Filter>utils\win32</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
index 8e90a3f..1ca33db 100644 (file)
@@ -34,6 +34,7 @@
 #include "settings/MediaSettings.h"
 #include "utils/AutoPtrHandle.h"
 #include "utils/Log.h"
+#include "utils/win32/memcpy_sse2.h"
 #include "win32/WIN32Util.h"
 #include "windowing/WindowingFactory.h"
 
@@ -434,9 +435,9 @@ bool CProcessorHD::CreateSurfaces()
 CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
 {
   // RENDER_FMT_YUV420P -> DXGI_FORMAT_NV12
-  // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010/DXGI_FORMAT_Y410
-  // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016/DXGI_FORMAT_Y416
-  if (picture->format != RENDER_FMT_YUV420P
+  // RENDER_FMT_YUV420P10 -> DXGI_FORMAT_P010
+  // RENDER_FMT_YUV420P16 -> DXGI_FORMAT_P016
+  if ( picture->format != RENDER_FMT_YUV420P
     && picture->format != RENDER_FMT_YUV420P10
     && picture->format != RENDER_FMT_YUV420P16)
   {
@@ -470,28 +471,38 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
     return nullptr;
   }
 
-  // Convert to NV12 - Luma
-  // TODO: Optimize this later using shaders/swscale/etc.
-  uint8_t *s = picture->data[0];
-  uint8_t* bits = (uint8_t*)rectangle.pData;
-  for (unsigned y = 0; y < picture->iHeight; y++)
+  if (picture->format == RENDER_FMT_YUV420P)
   {
-    memcpy(bits, s, picture->iWidth);
-    s += picture->iLineSize[0];
-    bits += rectangle.RowPitch;
+    uint8_t*  pData = static_cast<uint8_t*>(rectangle.pData);
+    uint8_t*  dst[] = { pData, pData + sDesc.Height * rectangle.RowPitch };
+    int dstStride[] = { rectangle.RowPitch, rectangle.RowPitch };
+    convert_yuv420_nv12(picture->data, picture->iLineSize, picture->iHeight, picture->iWidth, dst, dstStride);
   }
-
-  // Convert to NV12 - Chroma
-  uint8_t *s_u, *s_v, *d_uv;
-  for (unsigned y = 0; y < picture->iHeight / 2; y++)
+  else
   {
-    s_u = picture->data[1] + y * picture->iLineSize[1];
-    s_v = picture->data[2] + y * picture->iLineSize[2];
-    d_uv = (uint8_t*)rectangle.pData + (sDesc.Height + y) * rectangle.RowPitch;
-    for (unsigned x = 0; x < picture->iWidth / 2; x++)
+    // TODO: Optimize this later using sse2/sse4
+    uint16_t * d_y = static_cast<uint16_t*>(rectangle.pData);
+    uint16_t * d_uv = d_y + sDesc.Height * rectangle.RowPitch;
+    // Convert to NV12 - Luma
+    for (size_t line = 0; line < picture->iHeight; ++line)
     {
-      *d_uv++ = *s_u++;
-      *d_uv++ = *s_v++;
+      uint16_t * y = (uint16_t*)(picture->data[0] + picture->iLineSize[0] * line);
+      uint16_t * d = d_y + rectangle.RowPitch * line;
+      memcpy(d, y, picture->iLineSize[0]);
+    }
+    // Convert to NV12 - Chroma
+    size_t chromaWidth = (picture->iWidth + 1) >> 1;
+    size_t chromaHeight = picture->iHeight >> 1;
+    for (size_t line = 0; line < chromaHeight; ++line)
+    {
+      uint16_t * u = (uint16_t*)picture->data[1] + line * picture->iLineSize[1];
+      uint16_t * v = (uint16_t*)picture->data[2] + line * picture->iLineSize[2];
+      uint16_t * d = d_uv + line * rectangle.RowPitch;
+      for (size_t x = 0; x < chromaWidth; x++)
+      {
+        *d++ = *u++; 
+        *d++ = *v++;
+      }
     }
   }
   pContext->Unmap(texture, subresource);
@@ -503,7 +514,6 @@ CRenderPicture *CProcessorHD::Convert(DVDVideoPicture* picture)
   return pic;
 }
 
-
 bool CProcessorHD::ApplyFilter(D3D11_VIDEO_PROCESSOR_FILTER filter, int value, int min, int max, int def)
 {
   if (filter >= NUM_FILTERS)
diff --git a/xbmc/utils/win32/memcpy_sse2.h b/xbmc/utils/win32/memcpy_sse2.h
new file mode 100644 (file)
index 0000000..c585136
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+*      Copyright (C) 2005-2015 Team Kodi
+*      http://kodi.tv
+*
+*  This library is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  This library is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with this library; if not, write to the Free Software
+*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+*
+*/
+
+#include <emmintrin.h>
+
+// Copies `size` bytes from `src` to `dst` and returns `dst`.
+//
+// If either pointer is not 16-byte aligned the whole copy is delegated to
+// memcpy. Otherwise whole 16-byte vectors are moved with SSE2 aligned loads
+// and non-temporal (streaming) stores, and the remaining 0..15 bytes are
+// copied with memcpy, so exactly `size` bytes are read and written.
+//
+// No store fence is issued here; a caller that needs the streamed data to be
+// ordered before its later stores has to call _mm_sfence() itself.
+inline void* memcpy_aligned(void* dst, const void* src, size_t size)
+{
+  // if memory is not aligned, use memcpy
+  if ((reinterpret_cast<size_t>(src) | reinterpret_cast<size_t>(dst)) & 0xF)
+    return memcpy(dst, src, size);
+
+  uint8_t* d = static_cast<uint8_t*>(dst);
+  const uint8_t* s = static_cast<const uint8_t*>(src);
+  size_t i = 0;
+
+  // 64 bytes per iteration. The bound is expressed as "size - i >= 64"
+  // (i never exceeds size) because the unsigned expression "size - 63"
+  // wraps around to a huge value whenever size < 63.
+  for (; size - i >= 64; i += 64)
+  {
+    const __m128i xmm1 = _mm_load_si128(reinterpret_cast<const __m128i*>(s + i +  0));
+    const __m128i xmm2 = _mm_load_si128(reinterpret_cast<const __m128i*>(s + i + 16));
+    const __m128i xmm3 = _mm_load_si128(reinterpret_cast<const __m128i*>(s + i + 32));
+    const __m128i xmm4 = _mm_load_si128(reinterpret_cast<const __m128i*>(s + i + 48));
+    _mm_stream_si128(reinterpret_cast<__m128i*>(d + i +  0), xmm1);
+    _mm_stream_si128(reinterpret_cast<__m128i*>(d + i + 16), xmm2);
+    _mm_stream_si128(reinterpret_cast<__m128i*>(d + i + 32), xmm3);
+    _mm_stream_si128(reinterpret_cast<__m128i*>(d + i + 48), xmm4);
+  }
+  // remaining whole 16-byte vectors
+  for (; size - i >= 16; i += 16)
+  {
+    const __m128i xmm1 = _mm_load_si128(reinterpret_cast<const __m128i*>(s + i));
+    _mm_stream_si128(reinterpret_cast<__m128i*>(d + i), xmm1);
+  }
+  // trailing bytes that do not fill a vector; copying them as a full vector
+  // would read and write past the end of the buffers
+  if (i < size)
+    memcpy(d + i, s + i, size - i);
+
+  return dst;
+}
+
+// Converts a planar YUV 4:2:0 picture (separate Y, U and V planes) to NV12
+// (a Y plane followed by one plane of interleaved U,V byte pairs).
+//
+//   src, srcStride - plane pointers and row pitches in bytes of the source;
+//                    index 0 = Y, 1 = U, 2 = V
+//   height, width  - luma dimensions; nothing is done if either is <= 0
+//   dst, dstStride - plane pointers and row pitches in bytes of the
+//                    destination; index 0 = Y, 1 = interleaved UV
+//
+// Per chroma row exactly (width + 1) / 2 U,V pairs are written. height / 2
+// chroma rows are converted, so for an odd height the last source chroma row
+// is not converted.
+inline void convert_yuv420_nv12(uint8_t *const src[], const int srcStride[], int height, int width, uint8_t *const dst[], const int dstStride[])
+{
+  // a negative height would turn into a huge row count once compared against
+  // the unsigned loop counters below
+  if (height <= 0 || width <= 0)
+    return;
+
+  _mm_sfence();
+
+  const size_t rows = static_cast<size_t>(height);
+
+  // Convert to NV12 - Luma
+  if (srcStride[0] == dstStride[0])
+    memcpy_aligned(dst[0], src[0], srcStride[0] * rows);
+  else
+  {
+    // copy the smaller of the two pitches so that a source row wider than the
+    // destination row cannot spill into the next row or past the plane
+    const int lumaBytes = srcStride[0] < dstStride[0] ? srcStride[0] : dstStride[0];
+    for (size_t line = 0; line < rows; ++line)
+    {
+      const uint8_t * s = src[0] + srcStride[0] * line;
+      uint8_t * d = dst[0] + dstStride[0] * line;
+      memcpy_aligned(d, s, lumaBytes);
+    }
+  }
+
+  // Convert to NV12 - Chroma
+  const size_t chromaWidth = (static_cast<size_t>(width) + 1) >> 1;
+  const size_t chromaHeight = rows >> 1;
+  for (size_t line = 0; line < chromaHeight; ++line)
+  {
+    const uint8_t * u = src[1] + line * srcStride[1];
+    const uint8_t * v = src[2] + line * srcStride[2];
+    uint8_t * d = dst[1] + line * dstStride[1];
+    size_t i = 0;
+
+    // _mm_load_si128 / _mm_stream_si128 require 16-byte aligned addresses;
+    // rows that are not aligned are handled entirely by the scalar loop below
+    const bool aligned = ((reinterpret_cast<size_t>(u)
+                         | reinterpret_cast<size_t>(v)
+                         | reinterpret_cast<size_t>(d)) & 0xF) == 0;
+    if (aligned)
+    {
+      // 32 U + 32 V bytes -> 64 interleaved bytes per iteration. The bound is
+      // "chromaWidth - i >= 32" (i never exceeds chromaWidth) because the
+      // unsigned "chromaWidth - 31" wraps around for rows narrower than 31.
+      for (; chromaWidth - i >= 32; i += 32)
+      {
+        const __m128i v0 = _mm_load_si128(reinterpret_cast<const __m128i*>(v + i));
+        const __m128i u0 = _mm_load_si128(reinterpret_cast<const __m128i*>(u + i));
+        const __m128i v1 = _mm_load_si128(reinterpret_cast<const __m128i*>(v + i + 16));
+        const __m128i u1 = _mm_load_si128(reinterpret_cast<const __m128i*>(u + i + 16));
+
+        // unpack(u, v) yields u0 v0 u1 v1 ... which is the NV12 byte order
+        _mm_stream_si128(reinterpret_cast<__m128i*>(d + (i << 1) +  0), _mm_unpacklo_epi8(u0, v0));
+        _mm_stream_si128(reinterpret_cast<__m128i*>(d + (i << 1) + 16), _mm_unpackhi_epi8(u0, v0));
+        _mm_stream_si128(reinterpret_cast<__m128i*>(d + (i << 1) + 32), _mm_unpacklo_epi8(u1, v1));
+        _mm_stream_si128(reinterpret_cast<__m128i*>(d + (i << 1) + 48), _mm_unpackhi_epi8(u1, v1));
+      }
+      // remaining whole vectors: 16 U + 16 V bytes -> 32 interleaved bytes
+      for (; chromaWidth - i >= 16; i += 16)
+      {
+        const __m128i v0 = _mm_load_si128(reinterpret_cast<const __m128i*>(v + i));
+        const __m128i u0 = _mm_load_si128(reinterpret_cast<const __m128i*>(u + i));
+
+        _mm_stream_si128(reinterpret_cast<__m128i*>(d + (i << 1) +  0), _mm_unpacklo_epi8(u0, v0));
+        _mm_stream_si128(reinterpret_cast<__m128i*>(d + (i << 1) + 16), _mm_unpackhi_epi8(u0, v0));
+      }
+    }
+    // trailing (or unaligned) samples one pair at a time, so nothing beyond
+    // chromaWidth is read from the source or written to the destination
+    for (; i < chromaWidth; ++i)
+    {
+      d[(i << 1) + 0] = u[i];
+      d[(i << 1) + 1] = v[i];
+    }
+  }
+
+  // order the non-temporal stores issued above before any later stores
+  _mm_sfence();
+}