1 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c mythtv/libs/libavcodec/armv4l/dsputil_arm.c
2 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-23 12:19:05.000000000 +0200
3 +++ mythtv/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-24 19:54:00.753198000 +0200
5 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9 +#include "libavcodec/dsputil.h"
15 extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
16 +extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
17 +extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
19 extern void j_rev_dct_ARM(DCTELEM *data);
20 extern void simple_idct_ARM(DCTELEM *data);
22 extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size,
25 +extern void ff_simple_idct_neon(DCTELEM *data);
26 +extern void ff_simple_idct_put_neon(uint8_t *dest, int line_size,
28 +extern void ff_simple_idct_add_neon(uint8_t *dest, int line_size,
32 static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
33 static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
39 +static void prefetch_arm(void *mem, int stride, int h)
43 + "subs %0, %0, #1 \n\t"
45 + "add %1, %1, %2 \n\t"
47 + : "+r"(h), "+r"(mem) : "r"(stride));
53 + return ENABLE_IWMMXT * MM_IWMMXT;
56 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
58 int idct_algo= avctx->idct_algo;
60 ff_put_pixels_clamped = c->put_pixels_clamped;
61 ff_add_pixels_clamped = c->add_pixels_clamped;
63 - if(idct_algo == FF_IDCT_AUTO){
64 + if (avctx->lowres == 0) {
65 + if(idct_algo == FF_IDCT_AUTO){
67 - idct_algo = FF_IDCT_IPP;
68 + idct_algo = FF_IDCT_IPP;
69 +#elif defined(HAVE_NEON)
70 + idct_algo = FF_IDCT_SIMPLENEON;
71 #elif defined(HAVE_ARMV6)
72 - idct_algo = FF_IDCT_SIMPLEARMV6;
73 + idct_algo = FF_IDCT_SIMPLEARMV6;
74 #elif defined(HAVE_ARMV5TE)
75 - idct_algo = FF_IDCT_SIMPLEARMV5TE;
76 + idct_algo = FF_IDCT_SIMPLEARMV5TE;
78 - idct_algo = FF_IDCT_ARM;
79 + idct_algo = FF_IDCT_ARM;
84 - if(idct_algo==FF_IDCT_ARM){
85 - c->idct_put= j_rev_dct_ARM_put;
86 - c->idct_add= j_rev_dct_ARM_add;
87 - c->idct = j_rev_dct_ARM;
88 - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
89 - } else if (idct_algo==FF_IDCT_SIMPLEARM){
90 - c->idct_put= simple_idct_ARM_put;
91 - c->idct_add= simple_idct_ARM_add;
92 - c->idct = simple_idct_ARM;
93 - c->idct_permutation_type= FF_NO_IDCT_PERM;
94 + if(idct_algo==FF_IDCT_ARM){
95 + c->idct_put= j_rev_dct_ARM_put;
96 + c->idct_add= j_rev_dct_ARM_add;
97 + c->idct = j_rev_dct_ARM;
98 + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
99 + } else if (idct_algo==FF_IDCT_SIMPLEARM){
100 + c->idct_put= simple_idct_ARM_put;
101 + c->idct_add= simple_idct_ARM_add;
102 + c->idct = simple_idct_ARM;
103 + c->idct_permutation_type= FF_NO_IDCT_PERM;
105 - } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
106 - c->idct_put= ff_simple_idct_put_armv6;
107 - c->idct_add= ff_simple_idct_add_armv6;
108 - c->idct = ff_simple_idct_armv6;
109 - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
110 + } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
111 + c->idct_put= ff_simple_idct_put_armv6;
112 + c->idct_add= ff_simple_idct_add_armv6;
113 + c->idct = ff_simple_idct_armv6;
114 + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
117 - } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
118 - c->idct_put= simple_idct_put_armv5te;
119 - c->idct_add= simple_idct_add_armv5te;
120 - c->idct = simple_idct_armv5te;
121 - c->idct_permutation_type = FF_NO_IDCT_PERM;
122 + } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
123 + c->idct_put= simple_idct_put_armv5te;
124 + c->idct_add= simple_idct_add_armv5te;
125 + c->idct = simple_idct_armv5te;
126 + c->idct_permutation_type = FF_NO_IDCT_PERM;
129 - } else if (idct_algo==FF_IDCT_IPP){
130 - c->idct_put= simple_idct_ipp_put;
131 - c->idct_add= simple_idct_ipp_add;
132 - c->idct = simple_idct_ipp;
133 - c->idct_permutation_type= FF_NO_IDCT_PERM;
134 + } else if (idct_algo==FF_IDCT_IPP){
135 + c->idct_put= simple_idct_ipp_put;
136 + c->idct_add= simple_idct_ipp_add;
137 + c->idct = simple_idct_ipp;
138 + c->idct_permutation_type= FF_NO_IDCT_PERM;
141 + } else if (idct_algo==FF_IDCT_SIMPLENEON){
142 + c->idct_put= ff_simple_idct_put_neon;
143 + c->idct_add= ff_simple_idct_add_neon;
144 + c->idct = ff_simple_idct_neon;
145 + c->idct_permutation_type = FF_NO_IDCT_PERM;
150 c->put_pixels_tab[0][0] = put_pixels16_arm;
152 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
153 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
156 + c->prefetch = prefetch_arm;
160 dsputil_init_iwmmxt(c, avctx);
163 + ff_float_init_arm_vfp(c, avctx);
166 + ff_dsputil_init_neon(c, avctx);
169 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S
170 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S 2008-07-23 12:19:05.000000000 +0200
171 +++ mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S 2008-07-24 19:54:00.753198000 +0200
173 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
183 .macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
184 mov \Rd0, \Rn0, lsr #(\shift * 8)
185 mov \Rd1, \Rn1, lsr #(\shift * 8)
186 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c
187 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200
188 +++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-24 19:54:00.753198000 +0200
190 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
193 -#include "dsputil.h"
194 +#include "libavcodec/dsputil.h"
196 #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
197 -#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
198 +#define SET_RND(regd) asm volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
199 #define WAVG2B "wavg2b"
200 #include "dsputil_iwmmxt_rnd.h"
205 #define DEF(x, y) x ## _ ## y ##_iwmmxt
206 -#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
207 +#define SET_RND(regd) asm volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
208 #define WAVG2B "wavg2br"
209 #include "dsputil_iwmmxt_rnd.h"
213 uint8_t *pixels2 = pixels + line_size;
215 - __asm__ __volatile__ (
219 "pld [%[pixels], %[line_size2]] \n\t"
222 static void clear_blocks_iwmmxt(DCTELEM *blocks)
227 "mov r1, #(128 * 6 / 32) \n\t"
229 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h
230 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h 2008-07-23 12:19:05.000000000 +0200
231 +++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h 2008-07-24 19:54:01.023198000 +0200
233 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
236 -#ifndef FFMPEG_DSPUTIL_IWMMXT_RND_H
237 -#define FFMPEG_DSPUTIL_IWMMXT_RND_H
238 +/* This header intentionally has no multiple inclusion guards. It is meant to
239 + * be included multiple times and generates different code depending on the
240 + * value of certain #defines. */
242 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
244 int stride = line_size;
245 - __asm__ __volatile__ (
247 "and r12, %[pixels], #7 \n\t"
248 "bic %[pixels], %[pixels], #7 \n\t"
249 "tmcr wcgr1, r12 \n\t"
251 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
253 int stride = line_size;
254 - __asm__ __volatile__ (
256 "and r12, %[pixels], #7 \n\t"
257 "bic %[pixels], %[pixels], #7 \n\t"
258 "tmcr wcgr1, r12 \n\t"
260 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
262 int stride = line_size;
263 - __asm__ __volatile__ (
265 "and r12, %[pixels], #7 \n\t"
266 "bic %[pixels], %[pixels], #7 \n\t"
267 "tmcr wcgr1, r12 \n\t"
269 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
271 int stride = line_size;
272 - __asm__ __volatile__ (
274 "pld [%[pixels]] \n\t"
275 "pld [%[pixels], #32] \n\t"
276 "pld [%[block]] \n\t"
278 // [wr0 wr1 wr2 wr3] for previous line
279 // [wr4 wr5 wr6 wr7] for current line
280 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
281 - __asm__ __volatile__(
283 "pld [%[pixels]] \n\t"
284 "pld [%[pixels], #32] \n\t"
285 "and r12, %[pixels], #7 \n\t"
287 // [wr0 wr1 wr2 wr3] for previous line
288 // [wr4 wr5 wr6 wr7] for current line
289 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
290 - __asm__ __volatile__(
292 "pld [%[pixels]] \n\t"
293 "pld [%[pixels], #32] \n\t"
294 "and r12, %[pixels], #7 \n\t"
296 // [wr0 wr1 wr2 wr3] for previous line
297 // [wr4 wr5 wr6 wr7] for current line
298 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
299 - __asm__ __volatile__(
301 "pld [%[pixels]] \n\t"
302 "pld [%[pixels], #32] \n\t"
303 "pld [%[block]] \n\t"
305 // [wr0 wr1 wr2 wr3] for previous line
306 // [wr4 wr5 wr6 wr7] for current line
307 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
308 - __asm__ __volatile__(
310 "pld [%[pixels]] \n\t"
311 "pld [%[pixels], #32] \n\t"
312 "pld [%[block]] \n\t"
314 int stride = line_size;
315 // [wr0 wr1 wr2 wr3] for previous line
316 // [wr4 wr5 wr6 wr7] for current line
317 - __asm__ __volatile__(
319 "pld [%[pixels]] \n\t"
320 "pld [%[pixels], #32] \n\t"
321 "and r12, %[pixels], #7 \n\t"
323 int stride = line_size;
324 // [wr0 wr1 wr2 wr3] for previous line
325 // [wr4 wr5 wr6 wr7] for current line
326 - __asm__ __volatile__(
328 "pld [%[pixels]] \n\t"
329 "pld [%[pixels], #32] \n\t"
330 "and r12, %[pixels], #7 \n\t"
332 int stride = line_size;
333 // [wr0 wr1 wr2 wr3] for previous line
334 // [wr4 wr5 wr6 wr7] for current line
335 - __asm__ __volatile__(
337 "pld [%[pixels]] \n\t"
338 "pld [%[pixels], #32] \n\t"
339 "and r12, %[pixels], #7 \n\t"
341 // [wr0 wr1 wr2 wr3] for previous line
342 // [wr4 wr5 wr6 wr7] for current line
343 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
344 - __asm__ __volatile__(
346 "pld [%[pixels]] \n\t"
348 "pld [%[pixels], #32] \n\t"
350 // [wr0 wr1 wr2 wr3] for previous line
351 // [wr4 wr5 wr6 wr7] for current line
352 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
353 - __asm__ __volatile__(
355 "pld [%[pixels]] \n\t"
357 "pld [%[pixels], #32] \n\t"
359 // [wr0 wr1 wr2 wr3] for previous line
360 // [wr4 wr5 wr6 wr7] for current line
361 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
362 - __asm__ __volatile__(
364 "pld [%[block]] \n\t"
365 "pld [%[block], #32] \n\t"
366 "pld [%[pixels]] \n\t"
368 // [wr0 wr1 wr2 wr3] for previous line
369 // [wr4 wr5 wr6 wr7] for current line
370 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
371 - __asm__ __volatile__(
373 "pld [%[block]] \n\t"
374 "pld [%[block], #32] \n\t"
375 "pld [%[pixels]] \n\t"
376 @@ -1115,5 +1116,3 @@
377 : [line_size]"r"(line_size)
381 -#endif /* FFMPEG_DSPUTIL_IWMMXT_RND_H */
382 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c mythtv/libs/libavcodec/armv4l/dsputil_neon.c
383 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c 1970-01-01 01:00:00.000000000 +0100
384 +++ mythtv/libs/libavcodec/armv4l/dsputil_neon.c 2008-07-24 19:54:01.023198000 +0200
387 + * ARM NEON optimised DSP functions
388 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
390 + * This file is part of FFmpeg.
392 + * FFmpeg is free software; you can redistribute it and/or
393 + * modify it under the terms of the GNU Lesser General Public
394 + * License as published by the Free Software Foundation; either
395 + * version 2.1 of the License, or (at your option) any later version.
397 + * FFmpeg is distributed in the hope that it will be useful,
398 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
399 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
400 + * Lesser General Public License for more details.
402 + * You should have received a copy of the GNU Lesser General Public
403 + * License along with FFmpeg; if not, write to the Free Software
404 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
409 +#include "libavcodec/avcodec.h"
410 +#include "libavcodec/dsputil.h"
412 +extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
413 + int h, int x, int y);
415 +#define PUT_PIXELS_16_X2(vhadd) \
417 + "vld1.64 {d0,d1,d2}, [%[p]], %[line_size] \n\t" \
418 + "vld1.64 {d4,d5,d6}, [%[p]], %[line_size] \n\t" \
419 + "pld [%[p]] \n\t" \
420 + "subs %[h], %[h], #2 \n\t" \
421 + "vext.8 q1, q0, q1, #1 \n\t" \
422 + "vext.8 q3, q2, q3, #1 \n\t" \
423 + vhadd".u8 q0, q0, q1 \n\t" \
424 + vhadd".u8 q2, q2, q3 \n\t" \
425 + "vst1.64 {d0,d1}, [%[b],:64], %[line_size] \n\t" \
426 + "vst1.64 {d4,d5}, [%[b],:64], %[line_size] \n\t" \
429 +#define PUT_PIXELS_16_Y2(vhadd) \
430 + "add %[p1], %[p0], %[line_size] \n\t" \
431 + "lsl %[l2], %[line_size], #1 \n\t" \
432 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
433 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
435 + "subs %[h], %[h], #2 \n\t" \
436 + vhadd".u8 q2, q0, q1 \n\t" \
437 + "vst1.64 {d4,d5}, [%[b],:128], %[line_size] \n\t" \
438 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
439 + vhadd".u8 q2, q0, q1 \n\t" \
440 + "vst1.64 {d4,d5}, [%[b],:128], %[line_size] \n\t" \
441 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
444 +#define PUT_PIXELS_16_XY2(vshrn, no_rnd) \
445 + "lsl %[l2], %[line_size], #1 \n\t" \
446 + "add %[p1], %[p0], %[line_size] \n\t" \
447 + "vld1.64 {d0,d1,d2}, [%[p0]], %[l2] \n\t" \
448 + "vld1.64 {d4,d5,d6}, [%[p1]], %[l2] \n\t" \
449 + "pld [%[p0]] \n\t" \
450 + "pld [%[p1]] \n\t" \
451 + "vext.8 q1, q0, q1, #1 \n\t" \
452 + "vext.8 q3, q2, q3, #1 \n\t" \
453 + "vaddl.u8 q8, d0, d2 \n\t" \
454 + "vaddl.u8 q10, d1, d3 \n\t" \
455 + "vaddl.u8 q9, d4, d6 \n\t" \
456 + "vaddl.u8 q11, d5, d7 \n\t" \
458 + "subs %[h], %[h], #2 \n\t" \
459 + "vld1.64 {d0,d1,d2}, [%[p0]], %[l2] \n\t" \
460 + "vadd.u16 q12, q8, q9 \n\t" \
461 + "pld [%[p0]] \n\t" \
462 + no_rnd "vadd.u16 q12, q12, q13 \n\t" \
463 + "vext.8 q15, q0, q1, #1 \n\t" \
464 + "vadd.u16 q1, q10, q11 \n\t" \
465 + vshrn".u16 d28, q12, #2 \n\t" \
466 + no_rnd "vadd.u16 q1, q1, q13 \n\t" \
467 + vshrn".u16 d29, q1, #2 \n\t" \
468 + "vaddl.u8 q8, d0, d30 \n\t" \
469 + "vld1.64 {d2,d3,d4}, [%[p1]], %[l2] \n\t" \
470 + "vaddl.u8 q10, d1, d31 \n\t" \
471 + "vst1.64 {d28,d29}, [%[b],:128], %[line_size] \n\t" \
472 + "vadd.u16 q12, q8, q9 \n\t" \
473 + "pld [%[p1]] \n\t" \
474 + no_rnd "vadd.u16 q12, q12, q13 \n\t" \
475 + "vext.8 q2, q1, q2, #1 \n\t" \
476 + "vadd.u16 q0, q10, q11 \n\t" \
477 + vshrn".u16 d30, q12, #2 \n\t" \
478 + no_rnd "vadd.u16 q0, q0, q13 \n\t" \
479 + vshrn".u16 d31, q0, #2 \n\t" \
480 + "vaddl.u8 q9, d2, d4 \n\t" \
481 + "vst1.64 {d30,d31}, [%[b],:128], %[line_size] \n\t" \
482 + "vaddl.u8 q11, d3, d5 \n\t" \
485 +#define PUT_PIXELS_8_X2(vhadd) \
487 + "vld1.64 {d0,d1}, [%[p]], %[line_size] \n\t" \
488 + "vld1.64 {d2,d3}, [%[p]], %[line_size] \n\t" \
489 + "pld [%[p]] \n\t" \
490 + "subs %[h], %[h], #2 \n\t" \
491 + "vext.8 d1, d0, d1, #1 \n\t" \
492 + "vext.8 d3, d2, d3, #1 \n\t" \
493 + "vswp d1, d2 \n\t" \
494 + vhadd".u8 q0, q0, q1 \n\t" \
495 + "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t" \
496 + "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t" \
499 +#define PUT_PIXELS_8_Y2(vhadd) \
500 + "add %[p1], %[p0], %[line_size] \n\t" \
501 + "lsl %[l2], %[line_size], #1 \n\t" \
502 + "vld1.64 {d0}, [%[p0]], %[l2] \n\t" \
503 + "vld1.64 {d1}, [%[p1]], %[l2] \n\t" \
505 + "subs %[h], %[h], #2 \n\t" \
506 + vhadd".u8 d4, d0, d1 \n\t" \
507 + "vst1.64 {d4}, [%[b],:64], %[line_size] \n\t" \
508 + "vld1.64 {d0}, [%[p0]], %[l2] \n\t" \
509 + vhadd".u8 d4, d0, d1 \n\t" \
510 + "vst1.64 {d4}, [%[b],:64], %[line_size] \n\t" \
511 + "vld1.64 {d1}, [%[p1]], %[l2] \n\t" \
514 +#define PUT_PIXELS8_XY2(vshrn, no_rnd) \
515 + "lsl %[l2], %[line_size], #1 \n\t" \
516 + "add %[p1], %[p0], %[line_size] \n\t" \
517 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
518 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
519 + "pld [%[p0]] \n\t" \
520 + "pld [%[p1]] \n\t" \
521 + "vext.8 d4, d0, d1, #1 \n\t" \
522 + "vext.8 d6, d2, d3, #1 \n\t" \
523 + "vaddl.u8 q8, d0, d4 \n\t" \
524 + "vaddl.u8 q9, d2, d6 \n\t" \
526 + "subs %[h], %[h], #2 \n\t" \
527 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
528 + "pld [%[p0]] \n\t" \
529 + "vadd.u16 q10, q8, q9 \n\t" \
530 + "vext.8 d4, d0, d1, #1 \n\t" \
531 + no_rnd "vadd.u16 q10, q10, q11 \n\t" \
532 + "vaddl.u8 q8, d0, d4 \n\t" \
533 + vshrn".u16 d5, q10, #2 \n\t" \
534 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
535 + "vadd.u16 q10, q8, q9 \n\t" \
536 + "pld [%[p1]] \n\t" \
537 + no_rnd "vadd.u16 q10, q10, q11 \n\t" \
538 + "vst1.64 {d5}, [%[b],:64], %[line_size] \n\t" \
539 + vshrn".u16 d7, q10, #2 \n\t" \
540 + "vext.8 d6, d2, d3, #1 \n\t" \
541 + "vaddl.u8 q9, d2, d6 \n\t" \
542 + "vst1.64 {d7}, [%[b],:64], %[line_size] \n\t" \
545 +static void put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
546 + int line_size, int h)
550 + "vld1.64 {d0,d1}, [%[pixels]], %[line_size] \n\t"
551 + "vld1.64 {d2,d3}, [%[pixels]], %[line_size] \n\t"
552 + "vld1.64 {d4,d5}, [%[pixels]], %[line_size] \n\t"
553 + "vld1.64 {d6,d7}, [%[pixels]], %[line_size] \n\t"
554 + "pld [%[pixels]] \n\t"
555 + "subs %[h], %[h], #4 \n\t"
556 + "vst1.64 {d0,d1}, [%[block],:128], %[line_size] \n\t"
557 + "vst1.64 {d2,d3}, [%[block],:128], %[line_size] \n\t"
558 + "vst1.64 {d4,d5}, [%[block],:128], %[line_size] \n\t"
559 + "vst1.64 {d6,d7}, [%[block],:128], %[line_size] \n\t"
561 + : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)
562 + : [line_size]"r"(line_size)
563 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory");
566 +static void put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
567 + int line_size, int h)
570 + PUT_PIXELS_16_X2("vrhadd")
571 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
572 + : [line_size]"r"(line_size)
573 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "memory");
576 +static void put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
577 + int line_size, int h)
583 + PUT_PIXELS_16_Y2("vrhadd")
584 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
586 + : [line_size]"r"(line_size)
587 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory");
590 +static void put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
591 + int line_size, int h)
597 + PUT_PIXELS_16_XY2("vrshrn", "@")
600 + [p1]"=&r"(p1), [h]"+r"(h),
602 + : [line_size]"r"(line_size)
603 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
604 + "d28", "d29", "d30", "d31",
605 + "q8", "q9", "q10", "q11", "q12", "memory");
608 +static void put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
609 + int line_size, int h)
613 + "vld1.64 {d0}, [%[p]], %[line_size] \n\t"
614 + "vld1.64 {d1}, [%[p]], %[line_size] \n\t"
615 + "vld1.64 {d2}, [%[p]], %[line_size] \n\t"
616 + "vld1.64 {d3}, [%[p]], %[line_size] \n\t"
617 + "subs %[h], %[h], #4 \n\t"
618 + "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t"
619 + "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t"
620 + "vst1.64 {d2}, [%[b],:64], %[line_size] \n\t"
621 + "vst1.64 {d3}, [%[b],:64], %[line_size] \n\t"
623 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
624 + : [line_size]"r"(line_size)
625 + : "d0", "d1", "d2", "d3", "memory");
628 +static void put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
629 + int line_size, int h)
632 + PUT_PIXELS_8_X2("vrhadd")
633 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
634 + : [line_size]"r"(line_size)
635 + : "d0", "d1", "d2", "d3", "memory");
638 +static void put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
639 + int line_size, int h)
645 + PUT_PIXELS_8_Y2("vrhadd")
646 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
648 + : [line_size]"r"(line_size)
649 + : "d0", "d1", "d4", "memory");
652 +static void put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
653 + int line_size, int h)
659 + PUT_PIXELS8_XY2("vrshrn", "@")
662 + [p1]"=&r"(p1), [h]"+r"(h),
664 + : [line_size]"r"(line_size)
665 + : "d0", "d1", "d2", "d3", "d4", "d6", "d7",
666 + "q8", "q9", "q10", "memory");
669 +static void put_no_rnd_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
670 + int line_size, int h)
673 + PUT_PIXELS_16_X2("vhadd")
674 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
675 + : [line_size]"r"(line_size)
676 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "memory");
679 +static void put_no_rnd_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
680 + int line_size, int h)
686 + PUT_PIXELS_16_Y2("vhadd")
687 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
689 + : [line_size]"r"(line_size)
690 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory");
693 +static void put_no_rnd_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
694 + int line_size, int h)
700 + "vmov.i16 q13, #1 \n\t"
701 + PUT_PIXELS_16_XY2("vshrn", "")
704 + [p1]"=&r"(p1), [h]"+r"(h),
706 + : [line_size]"r"(line_size)
707 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
708 + "d28", "d29", "d30", "d31",
709 + "q8", "q9", "q10", "q11", "q12", "q13", "memory");
712 +static void put_no_rnd_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
713 + int line_size, int h)
716 + PUT_PIXELS_8_X2("vhadd")
717 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
718 + : [line_size]"r"(line_size)
719 + : "d0", "d1", "d2", "d3", "memory");
722 +static void put_no_rnd_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
723 + int line_size, int h)
729 + PUT_PIXELS_8_Y2("vhadd")
730 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
732 + : [line_size]"r"(line_size)
733 + : "d0", "d1", "d4", "memory");
736 +static void put_no_rnd_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
737 + int line_size, int h)
743 + "vmov.i16 q11, #1 \n\t"
744 + PUT_PIXELS8_XY2("vshrn", "")
747 + [p1]"=&r"(p1), [h]"+r"(h),
749 + : [line_size]"r"(line_size)
750 + : "d0", "d1", "d2", "d3", "d4", "d6", "d7",
751 + "q8", "q9", "q10", "q11", "memory");
754 +static void put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, int stride)
756 + put_pixels16_neon(dst, src, stride, 16);
759 +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
761 + c->put_pixels_tab[0][0] = put_pixels16_neon;
762 + c->put_pixels_tab[0][1] = put_pixels16_x2_neon;
763 + c->put_pixels_tab[0][2] = put_pixels16_y2_neon;
764 + c->put_pixels_tab[0][3] = put_pixels16_xy2_neon;
765 + c->put_pixels_tab[1][0] = put_pixels8_neon;
766 + c->put_pixels_tab[1][1] = put_pixels8_x2_neon;
767 + c->put_pixels_tab[1][2] = put_pixels8_y2_neon;
768 + c->put_pixels_tab[1][3] = put_pixels8_xy2_neon;
770 + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_neon;
771 + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_neon;
772 + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_neon;
773 + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_neon;
774 + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_neon;
775 + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_neon;
776 + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_neon;
777 + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_neon;
779 + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
781 + c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_neon;
783 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c mythtv/libs/libavcodec/armv4l/float_arm_vfp.c
784 --- mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c 1970-01-01 01:00:00.000000000 +0100
785 +++ mythtv/libs/libavcodec/armv4l/float_arm_vfp.c 2008-07-24 19:54:01.023198000 +0200
788 + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
790 + * This file is part of FFmpeg.
792 + * FFmpeg is free software; you can redistribute it and/or
793 + * modify it under the terms of the GNU Lesser General Public
794 + * License as published by the Free Software Foundation; either
795 + * version 2.1 of the License, or (at your option) any later version.
797 + * FFmpeg is distributed in the hope that it will be useful,
798 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
799 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
800 + * Lesser General Public License for more details.
802 + * You should have received a copy of the GNU Lesser General Public
803 + * License along with FFmpeg; if not, write to the Free Software
804 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
807 +#include "libavcodec/dsputil.h"
810 + * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
811 + * throughput for almost all the instructions (except for double precision
812 + * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
813 + * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
814 + * important for performance. One more interesting feature is that VFP has
815 + * independent load/store and arithmetics pipelines, so it is possible to make
816 + * them work simultaneously and get more than 1 operation per cycle. Load/store
817 + * pipeline can process 2 single precision floating point values per cycle and
818 + * supports bulk loads and stores for large sets of registers. Arithmetic operations
819 + * can be done on vectors, which allows to keep the arithmetics pipeline busy,
820 + * while the processor may issue and execute other instructions. Detailed
821 + * optimization manuals can be found at http://www.arm.com
825 + * ARM VFP optimized implementation of 'vector_fmul_c' function.
826 + * Assume that len is a positive number and is multiple of 8
828 +static void vector_fmul_vfp(float *dst, const float *src, int len)
832 + "fmrx %[tmp], fpscr\n\t"
833 + "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
834 + "fmxr fpscr, %[tmp]\n\t"
836 + "fldmias %[dst_r]!, {s0-s3}\n\t"
837 + "fldmias %[src]!, {s8-s11}\n\t"
838 + "fldmias %[dst_r]!, {s4-s7}\n\t"
839 + "fldmias %[src]!, {s12-s15}\n\t"
840 + "fmuls s8, s0, s8\n\t"
842 + "subs %[len], %[len], #16\n\t"
843 + "fmuls s12, s4, s12\n\t"
844 + "fldmiasge %[dst_r]!, {s16-s19}\n\t"
845 + "fldmiasge %[src]!, {s24-s27}\n\t"
846 + "fldmiasge %[dst_r]!, {s20-s23}\n\t"
847 + "fldmiasge %[src]!, {s28-s31}\n\t"
848 + "fmulsge s24, s16, s24\n\t"
849 + "fstmias %[dst_w]!, {s8-s11}\n\t"
850 + "fstmias %[dst_w]!, {s12-s15}\n\t"
851 + "fmulsge s28, s20, s28\n\t"
852 + "fldmiasgt %[dst_r]!, {s0-s3}\n\t"
853 + "fldmiasgt %[src]!, {s8-s11}\n\t"
854 + "fldmiasgt %[dst_r]!, {s4-s7}\n\t"
855 + "fldmiasgt %[src]!, {s12-s15}\n\t"
856 + "fmulsge s8, s0, s8\n\t"
857 + "fstmiasge %[dst_w]!, {s24-s27}\n\t"
858 + "fstmiasge %[dst_w]!, {s28-s31}\n\t"
861 + "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
862 + "fmxr fpscr, %[tmp]\n\t"
863 + : [dst_w] "+&r" (dst), [dst_r] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len), [tmp] "=&r" (tmp)
865 + : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
866 + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
867 + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
868 + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
873 + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
874 + * Assume that len is a positive number and is multiple of 8
876 +static void vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len)
880 + "fldmdbs %[src1]!, {s0-s3}\n\t"
881 + "fldmias %[src0]!, {s8-s11}\n\t"
882 + "fldmdbs %[src1]!, {s4-s7}\n\t"
883 + "fldmias %[src0]!, {s12-s15}\n\t"
884 + "fmuls s8, s3, s8\n\t"
885 + "fmuls s9, s2, s9\n\t"
886 + "fmuls s10, s1, s10\n\t"
887 + "fmuls s11, s0, s11\n\t"
889 + "subs %[len], %[len], #16\n\t"
890 + "fldmdbsge %[src1]!, {s16-s19}\n\t"
891 + "fmuls s12, s7, s12\n\t"
892 + "fldmiasge %[src0]!, {s24-s27}\n\t"
893 + "fmuls s13, s6, s13\n\t"
894 + "fldmdbsge %[src1]!, {s20-s23}\n\t"
895 + "fmuls s14, s5, s14\n\t"
896 + "fldmiasge %[src0]!, {s28-s31}\n\t"
897 + "fmuls s15, s4, s15\n\t"
898 + "fmulsge s24, s19, s24\n\t"
899 + "fldmdbsgt %[src1]!, {s0-s3}\n\t"
900 + "fmulsge s25, s18, s25\n\t"
901 + "fstmias %[dst]!, {s8-s13}\n\t"
902 + "fmulsge s26, s17, s26\n\t"
903 + "fldmiasgt %[src0]!, {s8-s11}\n\t"
904 + "fmulsge s27, s16, s27\n\t"
905 + "fmulsge s28, s23, s28\n\t"
906 + "fldmdbsgt %[src1]!, {s4-s7}\n\t"
907 + "fmulsge s29, s22, s29\n\t"
908 + "fstmias %[dst]!, {s14-s15}\n\t"
909 + "fmulsge s30, s21, s30\n\t"
910 + "fmulsge s31, s20, s31\n\t"
911 + "fmulsge s8, s3, s8\n\t"
912 + "fldmiasgt %[src0]!, {s12-s15}\n\t"
913 + "fmulsge s9, s2, s9\n\t"
914 + "fmulsge s10, s1, s10\n\t"
915 + "fstmiasge %[dst]!, {s24-s27}\n\t"
916 + "fmulsge s11, s0, s11\n\t"
917 + "fstmiasge %[dst]!, {s28-s31}\n\t"
920 + : [dst] "+&r" (dst), [src0] "+&r" (src0), [src1] "+&r" (src1), [len] "+&r" (len)
922 + : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
923 + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
924 + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
925 + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
931 + * ARM VFP optimized float to int16 conversion.
932 + * Assume that len is a positive number and is multiple of 8, destination
933 + * buffer is at least 4 bytes aligned (8 bytes alignment is better for
934 + * performance), little endian byte sex
936 +void float_to_int16_vfp(int16_t *dst, const float *src, int len)
939 + "fldmias %[src]!, {s16-s23}\n\t"
940 + "ftosis s0, s16\n\t"
941 + "ftosis s1, s17\n\t"
942 + "ftosis s2, s18\n\t"
943 + "ftosis s3, s19\n\t"
944 + "ftosis s4, s20\n\t"
945 + "ftosis s5, s21\n\t"
946 + "ftosis s6, s22\n\t"
947 + "ftosis s7, s23\n\t"
949 + "subs %[len], %[len], #8\n\t"
950 + "fmrrs r3, r4, {s0, s1}\n\t"
951 + "fmrrs r5, r6, {s2, s3}\n\t"
952 + "fmrrs r7, r8, {s4, s5}\n\t"
953 + "fmrrs ip, lr, {s6, s7}\n\t"
954 + "fldmiasgt %[src]!, {s16-s23}\n\t"
955 + "ssat r4, #16, r4\n\t"
956 + "ssat r3, #16, r3\n\t"
957 + "ssat r6, #16, r6\n\t"
958 + "ssat r5, #16, r5\n\t"
959 + "pkhbt r3, r3, r4, lsl #16\n\t"
960 + "pkhbt r4, r5, r6, lsl #16\n\t"
961 + "ftosisgt s0, s16\n\t"
962 + "ftosisgt s1, s17\n\t"
963 + "ftosisgt s2, s18\n\t"
964 + "ftosisgt s3, s19\n\t"
965 + "ftosisgt s4, s20\n\t"
966 + "ftosisgt s5, s21\n\t"
967 + "ftosisgt s6, s22\n\t"
968 + "ftosisgt s7, s23\n\t"
969 + "ssat r8, #16, r8\n\t"
970 + "ssat r7, #16, r7\n\t"
971 + "ssat lr, #16, lr\n\t"
972 + "ssat ip, #16, ip\n\t"
973 + "pkhbt r5, r7, r8, lsl #16\n\t"
974 + "pkhbt r6, ip, lr, lsl #16\n\t"
975 + "stmia %[dst]!, {r3-r6}\n\t"
978 + : [dst] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len)
980 + : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
981 + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
982 + "r3", "r4", "r5", "r6", "r7", "r8", "ip", "lr",
987 +void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
989 + c->vector_fmul = vector_fmul_vfp;
990 + c->vector_fmul_reverse = vector_fmul_reverse_vfp;
992 + c->float_to_int16 = float_to_int16_vfp;
995 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S mythtv/libs/libavcodec/armv4l/h264dsp_neon.S
996 --- mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S 1970-01-01 01:00:00.000000000 +0100
997 +++ mythtv/libs/libavcodec/armv4l/h264dsp_neon.S 2008-07-24 19:54:01.033198000 +0200
1000 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
1002 + * This file is part of FFmpeg.
1004 + * FFmpeg is free software; you can redistribute it and/or
1005 + * modify it under the terms of the GNU Lesser General Public
1006 + * License as published by the Free Software Foundation; either
1007 + * version 2.1 of the License, or (at your option) any later version.
1009 + * FFmpeg is distributed in the hope that it will be useful,
1010 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1011 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1012 + * Lesser General Public License for more details.
1014 + * You should have received a copy of the GNU Lesser General Public
1015 + * License along with FFmpeg; if not, write to the Free Software
1016 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1023 + .global ff_put_h264_chroma_mc8_neon
1024 + .func ff_put_h264_chroma_mc8_neon
1025 +/* void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
1026 + int h, int x, int y) */
1027 +ff_put_h264_chroma_mc8_neon:
1029 + ldrd r4, [sp, #16]
1035 + rsb r6, r7, r5, lsl #3
1036 + rsb ip, r7, r4, lsl #3
1037 + sub r4, r7, r4, lsl #3
1038 + sub r4, r4, r5, lsl #3
1048 + vld1.64 {d4,d5}, [r1], r4
1050 + vld1.64 {d6,d7}, [r5], r4
1054 + vext.8 d5, d4, d5, #1
1056 + vext.8 d7, d6, d7, #1
1059 + vmull.u8 q8, d4, d0
1060 + vmlal.u8 q8, d5, d1
1061 + vld1.64 {d4,d5}, [r1], r4
1062 + vmlal.u8 q8, d6, d2
1063 + vext.8 d5, d4, d5, #1
1064 + vmlal.u8 q8, d7, d3
1065 + vmull.u8 q9, d6, d0
1066 + vadd.i16 q8, q8, q12
1068 + vmlal.u8 q9, d7, d1
1069 + vshrn.u16 d16, q8, #6
1070 + vld1.64 {d6,d7}, [r5], r4
1071 + vmlal.u8 q9, d4, d2
1072 + vmlal.u8 q9, d5, d3
1074 + vadd.i16 q9, q9, q12
1075 + vst1.64 {d16}, [r0,:64], r2
1076 + vshrn.u16 d17, q9, #6
1077 + vext.8 d7, d6, d7, #1
1078 + vst1.64 {d17}, [r0,:64], r2
1096 + vld1.64 {d4}, [r1], r4
1097 + vld1.64 {d6}, [r5], r4
1100 + vmull.u8 q8, d4, d0
1101 + vmlal.u8 q8, d6, d1
1102 + vld1.64 {d4}, [r1], r4
1103 + vmull.u8 q9, d6, d0
1104 + vadd.i16 q8, q8, q12
1105 + vmlal.u8 q9, d4, d1
1106 + vshrn.u16 d16, q8, #6
1107 + vadd.i16 q9, q9, q12
1108 + vst1.64 {d16}, [r0,:64], r2
1109 + vshrn.u16 d17, q9, #6
1111 + vld1.64 {d6}, [r5], r4
1113 + vst1.64 {d17}, [r0,:64], r2
1120 + vld1.64 {d4,d5}, [r1], r2
1121 + vld1.64 {d6,d7}, [r1], r2
1122 + vext.8 d5, d4, d5, #1
1123 + vext.8 d7, d6, d7, #1
1127 + vmull.u8 q8, d4, d0
1128 + vmlal.u8 q8, d5, d1
1129 + vld1.64 {d4,d5}, [r1], r2
1130 + vmull.u8 q9, d6, d0
1131 + vmlal.u8 q9, d7, d1
1133 + vadd.i16 q8, q8, q12
1134 + vadd.i16 q9, q9, q12
1135 + vext.8 d5, d4, d5, #1
1136 + vshrn.u16 d16, q8, #6
1137 + vld1.64 {d6,d7}, [r1], r2
1138 + vshrn.u16 d17, q9, #6
1139 + vst1.64 {d16}, [r0,:64], r2
1140 + vext.8 d7, d6, d7, #1
1141 + vst1.64 {d17}, [r0,:64], r2
1147 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c
1148 --- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c 2008-07-23 12:19:05.000000000 +0200
1149 +++ mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c 2008-07-24 19:54:01.263198000 +0200
1151 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1154 -#include "dsputil.h"
1155 -#include "mpegvideo.h"
1156 -#include "avcodec.h"
1157 +#include "libavcodec/avcodec.h"
1158 +#include "libavcodec/dsputil.h"
1159 +#include "libavcodec/mpegvideo.h"
1161 extern void MPV_common_init_iwmmxt(MpegEncContext *s);
1162 extern void MPV_common_init_armv5te(MpegEncContext *s);
1164 void MPV_common_init_armv4l(MpegEncContext *s)
1166 /* IWMMXT support is a superset of armv5te, so
1167 - * allow optimised functions for armv5te unless
1168 + * allow optimized functions for armv5te unless
1169 * a better iwmmxt function exists
1172 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c
1173 --- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c 2008-07-23 12:19:05.000000000 +0200
1174 +++ mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c 2008-07-24 19:54:01.263198000 +0200
1176 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1179 -#include "dsputil.h"
1180 -#include "mpegvideo.h"
1181 -#include "avcodec.h"
1182 +#include "libavcodec/avcodec.h"
1183 +#include "libavcodec/dsputil.h"
1184 +#include "libavcodec/mpegvideo.h"
1187 #ifdef ENABLE_ARM_TESTS
1189 ({ DCTELEM *xblock = xxblock; \
1190 int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \
1191 int xdata1, xdata2; \
1192 -__asm__ __volatile__( \
1194 "subs %[count], %[count], #2 \n\t" \
1196 "ldrd r4, [%[block], #0] \n\t" \
1197 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c
1198 --- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200
1199 +++ mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c 2008-07-24 19:54:01.273198000 +0200
1201 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1204 -#include "dsputil.h"
1205 -#include "mpegvideo.h"
1206 -#include "avcodec.h"
1207 +#include "libavcodec/avcodec.h"
1208 +#include "libavcodec/dsputil.h"
1209 +#include "libavcodec/mpegvideo.h"
1211 static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
1212 DCTELEM *block, int n, int qscale)
1215 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
1217 - __asm__ __volatile__ (
1219 /* "movd %1, %%mm6 \n\t" //qmul */
1220 /* "packssdw %%mm6, %%mm6 \n\t" */
1221 /* "packssdw %%mm6, %%mm6 \n\t" */
1222 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S mythtv/libs/libavcodec/armv4l/simple_idct_arm.S
1223 --- mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S 2008-07-23 12:19:05.000000000 +0200
1224 +++ mythtv/libs/libavcodec/armv4l/simple_idct_arm.S 2008-07-24 19:54:01.503198000 +0200
1229 - @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
1230 + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
1231 ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
1232 ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1]
1233 ldr r3, [r14, #8] @ R3=ROWr32[2]
1235 @@ col[40] = ((a2 - b2) >> COL_SHIFT);
1236 @@ col[48] = ((a1 - b1) >> COL_SHIFT);
1237 @@ col[56] = ((a0 - b0) >> COL_SHIFT);
1238 - @@@@@ no optimisation here @@@@@
1239 + @@@@@ no optimization here @@@@@
1240 add r8, r6, r0 @ R8=a0+b0
1241 add r9, r2, r1 @ R9=a1+b1
1242 mov r8, r8, asr #COL_SHIFT
1243 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S mythtv/libs/libavcodec/armv4l/simple_idct_neon.S
1244 --- mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S 1970-01-01 01:00:00.000000000 +0100
1245 +++ mythtv/libs/libavcodec/armv4l/simple_idct_neon.S 2008-07-24 19:54:01.503198000 +0200
1250 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
1252 + * Based on Simple IDCT
1253 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
1255 + * This file is part of FFmpeg.
1257 + * FFmpeg is free software; you can redistribute it and/or
1258 + * modify it under the terms of the GNU Lesser General Public
1259 + * License as published by the Free Software Foundation; either
1260 + * version 2.1 of the License, or (at your option) any later version.
1262 + * FFmpeg is distributed in the hope that it will be useful,
1263 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1264 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1265 + * Lesser General Public License for more details.
1267 + * You should have received a copy of the GNU Lesser General Public
1268 + * License along with FFmpeg; if not, write to the Free Software
1269 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1272 +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1273 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1274 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1275 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1276 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1277 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1278 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1279 +#define W4c ((1<<(COL_SHIFT-1))/W4)
1280 +#define ROW_SHIFT 11
1281 +#define COL_SHIFT 20
1294 + .macro idct_col4_top
 1295 + vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
 1296 + vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
1297 + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
1298 + vadd.i32 q11, q15, q7
1299 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
1300 + vadd.i32 q12, q15, q8
1301 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
1302 + vsub.i32 q13, q15, q8
1303 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
1304 + vsub.i32 q14, q15, q7
1306 + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
1307 + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
1308 + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
1309 + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
1312 + .macro idct_col4_mid1
1313 + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
1314 + vadd.i32 q11, q11, q7
1315 + vsub.i32 q12, q12, q7
1316 + vsub.i32 q13, q13, q7
1317 + vadd.i32 q14, q14, q7
1320 + .macro idct_col4_mid2
1321 + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
1322 + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
1323 + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
1324 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
1327 + .macro idct_col4_mid3
1328 + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
1329 + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
1330 + vadd.i32 q11, q11, q7
1331 + vsub.i32 q12, q12, q8
1332 + vadd.i32 q13, q13, q8
1333 + vsub.i32 q14, q14, q7
1336 + .macro idct_col4_mid4
1337 + vmlal.s16 q9, d9, w7
1338 + vmlsl.s16 q10, d9, w5
1339 + vmlal.s16 q5, d9, w3
1340 + vmlsl.s16 q6, d9, w1
1343 + .macro idct_col4_mid
1344 + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
1345 + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
1346 + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
1347 + vadd.i32 q11, q11, q7
1348 + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
1349 + vsub.i32 q12, q12, q7
1350 + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
1351 + vsub.i32 q13, q13, q7
1352 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
1353 + vadd.i32 q14, q14, q7
1354 + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
1355 + vadd.i32 q11, q11, q7
1356 + vmlal.s16 q9, d9, w7
1357 + vsub.i32 q12, q12, q8
1358 + vmlsl.s16 q10, d9, w5
1359 + vadd.i32 q13, q13, q8
1360 + vmlal.s16 q5, d9, w3
1361 + vsub.i32 q14, q14, q7
1362 + vmlsl.s16 q6, d9, w1
1365 + .macro idct_col4_end
1366 + vadd.i32 q3, q11, q9
1367 + vadd.i32 q4, q12, q10
1368 + vadd.i32 q7, q13, q5
1369 + vadd.i32 q8, q14, q6
1370 + vsub.i32 q11, q11, q9
1371 + vsub.i32 q12, q12, q10
1372 + vsub.i32 q13, q13, q5
1373 + vsub.i32 q14, q14, q6
1378 + .type idct_row4_neon, %function
1379 + .func idct_row4_neon
1381 + vld1.64 {d2,d3}, [a3,:128]!
1382 + vld1.64 {d4,d5}, [a3,:128]!
1383 + vld1.64 {d6,d7}, [a3,:128]!
1384 + vld1.64 {d8,d9}, [a3,:128]!
1387 + vmov.i32 q15, #(1<<(ROW_SHIFT-1))
1392 + vorr d10, d10, d11
1394 + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
1404 + vadd.i32 q3, q11, q9
1405 + vadd.i32 q4, q12, q10
1406 + vshrn.i32 d2, q3, #ROW_SHIFT
1407 + vadd.i32 q7, q13, q5
1408 + vshrn.i32 d4, q4, #ROW_SHIFT
1409 + vadd.i32 q8, q14, q6
1410 + vshrn.i32 d6, q7, #ROW_SHIFT
1411 + vsub.i32 q11, q11, q9
1412 + vshrn.i32 d8, q8, #ROW_SHIFT
1413 + vsub.i32 q12, q12, q10
1414 + vshrn.i32 d9, q11, #ROW_SHIFT
1415 + vsub.i32 q13, q13, q5
1416 + vshrn.i32 d7, q12, #ROW_SHIFT
1417 + vsub.i32 q14, q14, q6
1418 + vshrn.i32 d5, q13, #ROW_SHIFT
1419 + vshrn.i32 d3, q14, #ROW_SHIFT
1426 + vst1.64 {d2,d3}, [a3,:128]!
1427 + vst1.64 {d4,d5}, [a3,:128]!
1428 + vst1.64 {d6,d7}, [a3,:128]!
1429 + vst1.64 {d8,d9}, [a3,:128]!
1435 + .type idct_col4_neon, %function
1436 + .func idct_col4_neon
 1439 + vld1.64 {d2}, [a3,:64], ip /* d2 = col[0] */
 1440 + vld1.64 {d4}, [a3,:64], ip /* d4 = col[1] */
 1441 + vld1.64 {d6}, [a3,:64], ip /* d6 = col[2] */
 1442 + vld1.64 {d8}, [a3,:64], ip /* d8 = col[3] */
 1443 + vld1.64 {d3}, [a3,:64], ip /* d3 = col[4] */
 1444 + vld1.64 {d5}, [a3,:64], ip /* d5 = col[5] */
 1445 + vld1.64 {d7}, [a3,:64], ip /* d7 = col[6] */
 1446 + vld1.64 {d9}, [a3,:64], ip /* d9 = col[7] */
1455 + vmov.32 v1, d11[0]
1456 + vmov.32 v2, d13[0]
1458 + vmov.32 v3, d15[0]
1459 + vmov.32 ip, d17[0]
1461 + vadd.i16 d30, d30, d2
1462 + vmull.s16 q15, d30, w4 /* q15 = W4 * (col[0]+(1<<(COL_SHIFT-1))/W4) */
1480 + vshr.s32 q2, q3, #COL_SHIFT
1481 + vshr.s32 q3, q4, #COL_SHIFT
1483 + vshr.s32 q4, q7, #COL_SHIFT
1485 + vshr.s32 q5, q8, #COL_SHIFT
1487 + vshr.s32 q6, q14, #COL_SHIFT
1489 + vshr.s32 q7, q13, #COL_SHIFT
1491 + vshr.s32 q8, q12, #COL_SHIFT
1493 + vshr.s32 q9, q11, #COL_SHIFT
1500 + .macro idct_col4_st16
1502 + vst1.64 {d2}, [a3,:64], ip
1503 + vst1.64 {d3}, [a3,:64], ip
1504 + vst1.64 {d4}, [a3,:64], ip
1505 + vst1.64 {d5}, [a3,:64], ip
1506 + vst1.64 {d6}, [a3,:64], ip
1507 + vst1.64 {d7}, [a3,:64], ip
1508 + vst1.64 {d8}, [a3,:64], ip
1509 + vst1.64 {d9}, [a3,:64], ip
1513 + .type idct_col4_add8, %function
1514 + .func idct_col4_add8
1516 + vld1.32 {d10[0]}, [a1,:32], a2
1517 + vld1.32 {d10[1]}, [a1,:32], a2
1518 + vld1.32 {d11[0]}, [a1,:32], a2
1519 + vld1.32 {d11[1]}, [a1,:32], a2
1520 + vld1.32 {d12[0]}, [a1,:32], a2
1521 + vld1.32 {d12[1]}, [a1,:32], a2
1522 + vld1.32 {d13[0]}, [a1,:32], a2
1523 + vld1.32 {d13[1]}, [a1,:32], a2
1525 + vaddw.u8 q1, q1, d10
1526 + vaddw.u8 q2, q2, d11
1527 + vaddw.u8 q3, q3, d12
1528 + vaddw.u8 q4, q4, d13
1530 + sub a1, a1, a2, lsl #3
1533 + .type idct_col4_st8, %function
1534 + .func idct_col4_st8
1536 + vqmovun.s16 d2, q1
1537 + vqmovun.s16 d3, q2
1538 + vqmovun.s16 d4, q3
1539 + vqmovun.s16 d5, q4
1541 + vst1.32 {d2[0]}, [a1,:32], a2
1542 + vst1.32 {d2[1]}, [a1,:32], a2
1543 + vst1.32 {d3[0]}, [a1,:32], a2
1544 + vst1.32 {d3[1]}, [a1,:32], a2
1545 + vst1.32 {d4[0]}, [a1,:32], a2
1546 + vst1.32 {d4[1]}, [a1,:32], a2
1547 + vst1.32 {d5[0]}, [a1,:32], a2
1548 + vst1.32 {d5[1]}, [a1,:32], a2
1553 +const: .short W1, W2, W3, W4, W5, W6, W7, W4c
1555 + .macro idct_start data
1561 + vld1.64 {d0,d1}, [a4,:128]
1570 + .global ff_simple_idct_neon
1571 + .type ff_simple_idct_neon, %function
1572 + .func ff_simple_idct_neon
1573 +/* void ff_simple_idct_neon(DCTELEM *data); */
1574 +ff_simple_idct_neon:
1593 + .global ff_simple_idct_put_neon
1594 + .type ff_simple_idct_put_neon, %function
1595 + .func ff_simple_idct_put_neon
1596 +/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
1597 +ff_simple_idct_put_neon:
1605 + sub a1, a1, a2, lsl #3
1615 + .global ff_simple_idct_add_neon
1616 + .type ff_simple_idct_add_neon, %function
1617 + .func ff_simple_idct_add_neon
1618 +/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
1619 +ff_simple_idct_add_neon:
1627 + sub a1, a1, a2, lsl #3
1635 diff -Nurd mythtv.orig/libs/libavcodec/avcodec.h mythtv/libs/libavcodec/avcodec.h
1636 --- mythtv.orig/libs/libavcodec/avcodec.h 2008-07-23 12:19:11.000000000 +0200
1637 +++ mythtv/libs/libavcodec/avcodec.h 2008-07-24 19:56:46.953198000 +0200
1638 @@ -1328,6 +1328,8 @@
1639 #define FF_IDCT_SIMPLEARMV6 17
1640 #define FF_IDCT_SIMPLEVIS 18
1641 #define FF_IDCT_WMV2 19
1642 +#define FF_IDCT_FAAN 20
1643 +#define FF_IDCT_SIMPLENEON 21
1647 diff -Nurd mythtv.orig/libs/libavcodec/libavcodec.pro mythtv/libs/libavcodec/libavcodec.pro
1648 --- mythtv.orig/libs/libavcodec/libavcodec.pro 2008-07-23 12:19:10.000000000 +0200
1649 +++ mythtv/libs/libavcodec/libavcodec.pro 2008-07-24 19:54:01.503198000 +0200
1652 contains( HAVE_ARMV6, yes ) { SOURCES += armv4l/simple_idct_armv6.S }
1654 +contains( HAVE_NEON, yes ) { SOURCES += armv4l/float_arm_vfp.c armv4l/simple_idct_neon.S armv4l/dsputil_neon.c armv4l/h264dsp_neon.S }
1656 contains( HAVE_VIS, yes ) {
1657 SOURCES += sparc/dsputil_vis.c
1658 SOURCES += sparc/simple_idct_vis.c
1659 diff -Nurd mythtv.orig/libs/libavcodec/utils.c mythtv/libs/libavcodec/utils.c
1660 --- mythtv.orig/libs/libavcodec/utils.c 2008-07-23 12:19:10.000000000 +0200
1661 +++ mythtv/libs/libavcodec/utils.c 2008-07-24 19:58:12.403198000 +0200
1663 {"sh4", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SH4, INT_MIN, INT_MAX, V|E|D, "idct"},
1664 {"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"},
1665 {"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"},
1666 +{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"},
1667 {"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"},
1668 {"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"},
1669 {"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"},