support seeking in recorded video
[vuplus_openembedded] / recipes / mythtv / mythtv-0.21 / ffmpeg-arm-update.diff
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c mythtv/libs/libavcodec/armv4l/dsputil_arm.c
--- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c    2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-24 19:54:00.753198000 +0200
@@ -19,12 +19,14 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "dsputil.h"
+#include "libavcodec/dsputil.h"
 #ifdef HAVE_IPP
-#include "ipp.h"
+#include <ipp.h>
 #endif
 
 extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
+extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
+extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
 
 extern void j_rev_dct_ARM(DCTELEM *data);
 extern void simple_idct_ARM(DCTELEM *data);
@@ -41,6 +43,12 @@
 extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size,
                                      DCTELEM *data);
 
+extern void ff_simple_idct_neon(DCTELEM *data);
+extern void ff_simple_idct_put_neon(uint8_t *dest, int line_size,
+                                    DCTELEM *data);
+extern void ff_simple_idct_add_neon(uint8_t *dest, int line_size,
+                                    DCTELEM *data);
+
 /* XXX: local hack */
 static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
 static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
@@ -202,6 +210,24 @@
 }
 #endif
 
+#ifdef HAVE_ARMV5TE
+static void prefetch_arm(void *mem, int stride, int h)
+{
+    asm volatile(
+        "1:              \n\t"
+        "subs %0, %0, #1 \n\t"
+        "pld  [%1]       \n\t"
+        "add  %1, %1, %2 \n\t"
+        "bgt  1b         \n\t"
+        : "+r"(h), "+r"(mem) : "r"(stride));
+}
+#endif
+
+int mm_support(void)
+{
+    return ENABLE_IWMMXT * MM_IWMMXT;
+}
+
 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
 {
     int idct_algo= avctx->idct_algo;
@@ -209,49 +235,60 @@
     ff_put_pixels_clamped = c->put_pixels_clamped;
     ff_add_pixels_clamped = c->add_pixels_clamped;
 
-    if(idct_algo == FF_IDCT_AUTO){
+    if (avctx->lowres == 0) {
+        if(idct_algo == FF_IDCT_AUTO){
 #if defined(HAVE_IPP)
-        idct_algo = FF_IDCT_IPP;
+            idct_algo = FF_IDCT_IPP;
+#elif defined(HAVE_NEON)
+            idct_algo = FF_IDCT_SIMPLENEON;
 #elif defined(HAVE_ARMV6)
-        idct_algo = FF_IDCT_SIMPLEARMV6;
+            idct_algo = FF_IDCT_SIMPLEARMV6;
 #elif defined(HAVE_ARMV5TE)
-        idct_algo = FF_IDCT_SIMPLEARMV5TE;
+            idct_algo = FF_IDCT_SIMPLEARMV5TE;
 #else
-        idct_algo = FF_IDCT_ARM;
+            idct_algo = FF_IDCT_ARM;
 #endif
-    }
+        }
 
-    if(idct_algo==FF_IDCT_ARM){
-        c->idct_put= j_rev_dct_ARM_put;
-        c->idct_add= j_rev_dct_ARM_add;
-        c->idct    = j_rev_dct_ARM;
-        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
-    } else if (idct_algo==FF_IDCT_SIMPLEARM){
-        c->idct_put= simple_idct_ARM_put;
-        c->idct_add= simple_idct_ARM_add;
-        c->idct    = simple_idct_ARM;
-        c->idct_permutation_type= FF_NO_IDCT_PERM;
+        if(idct_algo==FF_IDCT_ARM){
+            c->idct_put= j_rev_dct_ARM_put;
+            c->idct_add= j_rev_dct_ARM_add;
+            c->idct    = j_rev_dct_ARM;
+            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
+        } else if (idct_algo==FF_IDCT_SIMPLEARM){
+            c->idct_put= simple_idct_ARM_put;
+            c->idct_add= simple_idct_ARM_add;
+            c->idct    = simple_idct_ARM;
+            c->idct_permutation_type= FF_NO_IDCT_PERM;
 #ifdef HAVE_ARMV6
-    } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
-        c->idct_put= ff_simple_idct_put_armv6;
-        c->idct_add= ff_simple_idct_add_armv6;
-        c->idct    = ff_simple_idct_armv6;
-        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+        } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
+            c->idct_put= ff_simple_idct_put_armv6;
+            c->idct_add= ff_simple_idct_add_armv6;
+            c->idct    = ff_simple_idct_armv6;
+            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
 #endif
 #ifdef HAVE_ARMV5TE
-    } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
-        c->idct_put= simple_idct_put_armv5te;
-        c->idct_add= simple_idct_add_armv5te;
-        c->idct    = simple_idct_armv5te;
-        c->idct_permutation_type = FF_NO_IDCT_PERM;
+        } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
+            c->idct_put= simple_idct_put_armv5te;
+            c->idct_add= simple_idct_add_armv5te;
+            c->idct    = simple_idct_armv5te;
+            c->idct_permutation_type = FF_NO_IDCT_PERM;
 #endif
 #ifdef HAVE_IPP
-    } else if (idct_algo==FF_IDCT_IPP){
-        c->idct_put= simple_idct_ipp_put;
-        c->idct_add= simple_idct_ipp_add;
-        c->idct    = simple_idct_ipp;
-        c->idct_permutation_type= FF_NO_IDCT_PERM;
+        } else if (idct_algo==FF_IDCT_IPP){
+            c->idct_put= simple_idct_ipp_put;
+            c->idct_add= simple_idct_ipp_add;
+            c->idct    = simple_idct_ipp;
+            c->idct_permutation_type= FF_NO_IDCT_PERM;
+#endif
+#ifdef HAVE_NEON
+        } else if (idct_algo==FF_IDCT_SIMPLENEON){
+            c->idct_put= ff_simple_idct_put_neon;
+            c->idct_add= ff_simple_idct_add_neon;
+            c->idct    = ff_simple_idct_neon;
+            c->idct_permutation_type = FF_NO_IDCT_PERM;
 #endif
+        }
     }
 
     c->put_pixels_tab[0][0] = put_pixels16_arm;
@@ -271,7 +308,17 @@
     c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
     c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
 
+#ifdef HAVE_ARMV5TE
+    c->prefetch = prefetch_arm;
+#endif
+
 #ifdef HAVE_IWMMXT
     dsputil_init_iwmmxt(c, avctx);
 #endif
+#ifdef HAVE_ARMVFP
+    ff_float_init_arm_vfp(c, avctx);
+#endif
+#ifdef HAVE_NEON
+    ff_dsputil_init_neon(c, avctx);
+#endif
 }
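
Note that dsputil_init_armv4l() only fills in function pointers; decoders then call through the DSPContext table. A minimal usage sketch under the API shown in this patch (the block and dest buffers here are illustrative only):

    /* Sketch: how the pointer table installed above is consumed.
     * idct_put performs the IDCT and stores clamped pixels to dest. */
    void idct_demo(AVCodecContext *avctx, DCTELEM block[64],
                   uint8_t *dest, int line_size)
    {
        DSPContext c;
        avctx->idct_algo = FF_IDCT_AUTO;  /* resolved to NEON/ARMv6/... above,
                                             but only when avctx->lowres == 0 */
        dsputil_init_armv4l(&c, avctx);
        c.idct_put(dest, line_size, block);
    }
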
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S
--- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S  2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S       2008-07-24 19:54:00.753198000 +0200
@@ -19,6 +19,13 @@
 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 @
 
+#include "config.h"
+
+#ifndef HAVE_PLD
+.macro pld reg
+.endm
+#endif
+
 .macro  ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
         mov \Rd0, \Rn0, lsr #(\shift * 8)
         mov \Rd1, \Rn1, lsr #(\shift * 8)
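
The .macro pld / .endm pair above makes every pld in this file assemble to nothing when the target core lacks the prefetch instruction. The same fallback idiom, expressed in C preprocessor terms (PREFETCH is a hypothetical name, not from the patch):

    #ifdef HAVE_PLD
    #define PREFETCH(p) asm volatile("pld [%0]" :: "r"(p))
    #else
    #define PREFETCH(p) ((void)0)  /* no-op on cores without pld */
    #endif
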
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c
--- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c      2008-07-24 19:54:00.753198000 +0200
@@ -19,10 +19,10 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "dsputil.h"
+#include "libavcodec/dsputil.h"
 
 #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
-#define SET_RND(regd)  __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
+#define SET_RND(regd)  asm volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
 #define WAVG2B "wavg2b"
 #include "dsputil_iwmmxt_rnd.h"
 #undef DEF
@@ -30,7 +30,7 @@
 #undef WAVG2B
 
 #define DEF(x, y) x ## _ ## y ##_iwmmxt
-#define SET_RND(regd)  __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
+#define SET_RND(regd)  asm volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
 #define WAVG2B "wavg2br"
 #include "dsputil_iwmmxt_rnd.h"
 #undef DEF
@@ -89,7 +89,7 @@
 {
     uint8_t *pixels2 = pixels + line_size;
 
-    __asm__ __volatile__ (
+    asm volatile (
         "mov            r12, #4                 \n\t"
         "1:                                     \n\t"
         "pld            [%[pixels], %[line_size2]]              \n\t"
@@ -125,7 +125,7 @@
 
 static void clear_blocks_iwmmxt(DCTELEM *blocks)
 {
-    __asm __volatile(
+    asm volatile(
                 "wzero wr0                      \n\t"
                 "mov r1, #(128 * 6 / 32)        \n\t"
                 "1:                             \n\t"
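
__asm__ __volatile__ and asm volatile are the same GCC extended-asm construct; the patch merely switches to the shorter spelling used throughout FFmpeg. For reference, the general shape of such a statement is (a standalone illustration, not code from the patch):

    int add_demo(int a, int b)
    {
        int r;
        asm volatile("add %0, %1, %2"   /* template */
                     : "=r"(r)          /* outputs  */
                     : "r"(a), "r"(b)   /* inputs   */
                     : "cc");           /* clobbers */
        return r;
    }
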
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h
--- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h     2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h  2008-07-24 19:54:01.023198000 +0200
@@ -19,13 +19,14 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#ifndef FFMPEG_DSPUTIL_IWMMXT_RND_H
-#define FFMPEG_DSPUTIL_IWMMXT_RND_H
+/* This header intentionally has no multiple inclusion guards. It is meant to
+ * be included multiple times and generates different code depending on the
+ * value of certain #defines. */
 
 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 {
     int stride = line_size;
-    __asm__ __volatile__ (
+    asm volatile (
         "and r12, %[pixels], #7 \n\t"
         "bic %[pixels], %[pixels], #7 \n\t"
         "tmcr wcgr1, r12 \n\t"
@@ -59,7 +60,7 @@
 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 {
     int stride = line_size;
-    __asm__ __volatile__ (
+    asm volatile (
         "and r12, %[pixels], #7 \n\t"
         "bic %[pixels], %[pixels], #7 \n\t"
         "tmcr wcgr1, r12 \n\t"
@@ -101,7 +102,7 @@
 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 {
     int stride = line_size;
-    __asm__ __volatile__ (
+    asm volatile (
         "and r12, %[pixels], #7 \n\t"
         "bic %[pixels], %[pixels], #7 \n\t"
         "tmcr wcgr1, r12 \n\t"
@@ -141,7 +142,7 @@
 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 {
     int stride = line_size;
-    __asm__ __volatile__ (
+    asm volatile (
         "pld [%[pixels]]                \n\t"
         "pld [%[pixels], #32]           \n\t"
         "pld [%[block]]                 \n\t"
@@ -200,7 +201,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "pld [%[pixels], #32]           \n\t"
         "and r12, %[pixels], #7         \n\t"
@@ -249,7 +250,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "pld [%[pixels], #32]           \n\t"
         "and r12, %[pixels], #7         \n\t"
@@ -310,7 +311,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "pld [%[pixels], #32]           \n\t"
         "pld [%[block]]                 \n\t"
@@ -371,7 +372,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "pld [%[pixels], #32]           \n\t"
         "pld [%[block]]                 \n\t"
@@ -447,7 +448,7 @@
     int stride = line_size;
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
-    __asm__ __volatile__(
+    asm volatile(
         "pld            [%[pixels]]                             \n\t"
         "pld            [%[pixels], #32]                        \n\t"
         "and            r12, %[pixels], #7                      \n\t"
@@ -501,7 +502,7 @@
     int stride = line_size;
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
        "pld [%[pixels], #32]           \n\t"
         "and r12, %[pixels], #7         \n\t"
@@ -558,7 +559,7 @@
     int stride = line_size;
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "pld [%[pixels], #32]           \n\t"
         "and r12, %[pixels], #7         \n\t"
@@ -626,7 +627,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "mov r12, #2                    \n\t"
         "pld [%[pixels], #32]           \n\t"
@@ -720,7 +721,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[pixels]]                \n\t"
         "mov r12, #2                    \n\t"
         "pld [%[pixels], #32]           \n\t"
@@ -862,7 +863,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[block]]                 \n\t"
         "pld [%[block], #32]            \n\t"
         "pld [%[pixels]]                \n\t"
@@ -966,7 +967,7 @@
     // [wr0 wr1 wr2 wr3] for previous line
     // [wr4 wr5 wr6 wr7] for current line
     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ __volatile__(
+    asm volatile(
         "pld [%[block]]                 \n\t"
         "pld [%[block], #32]            \n\t"
         "pld [%[pixels]]                \n\t"
@@ -1115,5 +1116,3 @@
         : [line_size]"r"(line_size)
         : "r12", "memory");
 }
-
-#endif /* FFMPEG_DSPUTIL_IWMMXT_RND_H */
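
The removed include guard is the point of the change above: dsputil_iwmmxt.c includes this header several times, redefining DEF() and WAVG2B between inclusions so that each pass stamps out a differently named, differently rounding set of functions. Schematically (mirroring the #define blocks already visible in dsputil_iwmmxt.c):

    #define DEF(x, y) x ## _no_rnd_ ## y ## _iwmmxt
    #define WAVG2B "wavg2b"              /* truncating average */
    #include "dsputil_iwmmxt_rnd.h"      /* emits put_no_rnd_pixels8_iwmmxt, ... */
    #undef DEF
    #undef WAVG2B

    #define DEF(x, y) x ## _ ## y ## _iwmmxt
    #define WAVG2B "wavg2br"             /* rounding average */
    #include "dsputil_iwmmxt_rnd.h"      /* emits put_pixels8_iwmmxt, ... */
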
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c mythtv/libs/libavcodec/armv4l/dsputil_neon.c
--- mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c   1970-01-01 01:00:00.000000000 +0100
+++ mythtv/libs/libavcodec/armv4l/dsputil_neon.c        2008-07-24 19:54:01.023198000 +0200
@@ -0,0 +1,397 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+
+extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                        int h, int x, int y);
+
+#define PUT_PIXELS_16_X2(vhadd)                                 \
+        "1:                                          \n\t"      \
+        "vld1.64   {d0,d1,d2}, [%[p]], %[line_size]  \n\t"      \
+        "vld1.64   {d4,d5,d6}, [%[p]], %[line_size]  \n\t"      \
+        "pld       [%[p]]                            \n\t"      \
+        "subs      %[h], %[h], #2                    \n\t"      \
+        "vext.8    q1, q0, q1, #1                    \n\t"      \
+        "vext.8    q3, q2, q3, #1                    \n\t"      \
+         vhadd".u8 q0, q0, q1                        \n\t"      \
+         vhadd".u8 q2, q2, q3                        \n\t"      \
+        "vst1.64   {d0,d1}, [%[b],:64], %[line_size] \n\t"      \
+        "vst1.64   {d4,d5}, [%[b],:64], %[line_size] \n\t"      \
+        "bne       1b                                \n\t"
+
+#define PUT_PIXELS_16_Y2(vhadd)                                 \
+        "add       %[p1], %[p0], %[line_size]         \n\t"     \
+        "lsl       %[l2], %[line_size], #1            \n\t"     \
+        "vld1.64   {d0,d1}, [%[p0]], %[l2]            \n\t"     \
+        "vld1.64   {d2,d3}, [%[p1]], %[l2]            \n\t"     \
+        "1:                                           \n\t"     \
+        "subs      %[h], %[h], #2                     \n\t"     \
+         vhadd".u8 q2, q0, q1                         \n\t"     \
+        "vst1.64   {d4,d5}, [%[b],:128], %[line_size] \n\t"     \
+        "vld1.64   {d0,d1}, [%[p0]],     %[l2]        \n\t"     \
+         vhadd".u8 q2, q0, q1                         \n\t"     \
+        "vst1.64   {d4,d5}, [%[b],:128], %[line_size] \n\t"     \
+        "vld1.64   {d2,d3}, [%[p1]],     %[l2]        \n\t"     \
+        "bne 1b                                       \n\t"
+
+#define PUT_PIXELS_16_XY2(vshrn, no_rnd)                        \
+        "lsl        %[l2], %[line_size], #1              \n\t"  \
+        "add        %[p1], %[p0], %[line_size]           \n\t"  \
+        "vld1.64    {d0,d1,d2}, [%[p0]], %[l2]           \n\t"  \
+        "vld1.64    {d4,d5,d6}, [%[p1]], %[l2]           \n\t"  \
+        "pld        [%[p0]]                              \n\t"  \
+        "pld        [%[p1]]                              \n\t"  \
+        "vext.8     q1,  q0, q1, #1                      \n\t"  \
+        "vext.8     q3,  q2, q3, #1                      \n\t"  \
+        "vaddl.u8   q8,  d0, d2                          \n\t"  \
+        "vaddl.u8   q10, d1, d3                          \n\t"  \
+        "vaddl.u8   q9,  d4, d6                          \n\t"  \
+        "vaddl.u8   q11, d5, d7                          \n\t"  \
+        "1:                                              \n\t"  \
+        "subs       %[h], %[h], #2                       \n\t"  \
+        "vld1.64    {d0,d1,d2}, [%[p0]], %[l2]           \n\t"  \
+        "vadd.u16   q12, q8, q9                          \n\t"  \
+        "pld        [%[p0]]                              \n\t"  \
+ no_rnd "vadd.u16   q12, q12, q13                        \n\t"  \
+        "vext.8     q15, q0, q1, #1                      \n\t"  \
+        "vadd.u16   q1, q10, q11                         \n\t"  \
+         vshrn".u16 d28, q12, #2                         \n\t"  \
+ no_rnd "vadd.u16   q1, q1, q13                          \n\t"  \
+         vshrn".u16 d29, q1, #2                          \n\t"  \
+        "vaddl.u8   q8, d0, d30                          \n\t"  \
+        "vld1.64    {d2,d3,d4}, [%[p1]], %[l2]           \n\t"  \
+        "vaddl.u8   q10, d1, d31                         \n\t"  \
+        "vst1.64    {d28,d29}, [%[b],:128], %[line_size] \n\t"  \
+        "vadd.u16   q12, q8, q9                          \n\t"  \
+        "pld        [%[p1]]                              \n\t"  \
+ no_rnd "vadd.u16   q12, q12, q13                        \n\t"  \
+        "vext.8     q2, q1, q2, #1                       \n\t"  \
+        "vadd.u16   q0, q10, q11                         \n\t"  \
+         vshrn".u16 d30, q12, #2                         \n\t"  \
+ no_rnd "vadd.u16   q0, q0, q13                          \n\t"  \
+         vshrn".u16 d31, q0, #2                          \n\t"  \
+        "vaddl.u8   q9, d2, d4                           \n\t"  \
+        "vst1.64    {d30,d31}, [%[b],:128], %[line_size] \n\t"  \
+        "vaddl.u8   q11, d3, d5                          \n\t"  \
+        "bgt     1b                                      \n\t"
+
+#define PUT_PIXELS_8_X2(vhadd)                          \
+        "1:                                       \n\t" \
+        "vld1.64   {d0,d1}, [%[p]], %[line_size]  \n\t" \
+        "vld1.64   {d2,d3}, [%[p]], %[line_size]  \n\t" \
+        "pld       [%[p]]                         \n\t" \
+        "subs      %[h], %[h], #2                 \n\t" \
+        "vext.8    d1, d0, d1, #1                 \n\t" \
+        "vext.8    d3, d2, d3, #1                 \n\t" \
+        "vswp      d1, d2                         \n\t" \
+         vhadd".u8 q0, q0, q1                     \n\t" \
+        "vst1.64   {d0}, [%[b],:64], %[line_size] \n\t" \
+        "vst1.64   {d1}, [%[b],:64], %[line_size] \n\t" \
+        "bne       1b                             \n\t"
+
+#define PUT_PIXELS_8_Y2(vhadd)                          \
+        "add       %[p1], %[p0], %[line_size]     \n\t" \
+        "lsl       %[l2], %[line_size], #1        \n\t" \
+        "vld1.64   {d0}, [%[p0]], %[l2]           \n\t" \
+        "vld1.64   {d1}, [%[p1]], %[l2]           \n\t" \
+        "1:                                       \n\t" \
+        "subs      %[h], %[h], #2                 \n\t" \
+         vhadd".u8 d4, d0, d1                     \n\t" \
+        "vst1.64   {d4}, [%[b],:64], %[line_size] \n\t" \
+        "vld1.64   {d0}, [%[p0]],    %[l2]        \n\t" \
+         vhadd".u8 d4, d0, d1                     \n\t" \
+        "vst1.64   {d4}, [%[b],:64], %[line_size] \n\t" \
+        "vld1.64   {d1}, [%[p1]],     %[l2]       \n\t" \
+        "bne 1b                                   \n\t"
+
+#define PUT_PIXELS8_XY2(vshrn, no_rnd)                          \
+        "lsl        %[l2],   %[line_size], #1       \n\t"       \
+        "add        %[p1],   %[p0], %[line_size]    \n\t"       \
+        "vld1.64    {d0,d1}, [%[p0]], %[l2]         \n\t"       \
+        "vld1.64    {d2,d3}, [%[p1]], %[l2]         \n\t"       \
+        "pld        [%[p0]]                         \n\t"       \
+        "pld        [%[p1]]                         \n\t"       \
+        "vext.8     d4, d0, d1, #1                  \n\t"       \
+        "vext.8     d6, d2, d3, #1                  \n\t"       \
+        "vaddl.u8   q8, d0, d4                      \n\t"       \
+        "vaddl.u8   q9, d2, d6                      \n\t"       \
+        "1:                                         \n\t"       \
+        "subs       %[h], %[h], #2                  \n\t"       \
+        "vld1.64    {d0,d1}, [%[p0]], %[l2]         \n\t"       \
+        "pld        [%[p0]]                         \n\t"       \
+        "vadd.u16   q10, q8, q9                     \n\t"       \
+        "vext.8     d4, d0, d1, #1                  \n\t"       \
+ no_rnd "vadd.u16   q10, q10, q11                   \n\t"       \
+        "vaddl.u8   q8, d0, d4                      \n\t"       \
+         vshrn".u16 d5, q10, #2                     \n\t"       \
+        "vld1.64    {d2,d3}, [%[p1]], %[l2]         \n\t"       \
+        "vadd.u16   q10, q8, q9                     \n\t"       \
+        "pld        [%[p1]]                         \n\t"       \
+ no_rnd "vadd.u16   q10, q10, q11                   \n\t"       \
+        "vst1.64    {d5}, [%[b],:64], %[line_size]  \n\t"       \
+         vshrn".u16 d7, q10, #2                     \n\t"       \
+        "vext.8     d6, d2, d3, #1                  \n\t"       \
+        "vaddl.u8   q9, d2, d6                      \n\t"       \
+        "vst1.64    {d7}, [%[b],:64], %[line_size]  \n\t"       \
+        "bgt     1b                                 \n\t"
+
+static void put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h)
+{
+    asm volatile(
+        "1:                                         \n\t"
+        "vld1.64 {d0,d1}, [%[pixels]], %[line_size] \n\t"
+        "vld1.64 {d2,d3}, [%[pixels]], %[line_size] \n\t"
+        "vld1.64 {d4,d5}, [%[pixels]], %[line_size] \n\t"
+        "vld1.64 {d6,d7}, [%[pixels]], %[line_size] \n\t"
+        "pld     [%[pixels]]                        \n\t"
+        "subs    %[h], %[h], #4                     \n\t"
+        "vst1.64 {d0,d1}, [%[block],:128], %[line_size]  \n\t"
+        "vst1.64 {d2,d3}, [%[block],:128], %[line_size]  \n\t"
+        "vst1.64 {d4,d5}, [%[block],:128], %[line_size]  \n\t"
+        "vst1.64 {d6,d7}, [%[block],:128], %[line_size]  \n\t"
+        "bne     1b                                 \n\t"
+        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory");
+}
+
+static void put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+                                 int line_size, int h)
+{
+    asm volatile(
+        PUT_PIXELS_16_X2("vrhadd")
+        : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "memory");
+}
+
+static void put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+                                 int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        PUT_PIXELS_16_Y2("vrhadd")
+        : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "memory");
+}
+
+static void put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                                  int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        PUT_PIXELS_16_XY2("vrshrn", "@")
+        : [b]"+r"(block),
+          [p0]"+r"(pixels),
+          [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+          "d28", "d29", "d30", "d31",
+          "q8", "q9", "q10", "q11", "q12", "memory");
+}
+
+static void put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h)
+{
+    asm volatile(
+        "1:                                 \n\t"
+        "vld1.64 {d0}, [%[p]], %[line_size] \n\t"
+        "vld1.64 {d1}, [%[p]], %[line_size] \n\t"
+        "vld1.64 {d2}, [%[p]], %[line_size] \n\t"
+        "vld1.64 {d3}, [%[p]], %[line_size] \n\t"
+        "subs    %[h], %[h], #4             \n\t"
+        "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t"
+        "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t"
+        "vst1.64 {d2}, [%[b],:64], %[line_size] \n\t"
+        "vst1.64 {d3}, [%[b],:64], %[line_size] \n\t"
+        "bne     1b                         \n\t"
+        : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "memory");
+}
+
+static void put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+                                int line_size, int h)
+{
+    asm volatile(
+        PUT_PIXELS_8_X2("vrhadd")
+        : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "memory");
+}
+
+static void put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+                                int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        PUT_PIXELS_8_Y2("vrhadd")
+        : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d4", "memory");
+}
+
+static void put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                                 int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        PUT_PIXELS8_XY2("vrshrn", "@")
+        : [b]"+r"(block),
+          [p0]"+r"(pixels),
+          [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d6", "d7",
+          "q8", "q9", "q10", "memory");
+}
+
+static void put_no_rnd_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
+                                        int line_size, int h)
+{
+    asm volatile(
+        PUT_PIXELS_16_X2("vhadd")
+        : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "memory");
+}
+
+static void put_no_rnd_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
+                                        int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        PUT_PIXELS_16_Y2("vhadd")
+        : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "memory");
+}
+
+static void put_no_rnd_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                                         int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        "vmov.i16   q13, #1                         \n\t"
+        PUT_PIXELS_16_XY2("vshrn", "")
+        : [b]"+r"(block),
+          [p0]"+r"(pixels),
+          [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+          "d28", "d29", "d30", "d31",
+          "q8", "q9", "q10", "q11", "q12", "q13", "memory");
+}
+
+static void put_no_rnd_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
+                                       int line_size, int h)
+{
+    asm volatile(
+        PUT_PIXELS_8_X2("vhadd")
+        : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "memory");
+}
+
+static void put_no_rnd_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
+                                       int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        PUT_PIXELS_8_Y2("vhadd")
+        : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d4", "memory");
+}
+
+static void put_no_rnd_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
+                                        int line_size, int h)
+{
+    const uint8_t *p1;
+    int l2;
+
+    asm volatile(
+        "vmov.i16   q11, #1                         \n\t"
+        PUT_PIXELS8_XY2("vshrn", "")
+        : [b]"+r"(block),
+          [p0]"+r"(pixels),
+          [p1]"=&r"(p1), [h]"+r"(h),
+          [l2]"=&r"(l2)
+        : [line_size]"r"(line_size)
+        : "d0", "d1", "d2", "d3", "d4", "d6", "d7",
+          "q8", "q9", "q10", "q11", "memory");
+}
+
+static void put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, int stride)
+{
+    put_pixels16_neon(dst, src, stride, 16);
+}
+
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+{
+    c->put_pixels_tab[0][0] = put_pixels16_neon;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_neon;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_neon;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_neon;
+    c->put_pixels_tab[1][0] = put_pixels8_neon;
+    c->put_pixels_tab[1][1] = put_pixels8_x2_neon;
+    c->put_pixels_tab[1][2] = put_pixels8_y2_neon;
+    c->put_pixels_tab[1][3] = put_pixels8_xy2_neon;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_neon;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_neon;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_neon;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_neon;
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_neon;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_neon;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_neon;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_neon;
+
+    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+
+    c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_neon;
+}
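
The vrhadd/vhadd split in the macros above is exactly the rnd/no_rnd distinction of dsputil's half-pel primitives. As a reference point, the scalar C equivalent of the 16-wide "x2" (horizontal half-pel) case is roughly (a sketch; the real C fallbacks live in dsputil.c):

    static void put_pixels16_x2_ref(uint8_t *block, const uint8_t *pixels,
                                    int line_size, int h)
    {
        int i, j;
        for (i = 0; i < h; i++) {
            for (j = 0; j < 16; j++)
                block[j] = (pixels[j] + pixels[j + 1] + 1) >> 1; /* vrhadd */
                /* the no_rnd variant drops the +1: (a + b) >> 1, i.e. vhadd */
            block  += line_size;
            pixels += line_size;
        }
    }
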
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c mythtv/libs/libavcodec/armv4l/float_arm_vfp.c
--- mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c  1970-01-01 01:00:00.000000000 +0100
+++ mythtv/libs/libavcodec/armv4l/float_arm_vfp.c       2008-07-24 19:54:01.023198000 +0200
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/dsputil.h"
+
+/*
+ * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
+ * throughput for almost all the instructions (except for double precision
+ * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
+ * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
+ * important for performance. One more interesting feature is that VFP has
+ * independent load/store and arithmetics pipelines, so it is possible to make
+ * them work simultaneously and get more than 1 operation per cycle. Load/store
+ * pipeline can process 2 single precision floating point values per cycle and
+ * supports bulk loads and stores for large sets of registers. Arithmetic operations
+ * can be done on vectors, which allows to keep the arithmetics pipeline busy,
+ * while the processor may issue and execute other instructions. Detailed
+ * optimization manuals can be found at http://www.arm.com
+ */
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_c' function.
+ * Assume that len is a positive number and is multiple of 8
+ */
+static void vector_fmul_vfp(float *dst, const float *src, int len)
+{
+    int tmp;
+    asm volatile(
+        "fmrx       %[tmp], fpscr\n\t"
+        "orr        %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
+        "fmxr       fpscr, %[tmp]\n\t"
+
+        "fldmias    %[dst_r]!, {s0-s3}\n\t"
+        "fldmias    %[src]!, {s8-s11}\n\t"
+        "fldmias    %[dst_r]!, {s4-s7}\n\t"
+        "fldmias    %[src]!, {s12-s15}\n\t"
+        "fmuls      s8, s0, s8\n\t"
+    "1:\n\t"
+        "subs       %[len], %[len], #16\n\t"
+        "fmuls      s12, s4, s12\n\t"
+        "fldmiasge  %[dst_r]!, {s16-s19}\n\t"
+        "fldmiasge  %[src]!, {s24-s27}\n\t"
+        "fldmiasge  %[dst_r]!, {s20-s23}\n\t"
+        "fldmiasge  %[src]!, {s28-s31}\n\t"
+        "fmulsge    s24, s16, s24\n\t"
+        "fstmias    %[dst_w]!, {s8-s11}\n\t"
+        "fstmias    %[dst_w]!, {s12-s15}\n\t"
+        "fmulsge    s28, s20, s28\n\t"
+        "fldmiasgt  %[dst_r]!, {s0-s3}\n\t"
+        "fldmiasgt  %[src]!, {s8-s11}\n\t"
+        "fldmiasgt  %[dst_r]!, {s4-s7}\n\t"
+        "fldmiasgt  %[src]!, {s12-s15}\n\t"
+        "fmulsge    s8, s0, s8\n\t"
+        "fstmiasge  %[dst_w]!, {s24-s27}\n\t"
+        "fstmiasge  %[dst_w]!, {s28-s31}\n\t"
+        "bgt        1b\n\t"
+
+        "bic        %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
+        "fmxr       fpscr, %[tmp]\n\t"
+        : [dst_w] "+&r" (dst), [dst_r] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len), [tmp] "=&r" (tmp)
+        :
+        : "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
+          "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15",
+          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
+          "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
+          "cc", "memory");
+}
+
+/**
+ * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
+ * Assume that len is a positive number and is multiple of 8
+ */
+static void vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len)
+{
+    src1 += len;
+    asm volatile(
+        "fldmdbs    %[src1]!, {s0-s3}\n\t"
+        "fldmias    %[src0]!, {s8-s11}\n\t"
+        "fldmdbs    %[src1]!, {s4-s7}\n\t"
+        "fldmias    %[src0]!, {s12-s15}\n\t"
+        "fmuls      s8, s3, s8\n\t"
+        "fmuls      s9, s2, s9\n\t"
+        "fmuls      s10, s1, s10\n\t"
+        "fmuls      s11, s0, s11\n\t"
+    "1:\n\t"
+        "subs       %[len], %[len], #16\n\t"
+        "fldmdbsge  %[src1]!, {s16-s19}\n\t"
+        "fmuls      s12, s7, s12\n\t"
+        "fldmiasge  %[src0]!, {s24-s27}\n\t"
+        "fmuls      s13, s6, s13\n\t"
+        "fldmdbsge  %[src1]!, {s20-s23}\n\t"
+        "fmuls      s14, s5, s14\n\t"
+        "fldmiasge  %[src0]!, {s28-s31}\n\t"
+        "fmuls      s15, s4, s15\n\t"
+        "fmulsge    s24, s19, s24\n\t"
+        "fldmdbsgt  %[src1]!, {s0-s3}\n\t"
+        "fmulsge    s25, s18, s25\n\t"
+        "fstmias    %[dst]!, {s8-s13}\n\t"
+        "fmulsge    s26, s17, s26\n\t"
+        "fldmiasgt  %[src0]!, {s8-s11}\n\t"
+        "fmulsge    s27, s16, s27\n\t"
+        "fmulsge    s28, s23, s28\n\t"
+        "fldmdbsgt  %[src1]!, {s4-s7}\n\t"
+        "fmulsge    s29, s22, s29\n\t"
+        "fstmias    %[dst]!, {s14-s15}\n\t"
+        "fmulsge    s30, s21, s30\n\t"
+        "fmulsge    s31, s20, s31\n\t"
+        "fmulsge    s8, s3, s8\n\t"
+        "fldmiasgt  %[src0]!, {s12-s15}\n\t"
+        "fmulsge    s9, s2, s9\n\t"
+        "fmulsge    s10, s1, s10\n\t"
+        "fstmiasge  %[dst]!, {s24-s27}\n\t"
+        "fmulsge    s11, s0, s11\n\t"
+        "fstmiasge  %[dst]!, {s28-s31}\n\t"
+        "bgt        1b\n\t"
+
+        : [dst] "+&r" (dst), [src0] "+&r" (src0), [src1] "+&r" (src1), [len] "+&r" (len)
+        :
+        : "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
+          "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15",
+          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
+          "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
+          "cc", "memory");
+}
+
+#ifdef HAVE_ARMV6
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assume that len is a positive number and is multiple of 8, destination
+ * buffer is at least 4 bytes aligned (8 bytes alignment is better for
+ * performance), little endian byte sex
+ */
+void float_to_int16_vfp(int16_t *dst, const float *src, int len)
+{
+    asm volatile(
+        "fldmias    %[src]!, {s16-s23}\n\t"
+        "ftosis     s0, s16\n\t"
+        "ftosis     s1, s17\n\t"
+        "ftosis     s2, s18\n\t"
+        "ftosis     s3, s19\n\t"
+        "ftosis     s4, s20\n\t"
+        "ftosis     s5, s21\n\t"
+        "ftosis     s6, s22\n\t"
+        "ftosis     s7, s23\n\t"
+    "1:\n\t"
+        "subs       %[len], %[len], #8\n\t"
+        "fmrrs      r3, r4, {s0, s1}\n\t"
+        "fmrrs      r5, r6, {s2, s3}\n\t"
+        "fmrrs      r7, r8, {s4, s5}\n\t"
+        "fmrrs      ip, lr, {s6, s7}\n\t"
+        "fldmiasgt  %[src]!, {s16-s23}\n\t"
+        "ssat       r4, #16, r4\n\t"
+        "ssat       r3, #16, r3\n\t"
+        "ssat       r6, #16, r6\n\t"
+        "ssat       r5, #16, r5\n\t"
+        "pkhbt      r3, r3, r4, lsl #16\n\t"
+        "pkhbt      r4, r5, r6, lsl #16\n\t"
+        "ftosisgt   s0, s16\n\t"
+        "ftosisgt   s1, s17\n\t"
+        "ftosisgt   s2, s18\n\t"
+        "ftosisgt   s3, s19\n\t"
+        "ftosisgt   s4, s20\n\t"
+        "ftosisgt   s5, s21\n\t"
+        "ftosisgt   s6, s22\n\t"
+        "ftosisgt   s7, s23\n\t"
+        "ssat       r8, #16, r8\n\t"
+        "ssat       r7, #16, r7\n\t"
+        "ssat       lr, #16, lr\n\t"
+        "ssat       ip, #16, ip\n\t"
+        "pkhbt      r5, r7, r8, lsl #16\n\t"
+        "pkhbt      r6, ip, lr, lsl #16\n\t"
+        "stmia      %[dst]!, {r3-r6}\n\t"
+        "bgt        1b\n\t"
+
+        : [dst] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len)
+        :
+        : "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",
+          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
+          "r3", "r4", "r5", "r6", "r7", "r8", "ip", "lr",
+          "cc", "memory");
+}
+#endif
+
+void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
+{
+    c->vector_fmul = vector_fmul_vfp;
+    c->vector_fmul_reverse = vector_fmul_reverse_vfp;
+#ifdef HAVE_ARMV6
+    c->float_to_int16 = float_to_int16_vfp;
+#endif
+}
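
The VFP routines above vectorize simple scalar loops; per the vector_fmul_c naming in their comments, their C counterparts compute, in essence (sketch only; len is assumed positive and a multiple of 8, as documented):

    static void vector_fmul_ref(float *dst, const float *src, int len)
    {
        int i;
        for (i = 0; i < len; i++)
            dst[i] *= src[i];
    }

    /* vector_fmul_reverse: dst[i] = src0[i] * src1[len - 1 - i] */

    static void float_to_int16_ref(int16_t *dst, const float *src, int len)
    {
        int i;
        for (i = 0; i < len; i++) {
            int v = (int)src[i];        /* ftosis truncates toward zero */
            if (v >  32767) v =  32767; /* ssat #16 saturates to int16 */
            if (v < -32768) v = -32768;
            dst[i] = v;
        }
    }
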
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S mythtv/libs/libavcodec/armv4l/h264dsp_neon.S
--- mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S   1970-01-01 01:00:00.000000000 +0100
+++ mythtv/libs/libavcodec/armv4l/h264dsp_neon.S        2008-07-24 19:54:01.033198000 +0200
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+        .fpu neon
+
+        .text
+        .align
+        .global ff_put_h264_chroma_mc8_neon
+        .func   ff_put_h264_chroma_mc8_neon
+/* void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                    int h, int x, int y) */
+ff_put_h264_chroma_mc8_neon:
+        push      {r4-r7}
+        ldrd      r4, [sp, #16]
+
+        pld       [r1]
+        pld       [r1, r2]
+
+        muls      r7, r4, r5
+        rsb       r6, r7, r5, lsl #3
+        rsb       ip, r7, r4, lsl #3
+        sub       r4, r7, r4, lsl #3
+        sub       r4, r4, r5, lsl #3
+        add       r4, r4, #64
+
+        beq       2f
+
+        add       r5, r1, r2
+
+        vdup.8    d0, r4
+        lsl       r4, r2, #1
+        vdup.8    d1, ip
+        vld1.64   {d4,d5}, [r1], r4
+        vdup.8    d2, r6
+        vld1.64   {d6,d7}, [r5], r4
+        vdup.8    d3, r7
+
+        mov       r6, #32
+        vext.8    d5, d4, d5, #1
+        vdup.16   q12, r6
+        vext.8    d7, d6, d7, #1
+1:
+        pld       [r5]
+        vmull.u8  q8, d4, d0
+        vmlal.u8  q8, d5, d1
+        vld1.64   {d4,d5}, [r1], r4
+        vmlal.u8  q8, d6, d2
+        vext.8    d5, d4, d5, #1
+        vmlal.u8  q8, d7, d3
+        vmull.u8  q9, d6, d0
+        vadd.i16  q8, q8, q12
+        subs      r3, r3, #2
+        vmlal.u8  q9, d7, d1
+        vshrn.u16 d16, q8, #6
+        vld1.64   {d6,d7}, [r5], r4
+        vmlal.u8  q9, d4, d2
+        vmlal.u8  q9, d5, d3
+        pld       [r1]
+        vadd.i16  q9, q9, q12
+        vst1.64   {d16}, [r0,:64], r2
+        vshrn.u16 d17, q9, #6
+        vext.8    d7, d6, d7, #1
+        vst1.64   {d17}, [r0,:64], r2
+        bgt       1b
+
+        pop       {r4-r7}
+        bx        lr
+
+2:
+        tst       r6, r6
+        add       ip, ip, r6
+        vdup.8    d0, r4
+        vdup.8    d1, ip
+        mov       r6, #32
+        vdup.16   q12, r6
+
+        beq       4f
+
+        add       r5, r1, r2
+        lsl       r4, r2, #1
+        vld1.64   {d4}, [r1], r4
+        vld1.64   {d6}, [r5], r4
+3:
+        pld       [r5]
+        vmull.u8  q8, d4, d0
+        vmlal.u8  q8, d6, d1
+        vld1.64   {d4}, [r1], r4
+        vmull.u8  q9, d6, d0
+        vadd.i16  q8, q8, q12
+        vmlal.u8  q9, d4, d1
+        vshrn.u16 d16, q8, #6
+        vadd.i16  q9, q9, q12
+        vst1.64   {d16}, [r0,:64], r2
+        vshrn.u16 d17, q9, #6
+        subs      r3, r3, #2
+        vld1.64   {d6}, [r5], r4
+        pld       [r1]
+        vst1.64   {d17}, [r0,:64], r2
+        bgt       3b
+
+        pop       {r4-r7}
+        bx        lr
+
+4:
+        vld1.64   {d4,d5}, [r1], r2
+        vld1.64   {d6,d7}, [r1], r2
+        vext.8    d5, d4, d5, #1
+        vext.8    d7, d6, d7, #1
+5:
+        pld       [r1]
+        subs      r3, r3, #2
+        vmull.u8  q8, d4, d0
+        vmlal.u8  q8, d5, d1
+        vld1.64   {d4,d5}, [r1], r2
+        vmull.u8  q9, d6, d0
+        vmlal.u8  q9, d7, d1
+        pld       [r1]
+        vadd.i16  q8, q8, q12
+        vadd.i16  q9, q9, q12
+        vext.8    d5, d4, d5, #1
+        vshrn.u16 d16, q8, #6
+        vld1.64   {d6,d7}, [r1], r2
+        vshrn.u16 d17, q9, #6
+        vst1.64   {d16}, [r0,:64], r2
+        vext.8    d7, d6, d7, #1
+        vst1.64   {d17}, [r0,:64], r2
+        bgt       5b
+
+        pop       {r4-r7}
+        bx        lr
+        .endfunc
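
The prototype comment above determines the math: standard H.264 bilinear chroma interpolation with eighth-pel weights x and y (the code special-cases x == 0 or y == 0, where the filter degenerates to one dimension). A C reference sketch:

    static void put_h264_chroma_mc8_ref(uint8_t *dst, uint8_t *src, int stride,
                                        int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y), B = x * (8 - y);
        const int C = (8 - x) * y,       D = x * y;   /* A+B+C+D == 64 */
        int i, j;
        for (i = 0; i < h; i++) {
            for (j = 0; j < 8; j++)
                dst[j] = (A * src[j]          + B * src[j + 1] +
                          C * src[j + stride] + D * src[j + stride + 1] +
                          32) >> 6;             /* vadd q12 (=32), vshrn #6 */
            dst += stride;
            src += stride;
        }
    }
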
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c
--- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c  2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c       2008-07-24 19:54:01.263198000 +0200
@@ -18,9 +18,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "dsputil.h"
-#include "mpegvideo.h"
-#include "avcodec.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
 
 extern void MPV_common_init_iwmmxt(MpegEncContext *s);
 extern void MPV_common_init_armv5te(MpegEncContext *s);
@@ -28,7 +28,7 @@
 void MPV_common_init_armv4l(MpegEncContext *s)
 {
     /* IWMMXT support is a superset of armv5te, so
-     * allow optimised functions for armv5te unless
+     * allow optimized functions for armv5te unless
      * a better iwmmxt function exists
      */
 #ifdef HAVE_ARMV5TE
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c
--- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c      2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c   2008-07-24 19:54:01.263198000 +0200
@@ -19,9 +19,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "dsputil.h"
-#include "mpegvideo.h"
-#include "avcodec.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
 
 
 #ifdef ENABLE_ARM_TESTS
@@ -65,7 +65,7 @@
 ({ DCTELEM *xblock = xxblock; \
    int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \
    int xdata1, xdata2; \
-__asm__ __volatile__( \
+asm volatile( \
         "subs %[count], %[count], #2       \n\t" \
         "ble 2f                            \n\t" \
         "ldrd r4, [%[block], #0]           \n\t" \
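
The asm block above accelerates H.263-style dequantization; qmul and qadd scale each nonzero level and push it away from zero. The scalar logic it replaces is approximately (a sketch; the real functions also treat the intra DC coefficient separately):

    static void dct_unquantize_h263_ref(int16_t *block, int qmul, int qadd,
                                        int count)
    {
        int i;
        for (i = 0; i < count; i++) {
            int level = block[i];
            if (level)
                block[i] = level < 0 ? level * qmul - qadd
                                     : level * qmul + qadd;
        }
    }
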
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c
--- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c       2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c    2008-07-24 19:54:01.273198000 +0200
@@ -18,9 +18,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-#include "dsputil.h"
-#include "mpegvideo.h"
-#include "avcodec.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
 
 static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
                                              DCTELEM *block, int n, int qscale)
@@ -48,7 +48,7 @@
     else
         nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
-    __asm__ __volatile__ (
+    asm volatile (
 /*      "movd %1, %%mm6                 \n\t" //qmul */
 /*      "packssdw %%mm6, %%mm6          \n\t" */
 /*      "packssdw %%mm6, %%mm6          \n\t" */
diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S mythtv/libs/libavcodec/armv4l/simple_idct_arm.S
--- mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S        2008-07-23 12:19:05.000000000 +0200
+++ mythtv/libs/libavcodec/armv4l/simple_idct_arm.S     2008-07-24 19:54:01.503198000 +0200
@@ -79,7 +79,7 @@
 
 
 __row_loop:
-        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
+        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
         ldr r1, [r14, #0]        @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
         ldr r2, [r14, #4]        @ R2=(int32)(R12)[1]=ROWr32[1]
         ldr r3, [r14, #8]        @ R3=ROWr32[2]
@@ -421,7 +421,7 @@
         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
-        @@@@@ no optimisation here @@@@@
+        @@@@@ no optimization here @@@@@
         add r8, r6, r0           @ R8=a0+b0
         add r9, r2, r1           @ R9=a1+b1
         mov r8, r8, asr #COL_SHIFT
1243 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S mythtv/libs/libavcodec/armv4l/simple_idct_neon.S
1244 --- mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S       1970-01-01 01:00:00.000000000 +0100
1245 +++ mythtv/libs/libavcodec/armv4l/simple_idct_neon.S    2008-07-24 19:54:01.503198000 +0200
1246 @@ -0,0 +1,388 @@
1247 +/*
1248 + * ARM NEON IDCT
1249 + *
1250 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
1251 + *
1252 + * Based on Simple IDCT
1253 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
1254 + *
1255 + * This file is part of FFmpeg.
1256 + *
1257 + * FFmpeg is free software; you can redistribute it and/or
1258 + * modify it under the terms of the GNU Lesser General Public
1259 + * License as published by the Free Software Foundation; either
1260 + * version 2.1 of the License, or (at your option) any later version.
1261 + *
1262 + * FFmpeg is distributed in the hope that it will be useful,
1263 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1264 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
1265 + * Lesser General Public License for more details.
1266 + *
1267 + * You should have received a copy of the GNU Lesser General Public
1268 + * License along with FFmpeg; if not, write to the Free Software
1269 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1270 + */
1271 +
+#define W1  22725  //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16383  //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5  12873  //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4c ((1<<(COL_SHIFT-1))/W4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define w1 d0[0]
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
+
+        .fpu neon
+
+        .macro idct_col4_top
+        vmull.s16 q7,  d6,  w2    /* q7   = W2 * col[2] */
+        vmull.s16 q8,  d6,  w6    /* q8   = W6 * col[2] */
+        vmull.s16 q9,  d4,  w1    /* q9  = W1 * col[1] */
+        vadd.i32  q11, q15, q7
+        vmull.s16 q10, d4,  w3    /* q10 = W3 * col[1] */
+        vadd.i32  q12, q15, q8
+        vmull.s16 q5,  d4,  w5    /* q5  = W5 * col[1] */
+        vsub.i32  q13, q15, q8
+        vmull.s16 q6,  d4,  w7    /* q6  = W7 * col[1] */
+        vsub.i32  q14, q15, q7
+
+        vmlal.s16 q9,  d8, w3     /* q9  += W3 * col[3] */
+        vmlsl.s16 q10, d8, w7     /* q10 -= W7 * col[3] */
+        vmlsl.s16 q5,  d8, w1     /* q5  -= W1 * col[3] */
+        vmlsl.s16 q6,  d8, w5     /* q6  -= W5 * col[3] */
+        .endm
+
+        .macro idct_col4_mid1
+        vmull.s16 q7,  d3,  w4    /* q7 = W4 * col[4] */
+        vadd.i32  q11, q11, q7
+        vsub.i32  q12, q12, q7
+        vsub.i32  q13, q13, q7
+        vadd.i32  q14, q14, q7
+        .endm
+
+        .macro idct_col4_mid2
+        vmlal.s16 q9,  d5, w5     /* q9  += W5 * col[5] */
+        vmlsl.s16 q10, d5, w1     /* q10 -= W1 * col[5] */
+        vmlal.s16 q5,  d5, w7     /* q5  += W7 * col[5] */
+        vmlal.s16 q6,  d5, w3     /* q6  += W3 * col[5] */
+        .endm
+
+        .macro idct_col4_mid3
+        vmull.s16 q7,  d7, w6     /* q7 = W6 * col[6] */
+        vmull.s16 q8,  d7, w2     /* q8 = W2 * col[6] */
+        vadd.i32  q11, q11, q7
+        vsub.i32  q12, q12, q8
+        vadd.i32  q13, q13, q8
+        vsub.i32  q14, q14, q7
+        .endm
+
+        .macro idct_col4_mid4
+        vmlal.s16 q9,  d9, w7
+        vmlsl.s16 q10, d9, w5
+        vmlal.s16 q5,  d9, w3
+        vmlsl.s16 q6,  d9, w1
+        .endm
+
+        .macro idct_col4_mid
+        vmull.s16 q7,  d3,  w4    /* q7   = W4 * col[4] */
+        vmlal.s16 q9,  d5,  w5    /* q9  += W5 * col[5] */
+        vmlsl.s16 q10, d5,  w1    /* q10 -= W1 * col[5] */
+        vadd.i32  q11, q11, q7
+        vmull.s16 q8,  d7,  w2    /* q8   = W2 * col[6] */
+        vsub.i32  q12, q12, q7
+        vmlal.s16 q5,  d5,  w7    /* q5  += W7 * col[5] */
+        vsub.i32  q13, q13, q7
+        vmlal.s16 q6,  d5,  w3    /* q6  += W3 * col[5] */
+        vadd.i32  q14, q14, q7
+        vmull.s16 q7,  d7,  w6    /* q7   = W6 * col[6] */
+        vadd.i32  q11, q11, q7
+        vmlal.s16 q9,  d9,  w7
+        vsub.i32  q12, q12, q8
+        vmlsl.s16 q10, d9,  w5
+        vadd.i32  q13, q13, q8
+        vmlal.s16 q5,  d9,  w3
+        vsub.i32  q14, q14, q7
+        vmlsl.s16 q6,  d9,  w1
+        .endm
+
+        .macro idct_col4_end
+        vadd.i32 q3,  q11, q9
+        vadd.i32 q4,  q12, q10
+        vadd.i32 q7,  q13, q5
+        vadd.i32 q8,  q14, q6
+        vsub.i32 q11, q11, q9
+        vsub.i32 q12, q12, q10
+        vsub.i32 q13, q13, q5
+        vsub.i32 q14, q14, q6
+        .endm
+
+        .text
+        .align
+        .type idct_row4_neon, %function
+        .func idct_row4_neon
+idct_row4_neon:
+        vld1.64 {d2,d3}, [a3,:128]!
+        vld1.64 {d4,d5}, [a3,:128]!
+        vld1.64 {d6,d7}, [a3,:128]!
+        vld1.64 {d8,d9}, [a3,:128]!
+        add a3, a3, #-64
+
+        vmov.i32  q15, #(1<<(ROW_SHIFT-1))
+        vorr      d10, d3,  d5
+        vtrn.16   q1,  q2
+        vorr      d11, d7,  d9
+        vtrn.16   q3,  q4
+        vorr      d10, d10, d11
+        vtrn.32   q1,  q3
+        vmlal.s16 q15, d2,  w4    /* q15 += W4 * col[0] */
+        vtrn.32   q2,  q4
+        vmov      a4,  v1,  d10
+
+        idct_col4_top
+
+        orrs a4, a4, v1
+        beq 1f
+        idct_col4_mid
+1:
+        vadd.i32 q3,  q11, q9
+        vadd.i32 q4,  q12, q10
+        vshrn.i32 d2, q3,  #ROW_SHIFT
+        vadd.i32 q7,  q13, q5
+        vshrn.i32 d4, q4,  #ROW_SHIFT
+        vadd.i32 q8,  q14, q6
+        vshrn.i32 d6, q7,  #ROW_SHIFT
+        vsub.i32 q11, q11, q9
+        vshrn.i32 d8, q8,  #ROW_SHIFT
+        vsub.i32 q12, q12, q10
+        vshrn.i32 d9, q11, #ROW_SHIFT
+        vsub.i32 q13, q13, q5
+        vshrn.i32 d7, q12, #ROW_SHIFT
+        vsub.i32 q14, q14, q6
+        vshrn.i32 d5, q13, #ROW_SHIFT
+        vshrn.i32 d3, q14, #ROW_SHIFT
+
+        vtrn.16   q1, q2
+        vtrn.16   q3, q4
+        vtrn.32   q1, q3
+        vtrn.32   q2, q4
+
+        vst1.64 {d2,d3}, [a3,:128]!
+        vst1.64 {d4,d5}, [a3,:128]!
+        vst1.64 {d6,d7}, [a3,:128]!
+        vst1.64 {d8,d9}, [a3,:128]!
+
+        mov pc, lr
+        .endfunc
+
+        .align
+        .type idct_col4_neon, %function
+        .func idct_col4_neon
+idct_col4_neon:
+        mov ip, #16
+        vld1.64 {d2}, [a3,:64], ip /* d2 = col[0] */
+        vld1.64 {d4}, [a3,:64], ip /* d4 = col[1] */
+        vld1.64 {d6}, [a3,:64], ip /* d6 = col[2] */
+        vld1.64 {d8}, [a3,:64], ip /* d8 = col[3] */
+        vld1.64 {d3}, [a3,:64], ip /* d3 = col[4] */
+        vld1.64 {d5}, [a3,:64], ip /* d5 = col[5] */
+        vld1.64 {d7}, [a3,:64], ip /* d7 = col[6] */
+        vld1.64 {d9}, [a3,:64], ip /* d9 = col[7] */
+
+        vrev64.32 d11, d3
+        vrev64.32 d13, d5
+        vorr      d11, d3, d11
+        vrev64.32 d15, d7
+        vorr      d13, d5, d13
+        vrev64.32 d17, d9
+        vorr      d15, d7, d15
+        vmov.32   v1,  d11[0]
+        vmov.32   v2,  d13[0]
+        vorr      d17, d9, d17
+        vmov.32   v3,  d15[0]
+        vmov.32   ip,  d17[0]
+        vdup.16   d30, w4c
+        vadd.i16  d30, d30, d2
+        vmull.s16 q15, d30, w4 /* q15 = W4 * (col[0]+(1<<(COL_SHIFT-1))/W4) */
+
+        idct_col4_top
+        tst v1, v1
+        beq 1f
+        idct_col4_mid1
+1:      tst v2, v2
+        beq 2f
+        idct_col4_mid2
+2:      tst v3, v3
+        beq 3f
+        idct_col4_mid3
+3:      tst ip, ip
+        beq 4f
+        idct_col4_mid4
+4:
+        idct_col4_end
+
+        vshr.s32  q2, q3,  #COL_SHIFT
+        vshr.s32  q3, q4,  #COL_SHIFT
+        vmovn.i32 d2, q2
+        vshr.s32  q4, q7,  #COL_SHIFT
+        vmovn.i32 d3, q3
+        vshr.s32  q5, q8,  #COL_SHIFT
+        vmovn.i32 d4, q4
+        vshr.s32  q6, q14, #COL_SHIFT
+        vmovn.i32 d5, q5
+        vshr.s32  q7, q13, #COL_SHIFT
+        vmovn.i32 d6, q6
+        vshr.s32  q8, q12, #COL_SHIFT
+        vmovn.i32 d7, q7
+        vshr.s32  q9, q11, #COL_SHIFT
+        vmovn.i32 d8, q8
+        vmovn.i32 d9, q9
+
+        mov pc, lr
+        .endfunc
+
+        .macro idct_col4_st16
+        mov ip, #16
+        vst1.64 {d2}, [a3,:64], ip
+        vst1.64 {d3}, [a3,:64], ip
+        vst1.64 {d4}, [a3,:64], ip
+        vst1.64 {d5}, [a3,:64], ip
+        vst1.64 {d6}, [a3,:64], ip
+        vst1.64 {d7}, [a3,:64], ip
+        vst1.64 {d8}, [a3,:64], ip
+        vst1.64 {d9}, [a3,:64], ip
+        .endm
+
+        .align
+        .type idct_col4_add8, %function
+        .func idct_col4_add8
+idct_col4_add8:
+        vld1.32 {d10[0]}, [a1,:32], a2
+        vld1.32 {d10[1]}, [a1,:32], a2
+        vld1.32 {d11[0]}, [a1,:32], a2
+        vld1.32 {d11[1]}, [a1,:32], a2
+        vld1.32 {d12[0]}, [a1,:32], a2
+        vld1.32 {d12[1]}, [a1,:32], a2
+        vld1.32 {d13[0]}, [a1,:32], a2
+        vld1.32 {d13[1]}, [a1,:32], a2
+
+        vaddw.u8 q1, q1, d10
+        vaddw.u8 q2, q2, d11
+        vaddw.u8 q3, q3, d12
+        vaddw.u8 q4, q4, d13
+
+        sub a1, a1, a2, lsl #3
+        .endfunc              @ falls through into idct_col4_st8
+
+        .type idct_col4_st8, %function
+        .func idct_col4_st8
+idct_col4_st8:
+        vqmovun.s16 d2, q1
+        vqmovun.s16 d3, q2
+        vqmovun.s16 d4, q3
+        vqmovun.s16 d5, q4
+
+        vst1.32 {d2[0]}, [a1,:32], a2
+        vst1.32 {d2[1]}, [a1,:32], a2
+        vst1.32 {d3[0]}, [a1,:32], a2
+        vst1.32 {d3[1]}, [a1,:32], a2
+        vst1.32 {d4[0]}, [a1,:32], a2
+        vst1.32 {d4[1]}, [a1,:32], a2
+        vst1.32 {d5[0]}, [a1,:32], a2
+        vst1.32 {d5[1]}, [a1,:32], a2
+        mov pc, lr
+        .endfunc
+
+        .align 4
+const:  .short W1, W2, W3, W4, W5, W6, W7, W4c
+
+        .macro idct_start data
+        pld [\data]
+        pld [\data, #64]
+        push {v1-v3, lr}
+        vpush {d8-d15}
+        adr a4, const
+        vld1.64 {d0,d1}, [a4,:128]
+        .endm
+
+        .macro idct_end
+        vpop {d8-d15}
+        pop {v1-v3, pc}
+        .endm
+
+        .align
+        .global ff_simple_idct_neon
+        .type ff_simple_idct_neon, %function
+        .func ff_simple_idct_neon
+/* void ff_simple_idct_neon(DCTELEM *data); */
+ff_simple_idct_neon:
+        idct_start a1
+
+        mov a3, a1
+        bl idct_row4_neon
+        bl idct_row4_neon
+        add a3, a3, #-128     @ back to the start of the block
+        bl idct_col4_neon
+        add a3, a3, #-128
+        idct_col4_st16
+        add a3, a3, #-120     @ block start + 8 bytes: columns 4-7
+        bl idct_col4_neon
+        add a3, a3, #-128
+        idct_col4_st16
+
+        idct_end
+        .endfunc
+
+        .align
+        .global ff_simple_idct_put_neon
+        .type ff_simple_idct_put_neon, %function
+        .func ff_simple_idct_put_neon
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+ff_simple_idct_put_neon:
+        idct_start a3
+
+        bl idct_row4_neon
+        bl idct_row4_neon
+        add a3, a3, #-128
+        bl idct_col4_neon
+        bl idct_col4_st8
+        sub a1, a1, a2, lsl #3   @ dst back up 8 lines
+        add a1, a1, #4           @ right half: pixels 4-7
+        add a3, a3, #-120
+        bl idct_col4_neon
+        bl idct_col4_st8
+
+        idct_end
+        .endfunc
+
+        .align
+        .global ff_simple_idct_add_neon
+        .type ff_simple_idct_add_neon, %function
+        .func ff_simple_idct_add_neon
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
+ff_simple_idct_add_neon:
+        idct_start a3
+
+        bl idct_row4_neon
+        bl idct_row4_neon
+        add a3, a3, #-128
+        bl idct_col4_neon
+        bl idct_col4_add8
+        sub a1, a1, a2, lsl #3
+        add a1, a1, #4
+        add a3, a3, #-120
+        bl idct_col4_neon
+        bl idct_col4_add8
+
+        idct_end
+        .endfunc
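
Note: the file above keeps the fixed-point scheme of simple_idct.c: rows are
transformed with an 11-bit rounding shift, columns with a 20-bit shift, and
the column DC is pre-biased through W4c so that W4*(col[0]+W4c) already
carries the rounding constant 1<<(COL_SHIFT-1).  For reference, one row pass
in scalar C (a sketch only; names are illustrative, and the NEON code above
computes the same values four rows at a time):

    #include <stdint.h>

    #define W1 22725   /* round(cos(i*M_PI/16)*sqrt(2)*(1<<14)), i = 1..7 */
    #define W2 21407
    #define W3 19266
    #define W4 16383
    #define W5 12873
    #define W6 8867
    #define W7 4520
    #define ROW_SHIFT 11

    static void idct_row_ref(int16_t row[8])
    {
        const int bias = 1 << (ROW_SHIFT - 1);
        /* even part from columns 0/2/4/6 (includes the rounding bias) */
        int a0 = bias + W4 * row[0] + W2 * row[2] + W4 * row[4] + W6 * row[6];
        int a1 = bias + W4 * row[0] + W6 * row[2] - W4 * row[4] - W2 * row[6];
        int a2 = bias + W4 * row[0] - W6 * row[2] - W4 * row[4] + W2 * row[6];
        int a3 = bias + W4 * row[0] - W2 * row[2] + W4 * row[4] - W6 * row[6];
        /* odd part from columns 1/3/5/7 */
        int b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
        int b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
        int b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
        int b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];
        /* butterfly and rounding shift */
        row[0] = (a0 + b0) >> ROW_SHIFT;
        row[1] = (a1 + b1) >> ROW_SHIFT;
        row[2] = (a2 + b2) >> ROW_SHIFT;
        row[3] = (a3 + b3) >> ROW_SHIFT;
        row[4] = (a3 - b3) >> ROW_SHIFT;
        row[5] = (a2 - b2) >> ROW_SHIFT;
        row[6] = (a1 - b1) >> ROW_SHIFT;
        row[7] = (a0 - b0) >> ROW_SHIFT;
    }
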
diff -Nurd mythtv.orig/libs/libavcodec/avcodec.h mythtv/libs/libavcodec/avcodec.h
--- mythtv.orig/libs/libavcodec/avcodec.h       2008-07-23 12:19:11.000000000 +0200
+++ mythtv/libs/libavcodec/avcodec.h    2008-07-24 19:56:46.953198000 +0200
@@ -1328,6 +1328,8 @@
 #define FF_IDCT_SIMPLEARMV6   17
 #define FF_IDCT_SIMPLEVIS     18
 #define FF_IDCT_WMV2          19
+#define FF_IDCT_FAAN          20
+#define FF_IDCT_SIMPLENEON    21
 
     /**
      * slice count
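
Note: with the two new constants, an application can request the NEON IDCT
explicitly instead of leaving FF_IDCT_AUTO to pick it.  A minimal sketch
(hypothetical usage; it assumes a build where the NEON sources are compiled
in, otherwise the request matches no special case and the default IDCT stays
in place):

    AVCodec *codec = avcodec_find_decoder(CODEC_ID_MPEG2VIDEO);
    AVCodecContext *avctx = avcodec_alloc_context();

    avctx->idct_algo = FF_IDCT_SIMPLENEON;   /* instead of FF_IDCT_AUTO */
    if (avcodec_open(avctx, codec) < 0)
        return -1;
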
diff -Nurd mythtv.orig/libs/libavcodec/libavcodec.pro mythtv/libs/libavcodec/libavcodec.pro
--- mythtv.orig/libs/libavcodec/libavcodec.pro  2008-07-23 12:19:10.000000000 +0200
+++ mythtv/libs/libavcodec/libavcodec.pro       2008-07-24 19:54:01.503198000 +0200
@@ -413,6 +413,8 @@
 
 contains( HAVE_ARMV6, yes )      { SOURCES += armv4l/simple_idct_armv6.S }
 
+contains( HAVE_NEON, yes )       { SOURCES += armv4l/float_arm_vfp.c armv4l/simple_idct_neon.S armv4l/dsputil_neon.c armv4l/h264dsp_neon.S }
+
 contains( HAVE_VIS, yes ) {
     SOURCES += sparc/dsputil_vis.c
     SOURCES += sparc/simple_idct_vis.c
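
Note: contains() only tests a qmake variable, so HAVE_NEON has to be set
before this .pro file is processed; MythTV's configure step is the normal
place for that.  Purely as an illustration (hypothetical fragment, the real
assignment is expected to come from the build system), it could be enabled
with:

    # e.g. in settings.pro, or on the command line: qmake "HAVE_NEON=yes"
    HAVE_NEON = yes
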
diff -Nurd mythtv.orig/libs/libavcodec/utils.c mythtv/libs/libavcodec/utils.c
--- mythtv.orig/libs/libavcodec/utils.c 2008-07-23 12:19:10.000000000 +0200
+++ mythtv/libs/libavcodec/utils.c      2008-07-24 19:58:12.403198000 +0200
@@ -594,6 +594,7 @@
 {"sh4", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SH4, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"},
+{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"},
 {"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"},
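
Note: registering the named constant makes the new IDCT selectable as a
string wherever libavcodec's "idct" option is parsed, not only through the
numeric idct_algo field.  For example, a frontend that exposes these options
(such as an ffmpeg binary built from a tree carrying this patch) could
request it as:

    ffmpeg -idct simpleneon -i recording.mpg -f rawvideo out.yuv
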