1 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c mythtv/libs/libavcodec/armv4l/dsputil_arm.c
2 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-23 12:19:05.000000000 +0200
3 +++ mythtv/libs/libavcodec/armv4l/dsputil_arm.c 2008-07-24 19:54:00.753198000 +0200
5 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
9 +#include "libavcodec/dsputil.h"
15 extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
16 +extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
17 +extern void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
19 extern void j_rev_dct_ARM(DCTELEM *data);
20 extern void simple_idct_ARM(DCTELEM *data);
22 extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size,
25 +extern void ff_simple_idct_neon(DCTELEM *data);
26 +extern void ff_simple_idct_put_neon(uint8_t *dest, int line_size,
28 +extern void ff_simple_idct_add_neon(uint8_t *dest, int line_size,
32 static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
33 static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
39 +static void prefetch_arm(void *mem, int stride, int h)
43 + "subs %0, %0, #1 \n\t"
45 + "add %1, %1, %2 \n\t"
47 + : "+r"(h), "+r"(mem) : "r"(stride));
53 + return ENABLE_IWMMXT * MM_IWMMXT;
56 void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
58 int idct_algo= avctx->idct_algo;
60 ff_put_pixels_clamped = c->put_pixels_clamped;
61 ff_add_pixels_clamped = c->add_pixels_clamped;
63 - if(idct_algo == FF_IDCT_AUTO){
64 + if (avctx->lowres == 0) {
65 + if(idct_algo == FF_IDCT_AUTO){
67 - idct_algo = FF_IDCT_IPP;
68 + idct_algo = FF_IDCT_IPP;
69 +#elif defined(HAVE_NEON)
70 + idct_algo = FF_IDCT_SIMPLENEON;
71 #elif defined(HAVE_ARMV6)
72 - idct_algo = FF_IDCT_SIMPLEARMV6;
73 + idct_algo = FF_IDCT_SIMPLEARMV6;
74 #elif defined(HAVE_ARMV5TE)
75 - idct_algo = FF_IDCT_SIMPLEARMV5TE;
76 + idct_algo = FF_IDCT_SIMPLEARMV5TE;
78 - idct_algo = FF_IDCT_ARM;
79 + idct_algo = FF_IDCT_ARM;
84 - if(idct_algo==FF_IDCT_ARM){
85 - c->idct_put= j_rev_dct_ARM_put;
86 - c->idct_add= j_rev_dct_ARM_add;
87 - c->idct = j_rev_dct_ARM;
88 - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
89 - } else if (idct_algo==FF_IDCT_SIMPLEARM){
90 - c->idct_put= simple_idct_ARM_put;
91 - c->idct_add= simple_idct_ARM_add;
92 - c->idct = simple_idct_ARM;
93 - c->idct_permutation_type= FF_NO_IDCT_PERM;
94 + if(idct_algo==FF_IDCT_ARM){
95 + c->idct_put= j_rev_dct_ARM_put;
96 + c->idct_add= j_rev_dct_ARM_add;
97 + c->idct = j_rev_dct_ARM;
98 + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
99 + } else if (idct_algo==FF_IDCT_SIMPLEARM){
100 + c->idct_put= simple_idct_ARM_put;
101 + c->idct_add= simple_idct_ARM_add;
102 + c->idct = simple_idct_ARM;
103 + c->idct_permutation_type= FF_NO_IDCT_PERM;
105 - } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
106 - c->idct_put= ff_simple_idct_put_armv6;
107 - c->idct_add= ff_simple_idct_add_armv6;
108 - c->idct = ff_simple_idct_armv6;
109 - c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
110 + } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
111 + c->idct_put= ff_simple_idct_put_armv6;
112 + c->idct_add= ff_simple_idct_add_armv6;
113 + c->idct = ff_simple_idct_armv6;
114 + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
117 - } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
118 - c->idct_put= simple_idct_put_armv5te;
119 - c->idct_add= simple_idct_add_armv5te;
120 - c->idct = simple_idct_armv5te;
121 - c->idct_permutation_type = FF_NO_IDCT_PERM;
122 + } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
123 + c->idct_put= simple_idct_put_armv5te;
124 + c->idct_add= simple_idct_add_armv5te;
125 + c->idct = simple_idct_armv5te;
126 + c->idct_permutation_type = FF_NO_IDCT_PERM;
129 - } else if (idct_algo==FF_IDCT_IPP){
130 - c->idct_put= simple_idct_ipp_put;
131 - c->idct_add= simple_idct_ipp_add;
132 - c->idct = simple_idct_ipp;
133 - c->idct_permutation_type= FF_NO_IDCT_PERM;
134 + } else if (idct_algo==FF_IDCT_IPP){
135 + c->idct_put= simple_idct_ipp_put;
136 + c->idct_add= simple_idct_ipp_add;
137 + c->idct = simple_idct_ipp;
138 + c->idct_permutation_type= FF_NO_IDCT_PERM;
141 + } else if (idct_algo==FF_IDCT_SIMPLENEON){
142 + c->idct_put= ff_simple_idct_put_neon;
143 + c->idct_add= ff_simple_idct_add_neon;
144 + c->idct = ff_simple_idct_neon;
145 + c->idct_permutation_type = FF_NO_IDCT_PERM;
150 c->put_pixels_tab[0][0] = put_pixels16_arm;
152 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
153 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
156 + c->prefetch = prefetch_arm;
160 dsputil_init_iwmmxt(c, avctx);
163 + ff_float_init_arm_vfp(c, avctx);
166 + ff_dsputil_init_neon(c, avctx);
169 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S
170 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_arm_s.S 2008-07-23 12:19:05.000000000 +0200
171 +++ mythtv/libs/libavcodec/armv4l/dsputil_arm_s.S 2008-07-24 19:54:00.753198000 +0200
173 @ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
183 .macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
184 mov \Rd0, \Rn0, lsr #(\shift * 8)
185 mov \Rd1, \Rn1, lsr #(\shift * 8)
186 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c
187 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200
188 +++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt.c 2008-07-24 19:54:00.753198000 +0200
190 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
193 -#include "dsputil.h"
194 +#include "libavcodec/dsputil.h"
196 #define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
197 -#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
198 +#define SET_RND(regd) asm volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
199 #define WAVG2B "wavg2b"
200 #include "dsputil_iwmmxt_rnd.h"
205 #define DEF(x, y) x ## _ ## y ##_iwmmxt
206 -#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
207 +#define SET_RND(regd) asm volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
208 #define WAVG2B "wavg2br"
209 #include "dsputil_iwmmxt_rnd.h"
213 uint8_t *pixels2 = pixels + line_size;
215 - __asm__ __volatile__ (
219 "pld [%[pixels], %[line_size2]] \n\t"
222 static void clear_blocks_iwmmxt(DCTELEM *blocks)
227 "mov r1, #(128 * 6 / 32) \n\t"
229 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h
230 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h 2008-07-23 12:19:05.000000000 +0200
231 +++ mythtv/libs/libavcodec/armv4l/dsputil_iwmmxt_rnd.h 2008-07-24 19:54:01.023198000 +0200
233 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
236 -#ifndef FFMPEG_DSPUTIL_IWMMXT_RND_H
237 -#define FFMPEG_DSPUTIL_IWMMXT_RND_H
238 +/* This header intentionally has no multiple inclusion guards. It is meant to
239 + * be included multiple times and generates different code depending on the
240 + * value of certain #defines. */
242 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
244 int stride = line_size;
245 - __asm__ __volatile__ (
247 "and r12, %[pixels], #7 \n\t"
248 "bic %[pixels], %[pixels], #7 \n\t"
249 "tmcr wcgr1, r12 \n\t"
251 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
253 int stride = line_size;
254 - __asm__ __volatile__ (
256 "and r12, %[pixels], #7 \n\t"
257 "bic %[pixels], %[pixels], #7 \n\t"
258 "tmcr wcgr1, r12 \n\t"
260 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
262 int stride = line_size;
263 - __asm__ __volatile__ (
265 "and r12, %[pixels], #7 \n\t"
266 "bic %[pixels], %[pixels], #7 \n\t"
267 "tmcr wcgr1, r12 \n\t"
269 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
271 int stride = line_size;
272 - __asm__ __volatile__ (
274 "pld [%[pixels]] \n\t"
275 "pld [%[pixels], #32] \n\t"
276 "pld [%[block]] \n\t"
278 // [wr0 wr1 wr2 wr3] for previous line
279 // [wr4 wr5 wr6 wr7] for current line
280 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
281 - __asm__ __volatile__(
283 "pld [%[pixels]] \n\t"
284 "pld [%[pixels], #32] \n\t"
285 "and r12, %[pixels], #7 \n\t"
287 // [wr0 wr1 wr2 wr3] for previous line
288 // [wr4 wr5 wr6 wr7] for current line
289 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
290 - __asm__ __volatile__(
292 "pld [%[pixels]] \n\t"
293 "pld [%[pixels], #32] \n\t"
294 "and r12, %[pixels], #7 \n\t"
296 // [wr0 wr1 wr2 wr3] for previous line
297 // [wr4 wr5 wr6 wr7] for current line
298 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
299 - __asm__ __volatile__(
301 "pld [%[pixels]] \n\t"
302 "pld [%[pixels], #32] \n\t"
303 "pld [%[block]] \n\t"
305 // [wr0 wr1 wr2 wr3] for previous line
306 // [wr4 wr5 wr6 wr7] for current line
307 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
308 - __asm__ __volatile__(
310 "pld [%[pixels]] \n\t"
311 "pld [%[pixels], #32] \n\t"
312 "pld [%[block]] \n\t"
314 int stride = line_size;
315 // [wr0 wr1 wr2 wr3] for previous line
316 // [wr4 wr5 wr6 wr7] for current line
317 - __asm__ __volatile__(
319 "pld [%[pixels]] \n\t"
320 "pld [%[pixels], #32] \n\t"
321 "and r12, %[pixels], #7 \n\t"
323 int stride = line_size;
324 // [wr0 wr1 wr2 wr3] for previous line
325 // [wr4 wr5 wr6 wr7] for current line
326 - __asm__ __volatile__(
328 "pld [%[pixels]] \n\t"
329 "pld [%[pixels], #32] \n\t"
330 "and r12, %[pixels], #7 \n\t"
332 int stride = line_size;
333 // [wr0 wr1 wr2 wr3] for previous line
334 // [wr4 wr5 wr6 wr7] for current line
335 - __asm__ __volatile__(
337 "pld [%[pixels]] \n\t"
338 "pld [%[pixels], #32] \n\t"
339 "and r12, %[pixels], #7 \n\t"
341 // [wr0 wr1 wr2 wr3] for previous line
342 // [wr4 wr5 wr6 wr7] for current line
343 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
344 - __asm__ __volatile__(
346 "pld [%[pixels]] \n\t"
348 "pld [%[pixels], #32] \n\t"
350 // [wr0 wr1 wr2 wr3] for previous line
351 // [wr4 wr5 wr6 wr7] for current line
352 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
353 - __asm__ __volatile__(
355 "pld [%[pixels]] \n\t"
357 "pld [%[pixels], #32] \n\t"
359 // [wr0 wr1 wr2 wr3] for previous line
360 // [wr4 wr5 wr6 wr7] for current line
361 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
362 - __asm__ __volatile__(
364 "pld [%[block]] \n\t"
365 "pld [%[block], #32] \n\t"
366 "pld [%[pixels]] \n\t"
368 // [wr0 wr1 wr2 wr3] for previous line
369 // [wr4 wr5 wr6 wr7] for current line
370 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
371 - __asm__ __volatile__(
373 "pld [%[block]] \n\t"
374 "pld [%[block], #32] \n\t"
375 "pld [%[pixels]] \n\t"
376 @@ -1115,5 +1116,3 @@
377 : [line_size]"r"(line_size)
381 -#endif /* FFMPEG_DSPUTIL_IWMMXT_RND_H */
382 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c mythtv/libs/libavcodec/armv4l/dsputil_neon.c
383 --- mythtv.orig/libs/libavcodec/armv4l/dsputil_neon.c 1970-01-01 01:00:00.000000000 +0100
384 +++ mythtv/libs/libavcodec/armv4l/dsputil_neon.c 2008-07-24 19:54:01.023198000 +0200
387 + * ARM NEON optimised DSP functions
388 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
390 + * This file is part of FFmpeg.
392 + * FFmpeg is free software; you can redistribute it and/or
393 + * modify it under the terms of the GNU Lesser General Public
394 + * License as published by the Free Software Foundation; either
395 + * version 2.1 of the License, or (at your option) any later version.
397 + * FFmpeg is distributed in the hope that it will be useful,
398 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
399 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
400 + * Lesser General Public License for more details.
402 + * You should have received a copy of the GNU Lesser General Public
403 + * License along with FFmpeg; if not, write to the Free Software
404 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
409 +#include "libavcodec/avcodec.h"
410 +#include "libavcodec/dsputil.h"
412 +extern void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
413 + int h, int x, int y);
415 +#define PUT_PIXELS_16_X2(vhadd) \
417 + "vld1.64 {d0,d1,d2}, [%[p]], %[line_size] \n\t" \
418 + "vld1.64 {d4,d5,d6}, [%[p]], %[line_size] \n\t" \
419 + "pld [%[p]] \n\t" \
420 + "subs %[h], %[h], #2 \n\t" \
421 + "vext.8 q1, q0, q1, #1 \n\t" \
422 + "vext.8 q3, q2, q3, #1 \n\t" \
423 + vhadd".u8 q0, q0, q1 \n\t" \
424 + vhadd".u8 q2, q2, q3 \n\t" \
425 + "vst1.64 {d0,d1}, [%[b],:64], %[line_size] \n\t" \
426 + "vst1.64 {d4,d5}, [%[b],:64], %[line_size] \n\t" \
429 +#define PUT_PIXELS_16_Y2(vhadd) \
430 + "add %[p1], %[p0], %[line_size] \n\t" \
431 + "lsl %[l2], %[line_size], #1 \n\t" \
432 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
433 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
435 + "subs %[h], %[h], #2 \n\t" \
436 + vhadd".u8 q2, q0, q1 \n\t" \
437 + "vst1.64 {d4,d5}, [%[b],:128], %[line_size] \n\t" \
438 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
439 + vhadd".u8 q2, q0, q1 \n\t" \
440 + "vst1.64 {d4,d5}, [%[b],:128], %[line_size] \n\t" \
441 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
444 +#define PUT_PIXELS_16_XY2(vshrn, no_rnd) \
445 + "lsl %[l2], %[line_size], #1 \n\t" \
446 + "add %[p1], %[p0], %[line_size] \n\t" \
447 + "vld1.64 {d0,d1,d2}, [%[p0]], %[l2] \n\t" \
448 + "vld1.64 {d4,d5,d6}, [%[p1]], %[l2] \n\t" \
449 + "pld [%[p0]] \n\t" \
450 + "pld [%[p1]] \n\t" \
451 + "vext.8 q1, q0, q1, #1 \n\t" \
452 + "vext.8 q3, q2, q3, #1 \n\t" \
453 + "vaddl.u8 q8, d0, d2 \n\t" \
454 + "vaddl.u8 q10, d1, d3 \n\t" \
455 + "vaddl.u8 q9, d4, d6 \n\t" \
456 + "vaddl.u8 q11, d5, d7 \n\t" \
458 + "subs %[h], %[h], #2 \n\t" \
459 + "vld1.64 {d0,d1,d2}, [%[p0]], %[l2] \n\t" \
460 + "vadd.u16 q12, q8, q9 \n\t" \
461 + "pld [%[p0]] \n\t" \
462 + no_rnd "vadd.u16 q12, q12, q13 \n\t" \
463 + "vext.8 q15, q0, q1, #1 \n\t" \
464 + "vadd.u16 q1, q10, q11 \n\t" \
465 + vshrn".u16 d28, q12, #2 \n\t" \
466 + no_rnd "vadd.u16 q1, q1, q13 \n\t" \
467 + vshrn".u16 d29, q1, #2 \n\t" \
468 + "vaddl.u8 q8, d0, d30 \n\t" \
469 + "vld1.64 {d2,d3,d4}, [%[p1]], %[l2] \n\t" \
470 + "vaddl.u8 q10, d1, d31 \n\t" \
471 + "vst1.64 {d28,d29}, [%[b],:128], %[line_size] \n\t" \
472 + "vadd.u16 q12, q8, q9 \n\t" \
473 + "pld [%[p1]] \n\t" \
474 + no_rnd "vadd.u16 q12, q12, q13 \n\t" \
475 + "vext.8 q2, q1, q2, #1 \n\t" \
476 + "vadd.u16 q0, q10, q11 \n\t" \
477 + vshrn".u16 d30, q12, #2 \n\t" \
478 + no_rnd "vadd.u16 q0, q0, q13 \n\t" \
479 + vshrn".u16 d31, q0, #2 \n\t" \
480 + "vaddl.u8 q9, d2, d4 \n\t" \
481 + "vst1.64 {d30,d31}, [%[b],:128], %[line_size] \n\t" \
482 + "vaddl.u8 q11, d3, d5 \n\t" \
485 +#define PUT_PIXELS_8_X2(vhadd) \
487 + "vld1.64 {d0,d1}, [%[p]], %[line_size] \n\t" \
488 + "vld1.64 {d2,d3}, [%[p]], %[line_size] \n\t" \
489 + "pld [%[p]] \n\t" \
490 + "subs %[h], %[h], #2 \n\t" \
491 + "vext.8 d1, d0, d1, #1 \n\t" \
492 + "vext.8 d3, d2, d3, #1 \n\t" \
493 + "vswp d1, d2 \n\t" \
494 + vhadd".u8 q0, q0, q1 \n\t" \
495 + "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t" \
496 + "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t" \
499 +#define PUT_PIXELS_8_Y2(vhadd) \
500 + "add %[p1], %[p0], %[line_size] \n\t" \
501 + "lsl %[l2], %[line_size], #1 \n\t" \
502 + "vld1.64 {d0}, [%[p0]], %[l2] \n\t" \
503 + "vld1.64 {d1}, [%[p1]], %[l2] \n\t" \
505 + "subs %[h], %[h], #2 \n\t" \
506 + vhadd".u8 d4, d0, d1 \n\t" \
507 + "vst1.64 {d4}, [%[b],:64], %[line_size] \n\t" \
508 + "vld1.64 {d0}, [%[p0]], %[l2] \n\t" \
509 + vhadd".u8 d4, d0, d1 \n\t" \
510 + "vst1.64 {d4}, [%[b],:64], %[line_size] \n\t" \
511 + "vld1.64 {d1}, [%[p1]], %[l2] \n\t" \
514 +#define PUT_PIXELS8_XY2(vshrn, no_rnd) \
515 + "lsl %[l2], %[line_size], #1 \n\t" \
516 + "add %[p1], %[p0], %[line_size] \n\t" \
517 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
518 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
519 + "pld [%[p0]] \n\t" \
520 + "pld [%[p1]] \n\t" \
521 + "vext.8 d4, d0, d1, #1 \n\t" \
522 + "vext.8 d6, d2, d3, #1 \n\t" \
523 + "vaddl.u8 q8, d0, d4 \n\t" \
524 + "vaddl.u8 q9, d2, d6 \n\t" \
526 + "subs %[h], %[h], #2 \n\t" \
527 + "vld1.64 {d0,d1}, [%[p0]], %[l2] \n\t" \
528 + "pld [%[p0]] \n\t" \
529 + "vadd.u16 q10, q8, q9 \n\t" \
530 + "vext.8 d4, d0, d1, #1 \n\t" \
531 + no_rnd "vadd.u16 q10, q10, q11 \n\t" \
532 + "vaddl.u8 q8, d0, d4 \n\t" \
533 + vshrn".u16 d5, q10, #2 \n\t" \
534 + "vld1.64 {d2,d3}, [%[p1]], %[l2] \n\t" \
535 + "vadd.u16 q10, q8, q9 \n\t" \
536 + "pld [%[p1]] \n\t" \
537 + no_rnd "vadd.u16 q10, q10, q11 \n\t" \
538 + "vst1.64 {d5}, [%[b],:64], %[line_size] \n\t" \
539 + vshrn".u16 d7, q10, #2 \n\t" \
540 + "vext.8 d6, d2, d3, #1 \n\t" \
541 + "vaddl.u8 q9, d2, d6 \n\t" \
542 + "vst1.64 {d7}, [%[b],:64], %[line_size] \n\t" \
545 +static void put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
546 + int line_size, int h)
550 + "vld1.64 {d0,d1}, [%[pixels]], %[line_size] \n\t"
551 + "vld1.64 {d2,d3}, [%[pixels]], %[line_size] \n\t"
552 + "vld1.64 {d4,d5}, [%[pixels]], %[line_size] \n\t"
553 + "vld1.64 {d6,d7}, [%[pixels]], %[line_size] \n\t"
554 + "pld [%[pixels]] \n\t"
555 + "subs %[h], %[h], #4 \n\t"
556 + "vst1.64 {d0,d1}, [%[block],:128], %[line_size] \n\t"
557 + "vst1.64 {d2,d3}, [%[block],:128], %[line_size] \n\t"
558 + "vst1.64 {d4,d5}, [%[block],:128], %[line_size] \n\t"
559 + "vst1.64 {d6,d7}, [%[block],:128], %[line_size] \n\t"
561 + : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)
562 + : [line_size]"r"(line_size)
563 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "memory");
566 +static void put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
567 + int line_size, int h)
570 + PUT_PIXELS_16_X2("vrhadd")
571 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
572 + : [line_size]"r"(line_size)
573 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "memory");
576 +static void put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
577 + int line_size, int h)
583 + PUT_PIXELS_16_Y2("vrhadd")
584 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
586 + : [line_size]"r"(line_size)
587 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory");
590 +static void put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
591 + int line_size, int h)
597 + PUT_PIXELS_16_XY2("vrshrn", "@")
600 + [p1]"=&r"(p1), [h]"+r"(h),
602 + : [line_size]"r"(line_size)
603 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
604 + "d28", "d29", "d30", "d31",
605 + "q8", "q9", "q10", "q11", "q12", "memory");
608 +static void put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
609 + int line_size, int h)
613 + "vld1.64 {d0}, [%[p]], %[line_size] \n\t"
614 + "vld1.64 {d1}, [%[p]], %[line_size] \n\t"
615 + "vld1.64 {d2}, [%[p]], %[line_size] \n\t"
616 + "vld1.64 {d3}, [%[p]], %[line_size] \n\t"
617 + "subs %[h], %[h], #4 \n\t"
618 + "vst1.64 {d0}, [%[b],:64], %[line_size] \n\t"
619 + "vst1.64 {d1}, [%[b],:64], %[line_size] \n\t"
620 + "vst1.64 {d2}, [%[b],:64], %[line_size] \n\t"
621 + "vst1.64 {d3}, [%[b],:64], %[line_size] \n\t"
623 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
624 + : [line_size]"r"(line_size)
625 + : "d0", "d1", "d2", "d3", "memory");
628 +static void put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
629 + int line_size, int h)
632 + PUT_PIXELS_8_X2("vrhadd")
633 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
634 + : [line_size]"r"(line_size)
635 + : "d0", "d1", "d2", "d3", "memory");
638 +static void put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
639 + int line_size, int h)
645 + PUT_PIXELS_8_Y2("vrhadd")
646 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
648 + : [line_size]"r"(line_size)
649 + : "d0", "d1", "d4", "memory");
652 +static void put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
653 + int line_size, int h)
659 + PUT_PIXELS8_XY2("vrshrn", "@")
662 + [p1]"=&r"(p1), [h]"+r"(h),
664 + : [line_size]"r"(line_size)
665 + : "d0", "d1", "d2", "d3", "d4", "d6", "d7",
666 + "q8", "q9", "q10", "memory");
669 +static void put_no_rnd_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
670 + int line_size, int h)
673 + PUT_PIXELS_16_X2("vhadd")
674 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
675 + : [line_size]"r"(line_size)
676 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "memory");
679 +static void put_no_rnd_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
680 + int line_size, int h)
686 + PUT_PIXELS_16_Y2("vhadd")
687 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
689 + : [line_size]"r"(line_size)
690 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory");
693 +static void put_no_rnd_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
694 + int line_size, int h)
700 + "vmov.i16 q13, #1 \n\t"
701 + PUT_PIXELS_16_XY2("vshrn", "")
704 + [p1]"=&r"(p1), [h]"+r"(h),
706 + : [line_size]"r"(line_size)
707 + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
708 + "d28", "d29", "d30", "d31",
709 + "q8", "q9", "q10", "q11", "q12", "q13", "memory");
712 +static void put_no_rnd_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
713 + int line_size, int h)
716 + PUT_PIXELS_8_X2("vhadd")
717 + : [b]"+r"(block), [p]"+r"(pixels), [h]"+r"(h)
718 + : [line_size]"r"(line_size)
719 + : "d0", "d1", "d2", "d3", "memory");
722 +static void put_no_rnd_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
723 + int line_size, int h)
729 + PUT_PIXELS_8_Y2("vhadd")
730 + : [b]"+r"(block), [p0]"+r"(pixels), [p1]"=&r"(p1), [h]"+r"(h),
732 + : [line_size]"r"(line_size)
733 + : "d0", "d1", "d4", "memory");
736 +static void put_no_rnd_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
737 + int line_size, int h)
743 + "vmov.i16 q11, #1 \n\t"
744 + PUT_PIXELS8_XY2("vshrn", "")
747 + [p1]"=&r"(p1), [h]"+r"(h),
749 + : [line_size]"r"(line_size)
750 + : "d0", "d1", "d2", "d3", "d4", "d6", "d7",
751 + "q8", "q9", "q10", "q11", "memory");
754 +static void put_h264_qpel16_mc00_neon(uint8_t *dst, uint8_t *src, int stride)
756 + put_pixels16_neon(dst, src, stride, 16);
759 +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
761 + c->put_pixels_tab[0][0] = put_pixels16_neon;
762 + c->put_pixels_tab[0][1] = put_pixels16_x2_neon;
763 + c->put_pixels_tab[0][2] = put_pixels16_y2_neon;
764 + c->put_pixels_tab[0][3] = put_pixels16_xy2_neon;
765 + c->put_pixels_tab[1][0] = put_pixels8_neon;
766 + c->put_pixels_tab[1][1] = put_pixels8_x2_neon;
767 + c->put_pixels_tab[1][2] = put_pixels8_y2_neon;
768 + c->put_pixels_tab[1][3] = put_pixels8_xy2_neon;
770 + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_neon;
771 + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_neon;
772 + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_neon;
773 + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_neon;
774 + c->put_no_rnd_pixels_tab[1][0] = put_pixels8_neon;
775 + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_neon;
776 + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_neon;
777 + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_neon;
779 + c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
781 + c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_neon;
783 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c mythtv/libs/libavcodec/armv4l/float_arm_vfp.c
784 --- mythtv.orig/libs/libavcodec/armv4l/float_arm_vfp.c 1970-01-01 01:00:00.000000000 +0100
785 +++ mythtv/libs/libavcodec/armv4l/float_arm_vfp.c 2008-07-24 19:54:01.023198000 +0200
788 + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
790 + * This file is part of FFmpeg.
792 + * FFmpeg is free software; you can redistribute it and/or
793 + * modify it under the terms of the GNU Lesser General Public
794 + * License as published by the Free Software Foundation; either
795 + * version 2.1 of the License, or (at your option) any later version.
797 + * FFmpeg is distributed in the hope that it will be useful,
798 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
799 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
800 + * Lesser General Public License for more details.
802 + * You should have received a copy of the GNU Lesser General Public
803 + * License along with FFmpeg; if not, write to the Free Software
804 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
807 +#include "libavcodec/dsputil.h"
810 + * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
811 + * throughput for almost all the instructions (except for double precision
812 + * arithmetics), but rather high latency. Latency is 4 cycles for loads and 8 cycles
813 + * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
814 + * important for performance. One more interesting feature is that VFP has
815 + * independent load/store and arithmetics pipelines, so it is possible to make
816 + * them work simultaneously and get more than 1 operation per cycle. Load/store
817 + * pipeline can process 2 single precision floating point values per cycle and
818 + * supports bulk loads and stores for large sets of registers. Arithmetic operations
819 + * can be done on vectors, which allows to keep the arithmetics pipeline busy,
820 + * while the processor may issue and execute other instructions. Detailed
821 + * optimization manuals can be found at http://www.arm.com
825 + * ARM VFP optimized implementation of 'vector_fmul_c' function.
826 + * Assume that len is a positive number and is multiple of 8
828 +static void vector_fmul_vfp(float *dst, const float *src, int len)
832 + "fmrx %[tmp], fpscr\n\t"
833 + "orr %[tmp], %[tmp], #(3 << 16)\n\t" /* set vector size to 4 */
834 + "fmxr fpscr, %[tmp]\n\t"
836 + "fldmias %[dst_r]!, {s0-s3}\n\t"
837 + "fldmias %[src]!, {s8-s11}\n\t"
838 + "fldmias %[dst_r]!, {s4-s7}\n\t"
839 + "fldmias %[src]!, {s12-s15}\n\t"
840 + "fmuls s8, s0, s8\n\t"
842 + "subs %[len], %[len], #16\n\t"
843 + "fmuls s12, s4, s12\n\t"
844 + "fldmiasge %[dst_r]!, {s16-s19}\n\t"
845 + "fldmiasge %[src]!, {s24-s27}\n\t"
846 + "fldmiasge %[dst_r]!, {s20-s23}\n\t"
847 + "fldmiasge %[src]!, {s28-s31}\n\t"
848 + "fmulsge s24, s16, s24\n\t"
849 + "fstmias %[dst_w]!, {s8-s11}\n\t"
850 + "fstmias %[dst_w]!, {s12-s15}\n\t"
851 + "fmulsge s28, s20, s28\n\t"
852 + "fldmiasgt %[dst_r]!, {s0-s3}\n\t"
853 + "fldmiasgt %[src]!, {s8-s11}\n\t"
854 + "fldmiasgt %[dst_r]!, {s4-s7}\n\t"
855 + "fldmiasgt %[src]!, {s12-s15}\n\t"
856 + "fmulsge s8, s0, s8\n\t"
857 + "fstmiasge %[dst_w]!, {s24-s27}\n\t"
858 + "fstmiasge %[dst_w]!, {s28-s31}\n\t"
861 + "bic %[tmp], %[tmp], #(7 << 16)\n\t" /* set vector size back to 1 */
862 + "fmxr fpscr, %[tmp]\n\t"
863 + : [dst_w] "+&r" (dst), [dst_r] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len), [tmp] "=&r" (tmp)
865 + : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
866 + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
867 + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
868 + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
873 + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
874 + * Assume that len is a positive number and is multiple of 8
876 +static void vector_fmul_reverse_vfp(float *dst, const float *src0, const float *src1, int len)
880 + "fldmdbs %[src1]!, {s0-s3}\n\t"
881 + "fldmias %[src0]!, {s8-s11}\n\t"
882 + "fldmdbs %[src1]!, {s4-s7}\n\t"
883 + "fldmias %[src0]!, {s12-s15}\n\t"
884 + "fmuls s8, s3, s8\n\t"
885 + "fmuls s9, s2, s9\n\t"
886 + "fmuls s10, s1, s10\n\t"
887 + "fmuls s11, s0, s11\n\t"
889 + "subs %[len], %[len], #16\n\t"
890 + "fldmdbsge %[src1]!, {s16-s19}\n\t"
891 + "fmuls s12, s7, s12\n\t"
892 + "fldmiasge %[src0]!, {s24-s27}\n\t"
893 + "fmuls s13, s6, s13\n\t"
894 + "fldmdbsge %[src1]!, {s20-s23}\n\t"
895 + "fmuls s14, s5, s14\n\t"
896 + "fldmiasge %[src0]!, {s28-s31}\n\t"
897 + "fmuls s15, s4, s15\n\t"
898 + "fmulsge s24, s19, s24\n\t"
899 + "fldmdbsgt %[src1]!, {s0-s3}\n\t"
900 + "fmulsge s25, s18, s25\n\t"
901 + "fstmias %[dst]!, {s8-s13}\n\t"
902 + "fmulsge s26, s17, s26\n\t"
903 + "fldmiasgt %[src0]!, {s8-s11}\n\t"
904 + "fmulsge s27, s16, s27\n\t"
905 + "fmulsge s28, s23, s28\n\t"
906 + "fldmdbsgt %[src1]!, {s4-s7}\n\t"
907 + "fmulsge s29, s22, s29\n\t"
908 + "fstmias %[dst]!, {s14-s15}\n\t"
909 + "fmulsge s30, s21, s30\n\t"
910 + "fmulsge s31, s20, s31\n\t"
911 + "fmulsge s8, s3, s8\n\t"
912 + "fldmiasgt %[src0]!, {s12-s15}\n\t"
913 + "fmulsge s9, s2, s9\n\t"
914 + "fmulsge s10, s1, s10\n\t"
915 + "fstmiasge %[dst]!, {s24-s27}\n\t"
916 + "fmulsge s11, s0, s11\n\t"
917 + "fstmiasge %[dst]!, {s28-s31}\n\t"
920 + : [dst] "+&r" (dst), [src0] "+&r" (src0), [src1] "+&r" (src1), [len] "+&r" (len)
922 + : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
923 + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
924 + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
925 + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
931 + * ARM VFP optimized float to int16 conversion.
932 + * Assume that len is a positive number and is multiple of 8, destination
933 + * buffer is at least 4 bytes aligned (8 bytes alignment is better for
934 + * performance), little endian byte sex
936 +void float_to_int16_vfp(int16_t *dst, const float *src, int len)
939 + "fldmias %[src]!, {s16-s23}\n\t"
940 + "ftosis s0, s16\n\t"
941 + "ftosis s1, s17\n\t"
942 + "ftosis s2, s18\n\t"
943 + "ftosis s3, s19\n\t"
944 + "ftosis s4, s20\n\t"
945 + "ftosis s5, s21\n\t"
946 + "ftosis s6, s22\n\t"
947 + "ftosis s7, s23\n\t"
949 + "subs %[len], %[len], #8\n\t"
950 + "fmrrs r3, r4, {s0, s1}\n\t"
951 + "fmrrs r5, r6, {s2, s3}\n\t"
952 + "fmrrs r7, r8, {s4, s5}\n\t"
953 + "fmrrs ip, lr, {s6, s7}\n\t"
954 + "fldmiasgt %[src]!, {s16-s23}\n\t"
955 + "ssat r4, #16, r4\n\t"
956 + "ssat r3, #16, r3\n\t"
957 + "ssat r6, #16, r6\n\t"
958 + "ssat r5, #16, r5\n\t"
959 + "pkhbt r3, r3, r4, lsl #16\n\t"
960 + "pkhbt r4, r5, r6, lsl #16\n\t"
961 + "ftosisgt s0, s16\n\t"
962 + "ftosisgt s1, s17\n\t"
963 + "ftosisgt s2, s18\n\t"
964 + "ftosisgt s3, s19\n\t"
965 + "ftosisgt s4, s20\n\t"
966 + "ftosisgt s5, s21\n\t"
967 + "ftosisgt s6, s22\n\t"
968 + "ftosisgt s7, s23\n\t"
969 + "ssat r8, #16, r8\n\t"
970 + "ssat r7, #16, r7\n\t"
971 + "ssat lr, #16, lr\n\t"
972 + "ssat ip, #16, ip\n\t"
973 + "pkhbt r5, r7, r8, lsl #16\n\t"
974 + "pkhbt r6, ip, lr, lsl #16\n\t"
975 + "stmia %[dst]!, {r3-r6}\n\t"
978 + : [dst] "+&r" (dst), [src] "+&r" (src), [len] "+&r" (len)
980 + : "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
981 + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
982 + "r3", "r4", "r5", "r6", "r7", "r8", "ip", "lr",
987 +void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx)
989 + c->vector_fmul = vector_fmul_vfp;
990 + c->vector_fmul_reverse = vector_fmul_reverse_vfp;
992 + c->float_to_int16 = float_to_int16_vfp;
995 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S mythtv/libs/libavcodec/armv4l/h264dsp_neon.S
996 --- mythtv.orig/libs/libavcodec/armv4l/h264dsp_neon.S 1970-01-01 01:00:00.000000000 +0100
997 +++ mythtv/libs/libavcodec/armv4l/h264dsp_neon.S 2008-07-24 19:54:01.033198000 +0200
1000 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
1002 + * This file is part of FFmpeg.
1004 + * FFmpeg is free software; you can redistribute it and/or
1005 + * modify it under the terms of the GNU Lesser General Public
1006 + * License as published by the Free Software Foundation; either
1007 + * version 2.1 of the License, or (at your option) any later version.
1009 + * FFmpeg is distributed in the hope that it will be useful,
1010 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1011 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1012 + * Lesser General Public License for more details.
1014 + * You should have received a copy of the GNU Lesser General Public
1015 + * License along with FFmpeg; if not, write to the Free Software
1016 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1023 + .global ff_put_h264_chroma_mc8_neon
1024 + .func ff_put_h264_chroma_mc8_neon
1025 +/* void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride,
1026 + int h, int x, int y) */
1027 +ff_put_h264_chroma_mc8_neon:
1029 + ldrd r4, [sp, #16]
1035 + rsb r6, r7, r5, lsl #3
1036 + rsb ip, r7, r4, lsl #3
1037 + sub r4, r7, r4, lsl #3
1038 + sub r4, r4, r5, lsl #3
1048 + vld1.64 {d4,d5}, [r1], r4
1050 + vld1.64 {d6,d7}, [r5], r4
1054 + vext.8 d5, d4, d5, #1
1056 + vext.8 d7, d6, d7, #1
1059 + vmull.u8 q8, d4, d0
1060 + vmlal.u8 q8, d5, d1
1061 + vld1.64 {d4,d5}, [r1], r4
1062 + vmlal.u8 q8, d6, d2
1063 + vext.8 d5, d4, d5, #1
1064 + vmlal.u8 q8, d7, d3
1065 + vmull.u8 q9, d6, d0
1066 + vadd.i16 q8, q8, q12
1068 + vmlal.u8 q9, d7, d1
1069 + vshrn.u16 d16, q8, #6
1070 + vld1.64 {d6,d7}, [r5], r4
1071 + vmlal.u8 q9, d4, d2
1072 + vmlal.u8 q9, d5, d3
1074 + vadd.i16 q9, q9, q12
1075 + vst1.64 {d16}, [r0,:64], r2
1076 + vshrn.u16 d17, q9, #6
1077 + vext.8 d7, d6, d7, #1
1078 + vst1.64 {d17}, [r0,:64], r2
1096 + vld1.64 {d4}, [r1], r4
1097 + vld1.64 {d6}, [r5], r4
1100 + vmull.u8 q8, d4, d0
1101 + vmlal.u8 q8, d6, d1
1102 + vld1.64 {d4}, [r1], r4
1103 + vmull.u8 q9, d6, d0
1104 + vadd.i16 q8, q8, q12
1105 + vmlal.u8 q9, d4, d1
1106 + vshrn.u16 d16, q8, #6
1107 + vadd.i16 q9, q9, q12
1108 + vst1.64 {d16}, [r0,:64], r2
1109 + vshrn.u16 d17, q9, #6
1111 + vld1.64 {d6}, [r5], r4
1113 + vst1.64 {d17}, [r0,:64], r2
1120 + vld1.64 {d4,d5}, [r1], r2
1121 + vld1.64 {d6,d7}, [r1], r2
1122 + vext.8 d5, d4, d5, #1
1123 + vext.8 d7, d6, d7, #1
1127 + vmull.u8 q8, d4, d0
1128 + vmlal.u8 q8, d5, d1
1129 + vld1.64 {d4,d5}, [r1], r2
1130 + vmull.u8 q9, d6, d0
1131 + vmlal.u8 q9, d7, d1
1133 + vadd.i16 q8, q8, q12
1134 + vadd.i16 q9, q9, q12
1135 + vext.8 d5, d4, d5, #1
1136 + vshrn.u16 d16, q8, #6
1137 + vld1.64 {d6,d7}, [r1], r2
1138 + vshrn.u16 d17, q9, #6
1139 + vst1.64 {d16}, [r0,:64], r2
1140 + vext.8 d7, d6, d7, #1
1141 + vst1.64 {d17}, [r0,:64], r2
1147 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c
1148 --- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_arm.c 2008-07-23 12:19:05.000000000 +0200
1149 +++ mythtv/libs/libavcodec/armv4l/mpegvideo_arm.c 2008-07-24 19:54:01.263198000 +0200
1151 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1154 -#include "dsputil.h"
1155 -#include "mpegvideo.h"
1156 -#include "avcodec.h"
1157 +#include "libavcodec/avcodec.h"
1158 +#include "libavcodec/dsputil.h"
1159 +#include "libavcodec/mpegvideo.h"
1161 extern void MPV_common_init_iwmmxt(MpegEncContext *s);
1162 extern void MPV_common_init_armv5te(MpegEncContext *s);
1164 void MPV_common_init_armv4l(MpegEncContext *s)
1166 /* IWMMXT support is a superset of armv5te, so
1167 - * allow optimised functions for armv5te unless
1168 + * allow optimized functions for armv5te unless
1169 * a better iwmmxt function exists
1172 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c
1173 --- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_armv5te.c 2008-07-23 12:19:05.000000000 +0200
1174 +++ mythtv/libs/libavcodec/armv4l/mpegvideo_armv5te.c 2008-07-24 19:54:01.263198000 +0200
1176 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1179 -#include "dsputil.h"
1180 -#include "mpegvideo.h"
1181 -#include "avcodec.h"
1182 +#include "libavcodec/avcodec.h"
1183 +#include "libavcodec/dsputil.h"
1184 +#include "libavcodec/mpegvideo.h"
1187 #ifdef ENABLE_ARM_TESTS
1189 ({ DCTELEM *xblock = xxblock; \
1190 int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \
1191 int xdata1, xdata2; \
1192 -__asm__ __volatile__( \
1194 "subs %[count], %[count], #2 \n\t" \
1196 "ldrd r4, [%[block], #0] \n\t" \
1197 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c
1198 --- mythtv.orig/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c 2008-07-23 12:19:05.000000000 +0200
1199 +++ mythtv/libs/libavcodec/armv4l/mpegvideo_iwmmxt.c 2008-07-24 19:54:01.273198000 +0200
1201 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1204 -#include "dsputil.h"
1205 -#include "mpegvideo.h"
1206 -#include "avcodec.h"
1207 +#include "libavcodec/avcodec.h"
1208 +#include "libavcodec/dsputil.h"
1209 +#include "libavcodec/mpegvideo.h"
1211 static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
1212 DCTELEM *block, int n, int qscale)
1215 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
1217 - __asm__ __volatile__ (
1219 /* "movd %1, %%mm6 \n\t" //qmul */
1220 /* "packssdw %%mm6, %%mm6 \n\t" */
1221 /* "packssdw %%mm6, %%mm6 \n\t" */
1222 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S mythtv/libs/libavcodec/armv4l/simple_idct_arm.S
1223 --- mythtv.orig/libs/libavcodec/armv4l/simple_idct_arm.S 2008-07-23 12:19:05.000000000 +0200
1224 +++ mythtv/libs/libavcodec/armv4l/simple_idct_arm.S 2008-07-24 19:54:01.503198000 +0200
1229 - @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
1230 + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
1231 ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
1232 ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1]
1233 ldr r3, [r14, #8] @ R3=ROWr32[2]
1235 @@ col[40] = ((a2 - b2) >> COL_SHIFT);
1236 @@ col[48] = ((a1 - b1) >> COL_SHIFT);
1237 @@ col[56] = ((a0 - b0) >> COL_SHIFT);
1238 - @@@@@ no optimisation here @@@@@
1239 + @@@@@ no optimization here @@@@@
1240 add r8, r6, r0 @ R8=a0+b0
1241 add r9, r2, r1 @ R9=a1+b1
1242 mov r8, r8, asr #COL_SHIFT
1243 diff -Nurd mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S mythtv/libs/libavcodec/armv4l/simple_idct_neon.S
1244 --- mythtv.orig/libs/libavcodec/armv4l/simple_idct_neon.S 1970-01-01 01:00:00.000000000 +0100
1245 +++ mythtv/libs/libavcodec/armv4l/simple_idct_neon.S 2008-07-24 19:54:01.503198000 +0200
1250 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
1252 + * Based on Simple IDCT
1253 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
1255 + * This file is part of FFmpeg.
1257 + * FFmpeg is free software; you can redistribute it and/or
1258 + * modify it under the terms of the GNU Lesser General Public
1259 + * License as published by the Free Software Foundation; either
1260 + * version 2.1 of the License, or (at your option) any later version.
1262 + * FFmpeg is distributed in the hope that it will be useful,
1263 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1264 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1265 + * Lesser General Public License for more details.
1267 + * You should have received a copy of the GNU Lesser General Public
1268 + * License along with FFmpeg; if not, write to the Free Software
1269 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1272 +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1273 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1274 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1275 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1276 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1277 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1278 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
1279 +#define W4c ((1<<(COL_SHIFT-1))/W4)
1280 +#define ROW_SHIFT 11
1281 +#define COL_SHIFT 20
1294 + .macro idct_col4_top
 1295 + vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
 1296 + vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
1297 + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
1298 + vadd.i32 q11, q15, q7
1299 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
1300 + vadd.i32 q12, q15, q8
1301 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
1302 + vsub.i32 q13, q15, q8
1303 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
1304 + vsub.i32 q14, q15, q7
1306 + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
1307 + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
1308 + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
1309 + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
1312 + .macro idct_col4_mid1
1313 + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
1314 + vadd.i32 q11, q11, q7
1315 + vsub.i32 q12, q12, q7
1316 + vsub.i32 q13, q13, q7
1317 + vadd.i32 q14, q14, q7
1320 + .macro idct_col4_mid2
1321 + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
1322 + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
1323 + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
1324 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
1327 + .macro idct_col4_mid3
1328 + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
1329 + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
1330 + vadd.i32 q11, q11, q7
1331 + vsub.i32 q12, q12, q8
1332 + vadd.i32 q13, q13, q8
1333 + vsub.i32 q14, q14, q7
1336 + .macro idct_col4_mid4
1337 + vmlal.s16 q9, d9, w7
1338 + vmlsl.s16 q10, d9, w5
1339 + vmlal.s16 q5, d9, w3
1340 + vmlsl.s16 q6, d9, w1
1343 + .macro idct_col4_mid
1344 + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
1345 + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
1346 + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
1347 + vadd.i32 q11, q11, q7
1348 + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
1349 + vsub.i32 q12, q12, q7
1350 + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
1351 + vsub.i32 q13, q13, q7
1352 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
1353 + vadd.i32 q14, q14, q7
1354 + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
1355 + vadd.i32 q11, q11, q7
1356 + vmlal.s16 q9, d9, w7
1357 + vsub.i32 q12, q12, q8
1358 + vmlsl.s16 q10, d9, w5
1359 + vadd.i32 q13, q13, q8
1360 + vmlal.s16 q5, d9, w3
1361 + vsub.i32 q14, q14, q7
1362 + vmlsl.s16 q6, d9, w1
1365 + .macro idct_col4_end
1366 + vadd.i32 q3, q11, q9
1367 + vadd.i32 q4, q12, q10
1368 + vadd.i32 q7, q13, q5
1369 + vadd.i32 q8, q14, q6
1370 + vsub.i32 q11, q11, q9
1371 + vsub.i32 q12, q12, q10
1372 + vsub.i32 q13, q13, q5
1373 + vsub.i32 q14, q14, q6
1378 + .type idct_row4_neon, %function
1379 + .func idct_row4_neon
1381 + vld1.64 {d2,d3}, [a3,:128]!
1382 + vld1.64 {d4,d5}, [a3,:128]!
1383 + vld1.64 {d6,d7}, [a3,:128]!
1384 + vld1.64 {d8,d9}, [a3,:128]!
1387 + vmov.i32 q15, #(1<<(ROW_SHIFT-1))
1392 + vorr d10, d10, d11
1394 + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
1404 + vadd.i32 q3, q11, q9
1405 + vadd.i32 q4, q12, q10
1406 + vshrn.i32 d2, q3, #ROW_SHIFT
1407 + vadd.i32 q7, q13, q5
1408 + vshrn.i32 d4, q4, #ROW_SHIFT
1409 + vadd.i32 q8, q14, q6
1410 + vshrn.i32 d6, q7, #ROW_SHIFT
1411 + vsub.i32 q11, q11, q9
1412 + vshrn.i32 d8, q8, #ROW_SHIFT
1413 + vsub.i32 q12, q12, q10
1414 + vshrn.i32 d9, q11, #ROW_SHIFT
1415 + vsub.i32 q13, q13, q5
1416 + vshrn.i32 d7, q12, #ROW_SHIFT
1417 + vsub.i32 q14, q14, q6
1418 + vshrn.i32 d5, q13, #ROW_SHIFT
1419 + vshrn.i32 d3, q14, #ROW_SHIFT
1426 + vst1.64 {d2,d3}, [a3,:128]!
1427 + vst1.64 {d4,d5}, [a3,:128]!
1428 + vst1.64 {d6,d7}, [a3,:128]!
1429 + vst1.64 {d8,d9}, [a3,:128]!
1435 + .type idct_col4_neon, %function
1436 + .func idct_col4_neon
 1439 + vld1.64 {d2}, [a3,:64], ip /* d2 = col[0] */
 1440 + vld1.64 {d4}, [a3,:64], ip /* d4 = col[1] */
 1441 + vld1.64 {d6}, [a3,:64], ip /* d6 = col[2] */
 1442 + vld1.64 {d8}, [a3,:64], ip /* d8 = col[3] */
 1443 + vld1.64 {d3}, [a3,:64], ip /* d3 = col[4] */
 1444 + vld1.64 {d5}, [a3,:64], ip /* d5 = col[5] */
 1445 + vld1.64 {d7}, [a3,:64], ip /* d7 = col[6] */
 1446 + vld1.64 {d9}, [a3,:64], ip /* d9 = col[7] */
1455 + vmov.32 v1, d11[0]
1456 + vmov.32 v2, d13[0]
1458 + vmov.32 v3, d15[0]
1459 + vmov.32 ip, d17[0]
1461 + vadd.i16 d30, d30, d2
1462 + vmull.s16 q15, d30, w4 /* q15 = W4 * (col[0]+(1<<(COL_SHIFT-1))/W4) */
1480 + vshr.s32 q2, q3, #COL_SHIFT
1481 + vshr.s32 q3, q4, #COL_SHIFT
1483 + vshr.s32 q4, q7, #COL_SHIFT
1485 + vshr.s32 q5, q8, #COL_SHIFT
1487 + vshr.s32 q6, q14, #COL_SHIFT
1489 + vshr.s32 q7, q13, #COL_SHIFT
1491 + vshr.s32 q8, q12, #COL_SHIFT
1493 + vshr.s32 q9, q11, #COL_SHIFT
1500 + .macro idct_col4_st16
1502 + vst1.64 {d2}, [a3,:64], ip
1503 + vst1.64 {d3}, [a3,:64], ip
1504 + vst1.64 {d4}, [a3,:64], ip
1505 + vst1.64 {d5}, [a3,:64], ip
1506 + vst1.64 {d6}, [a3,:64], ip
1507 + vst1.64 {d7}, [a3,:64], ip
1508 + vst1.64 {d8}, [a3,:64], ip
1509 + vst1.64 {d9}, [a3,:64], ip
1513 + .type idct_col4_add8, %function
1514 + .func idct_col4_add8
1516 + vld1.32 {d10[0]}, [a1,:32], a2
1517 + vld1.32 {d10[1]}, [a1,:32], a2
1518 + vld1.32 {d11[0]}, [a1,:32], a2
1519 + vld1.32 {d11[1]}, [a1,:32], a2
1520 + vld1.32 {d12[0]}, [a1,:32], a2
1521 + vld1.32 {d12[1]}, [a1,:32], a2
1522 + vld1.32 {d13[0]}, [a1,:32], a2
1523 + vld1.32 {d13[1]}, [a1,:32], a2
1525 + vaddw.u8 q1, q1, d10
1526 + vaddw.u8 q2, q2, d11
1527 + vaddw.u8 q3, q3, d12
1528 + vaddw.u8 q4, q4, d13
1530 + sub a1, a1, a2, lsl #3
1533 + .type idct_col4_st8, %function
1534 + .func idct_col4_st8
1536 + vqmovun.s16 d2, q1
1537 + vqmovun.s16 d3, q2
1538 + vqmovun.s16 d4, q3
1539 + vqmovun.s16 d5, q4
1541 + vst1.32 {d2[0]}, [a1,:32], a2
1542 + vst1.32 {d2[1]}, [a1,:32], a2
1543 + vst1.32 {d3[0]}, [a1,:32], a2
1544 + vst1.32 {d3[1]}, [a1,:32], a2
1545 + vst1.32 {d4[0]}, [a1,:32], a2
1546 + vst1.32 {d4[1]}, [a1,:32], a2
1547 + vst1.32 {d5[0]}, [a1,:32], a2
1548 + vst1.32 {d5[1]}, [a1,:32], a2
1553 +const: .short W1, W2, W3, W4, W5, W6, W7, W4c
1555 + .macro idct_start data
1561 + vld1.64 {d0,d1}, [a4,:128]
1570 + .global ff_simple_idct_neon
1571 + .type ff_simple_idct_neon, %function
1572 + .func ff_simple_idct_neon
1573 +/* void ff_simple_idct_neon(DCTELEM *data); */
1574 +ff_simple_idct_neon:
1593 + .global ff_simple_idct_put_neon
1594 + .type ff_simple_idct_put_neon, %function
1595 + .func ff_simple_idct_put_neon
1596 +/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */
1597 +ff_simple_idct_put_neon:
1605 + sub a1, a1, a2, lsl #3
1615 + .global ff_simple_idct_add_neon
1616 + .type ff_simple_idct_add_neon, %function
1617 + .func ff_simple_idct_add_neon
1618 +/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */
1619 +ff_simple_idct_add_neon:
1627 + sub a1, a1, a2, lsl #3
1635 diff -Nurd mythtv.orig/libs/libavcodec/avcodec.h mythtv/libs/libavcodec/avcodec.h
1636 --- mythtv.orig/libs/libavcodec/avcodec.h 2008-07-23 12:19:11.000000000 +0200
1637 +++ mythtv/libs/libavcodec/avcodec.h 2008-07-24 19:56:46.953198000 +0200
1638 @@ -1328,6 +1328,8 @@
1639 #define FF_IDCT_SIMPLEARMV6 17
1640 #define FF_IDCT_SIMPLEVIS 18
1641 #define FF_IDCT_WMV2 19
1642 +#define FF_IDCT_FAAN 20
1643 +#define FF_IDCT_SIMPLENEON 21
1647 diff -Nurd mythtv.orig/libs/libavcodec/libavcodec.pro mythtv/libs/libavcodec/libavcodec.pro
1648 --- mythtv.orig/libs/libavcodec/libavcodec.pro 2008-07-23 12:19:10.000000000 +0200
1649 +++ mythtv/libs/libavcodec/libavcodec.pro 2008-07-24 19:54:01.503198000 +0200
1652 contains( HAVE_ARMV6, yes ) { SOURCES += armv4l/simple_idct_armv6.S }
1654 +contains( HAVE_NEON, yes ) { SOURCES += armv4l/float_arm_vfp.c armv4l/simple_idct_neon.S armv4l/dsputil_neon.c armv4l/h264dsp_neon.S }
1656 contains( HAVE_VIS, yes ) {
1657 SOURCES += sparc/dsputil_vis.c
1658 SOURCES += sparc/simple_idct_vis.c
1659 diff -Nurd mythtv.orig/libs/libavcodec/utils.c mythtv/libs/libavcodec/utils.c
1660 --- mythtv.orig/libs/libavcodec/utils.c 2008-07-23 12:19:10.000000000 +0200
1661 +++ mythtv/libs/libavcodec/utils.c 2008-07-24 19:58:12.403198000 +0200
1663 {"sh4", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SH4, INT_MIN, INT_MAX, V|E|D, "idct"},
1664 {"simplearm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARM, INT_MIN, INT_MAX, V|E|D, "idct"},
1665 {"simplearmv5te", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEARMV5TE, INT_MIN, INT_MAX, V|E|D, "idct"},
1666 +{"simpleneon", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLENEON, INT_MIN, INT_MAX, V|E|D, "idct"},
1667 {"h264", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_H264, INT_MIN, INT_MAX, V|E|D, "idct"},
1668 {"vp3", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_VP3, INT_MIN, INT_MAX, V|E|D, "idct"},
1669 {"ipp", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_IPP, INT_MIN, INT_MAX, V|E|D, "idct"},