initial import
[vuplus_webkit] / Source / WebCore / platform / graphics / filters / arm / FEGaussianBlurNEON.cpp
1 /*
2  * Copyright (C) 2011 University of Szeged
3  * Copyright (C) 2011 Zoltan Herczeg
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26
27 #include "config.h"
28 #include "FEGaussianBlurNEON.h"
29
30 #if CPU(ARM_NEON) && COMPILER(GCC)
31
32 #include <wtf/Alignment.h>
33
34 namespace WebCore {
35
// Byte-index tables (24 bytes in total) intended for NEON vtbl.8 lookups.
// Declared through WTF_ALIGNED with an argument of 16 (presumably a 16-byte
// alignment request; the macro comes from wtf/Alignment.h, which is not shown here).
// The value 16 is used as an index that lies outside an 8- or 16-byte table, for
// which vtbl.8 produces a zero byte.
// NOTE(review): the assembly in this file reads its tables through a pointer taken
// from the parameter block, not through this symbol directly - confirm that the
// caller stores feGaussianBlurConstantsForNeon() there.
36 static WTF_ALIGNED(unsigned char, s_FEGaussianBlurConstantsForNeon[], 16) = {
37     // Mapping from ARM to NEON registers.
       // First 8 bytes: source bytes 0 and 1 go to the low byte of two 32-bit lanes.
       // Next 8 bytes: source bytes 2 and 3 go to the low byte of two 32-bit lanes.
38     0, 16, 16, 16, 1,  16, 16, 16, 2,  16, 16, 16, 3,  16, 16, 16,
39     // Mapping from NEON to ARM registers.
       // Gathers bytes 0, 4, 8 and 12 (the low byte of four 32-bit lanes) into one
       // 32-bit word; the upper four result bytes are zero.
40     0, 4,  8,  12, 16, 16, 16, 16
41 };
42
// Returns the address of the first byte of s_FEGaussianBlurConstantsForNeon,
// the file-local lookup table defined above. The pointer is non-const although
// nothing in this file writes to the table.
43 unsigned char* feGaussianBlurConstantsForNeon()
44 {
45     return s_FEGaussianBlurConstantsForNeon;
46 }
47
// Two-level stringification: TOSTRING(x) macro-expands x first and then turns the
// result into a string literal, so it can be pasted into the assembler text below.
48 #define ASSTRING(str) #str
49 #define TOSTRING(value) ASSTRING(value)
50
// Byte offsets, as string literals, of the 32-bit fields of the parameter block
// that both routines read relative to r2 on entry ("ldr ..., [r2, #offset]").
// Consecutive fields are 4 bytes apart.
// NOTE(review): the structure these offsets describe is declared outside this
// file (presumably in FEGaussianBlurNEON.h); keep the two in sync.
51 #define STRIDE_OFFSET TOSTRING(0)
52 #define STRIDE_WIDTH_OFFSET TOSTRING(4)
53 #define STRIDE_LINE_OFFSET TOSTRING(8)
54 #define STRIDE_LINE_WIDTH_OFFSET TOSTRING(12)
55 #define REMAINING_STRIDES_OFFSET TOSTRING(16)
56 #define DISTANCE_LEFT_OFFSET TOSTRING(20)
57 #define DISTANCE_RIGHT_OFFSET TOSTRING(24)
58 #define INVERTED_KERNEL_SIZE_OFFSET TOSTRING(28)
59 #define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
// Newline that terminates each assembler statement inside the asm string.
60 #define NL "\n"
61
62 // Register allocation.
// Symbolic names for the ARM core registers used by both assembly routines.
// r0 and r1 hold the source and destination addresses on entry. r2 holds the
// parameter-block address on entry and is reused as LEFT_R once the parameters
// have been loaded. r9 is not used. r4-r8, r10, r11 and lr are pushed on entry
// and popped on return by the routines; r12 is used without being saved.
63 #define SOURCE_R                "r0"
64 #define DESTINATION_R           "r1"
65 #define LEFT_R                  "r2"
66 #define RIGHT_R                 "r3"
67 #define SOURCE_END_R            "r4"
68 #define DESTINATION_END_R       "r5"
69 #define STRIDE_R                "r6"
70 #define STRIDE_WIDTH_R          "r7"
71 #define STRIDE_LINE_R           "r8"
72 #define SOURCE_LINE_END_R       "r10"
73 #define DISTANCE_LEFT_R         "r11"
74 #define DISTANCE_RIGHT_R        "r12"
75 #define MAX_KERNEL_SIZE_R       "lr"
76
77 // Alternate names.
// Each alias shares a register with a name above; the two uses do not overlap
// in time:
//  - INIT_INVERTED_KERNEL_SIZE_R and INIT_PAINTING_CONSTANTS_R are only live
//    during the setup code, before SOURCE_END_R / DESTINATION_END_R are written.
//  - INIT_SUM_R is the cursor of the initial summation loop, which finishes
//    before LEFT_R is computed.
//  - REMAINING_STRIDES_R is only used after the four-strides-at-a-time loop of
//    the alpha routine, when SOURCE_LINE_END_R is no longer needed.
78 #define INIT_INVERTED_KERNEL_SIZE_R SOURCE_END_R
79 #define INIT_PAINTING_CONSTANTS_R DESTINATION_END_R
80 #define INIT_SUM_R LEFT_R
81 #define REMAINING_STRIDES_R SOURCE_LINE_END_R
82
// NEON working registers.
//  - INVERTED_KERNEL_SIZE_Q (q0): the inverted kernel size broadcast to four lanes.
//  - SUM_Q (q1): four 32-bit running sums.
//  - PIXEL_Q (q2): scratch for the pixel data being added, subtracted or stored.
// The remaining PIXEL_* names are views of q2: d4/d5 are its two halves,
// d4[0], d4[1], d5[0], d5[1] its four 32-bit lanes, and s9 / s10 are the
// single-precision register names of lanes d4[1] / d5[0] (used with vldr/vstr,
// which can be executed conditionally).
//  - REMAINING_STRIDES_S0 (s12): holds the remaining-strides parameter of the
//    alpha routine until it is moved into an ARM register.
83 #define INVERTED_KERNEL_SIZE_Q  "q0"
84 #define SUM_Q                   "q1"
85 #define PIXEL_Q                 "q2"
86 #define PIXEL_D0                "d4"
87 #define PIXEL_D1                "d5"
88 #define PIXEL_D00               "d4[0]"
89 #define PIXEL_D01               "d4[1]"
90 #define PIXEL_S1                "s9"
91 #define PIXEL_D10               "d5[0]"
92 #define PIXEL_S2                "s10"
93 #define PIXEL_D11               "d5[1]"
94 #define REMAINING_STRIDES_S0    "s12"
95
// Registers that receive the vtbl.8 lookup tables in the all-channel routine:
// READ_RANGE is the three-register list filled by a single vld1 (24 bytes), and
// the REMAP_* names are the individual tables within it.
// Note that despite the _Q suffix these are 64-bit D registers.
96 #define READ_RANGE              "d16-d18"
97 #define REMAP_ARM_NEON1_Q       "d16"
98 #define REMAP_ARM_NEON2_Q       "d17"
99 #define REMAP_NEON_ARM_Q        "d18"
100
// neonDrawAllChannelGaussianBlur: hand-written ARM/NEON routine emitted as
// top-level assembly and exported with .globl.
//
// Register contract visible in the code:
//   r0 (SOURCE_R)      - address of the first source pixel.
//   r1 (DESTINATION_R) - address of the first destination pixel.
//   r2                 - address of the parameter block read via the *_OFFSET values.
// NOTE(review): the C declaration is not part of this file (presumably
// FEGaussianBlurNEON.h) - confirm that it passes its arguments in this order.
//
// For each line the routine keeps four 32-bit running sums (one per byte of a
// 32-bit pixel), writes sum * invertedKernelSize for every output pixel and then
// slides the window: the pixel at LEFT_R is subtracted and the pixel at RIGHT_R is
// added, each only while the address lies inside the current line.
// All address comparisons are unsigned (bcs / bcc).
//
// The byte <-> lane conversions rely on the tables loaded into d16-d18; the
// comments below assume they are the contents of s_FEGaussianBlurConstantsForNeon
// (TODO confirm against the caller that fills the parameter block).
//
// NOTE(review): the line loop and the pixel loop both test their condition at the
// bottom, so one line and one pixel are processed even if the corresponding size
// is zero - presumably the caller never passes empty input.
101 asm ( // NOLINT
102 ".globl " TOSTRING(neonDrawAllChannelGaussianBlur) NL
103 TOSTRING(neonDrawAllChannelGaussianBlur) ":" NL
        // Save r4-r8, r10, r11 and lr. r12 and the NEON registers are not saved.
104     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
        // Load the parameters while r2 still points at the parameter block.
105     "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
106     "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
107     "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
108     "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
109     "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
110     "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
111     "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
112     "ldr " INIT_PAINTING_CONSTANTS_R ", [r2, #" PAINTING_CONSTANTS_OFFSET "]" NL
113
114     // Initialize locals.
        // Scale both distances by the stride so they become byte offsets.
115     "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
116     "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
        // MAX_KERNEL_SIZE_R = min(DISTANCE_RIGHT_R, STRIDE_WIDTH_R), unsigned.
117     "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
118     "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
119     "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
        // Turn the loaded stride-line width into an absolute end address.
120     "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
        // Broadcast the 32-bit inverted kernel size to all four lanes; it is used as
        // an f32 multiplier in the blur loop.
121     "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
        // Load 24 bytes of lookup tables into d16-d18. The address writeback (!) has
        // no further effect: the register is overwritten as DESTINATION_END_R before
        // it is read again.
122     "vld1.f32 { " READ_RANGE " }, [" INIT_PAINTING_CONSTANTS_R "]!" NL
123
        // One iteration per line.
124 ".allChannelMainLoop:" NL
125
126     // Initialize the sum variable.
        // Sum the pixels in [SOURCE_R, SOURCE_R + MAX_KERNEL_SIZE_R), stepping by
        // STRIDE_R; the loop is skipped when that range is empty.
127     "vmov.u32 " SUM_Q ", #0" NL
128     "mov " INIT_SUM_R ", " SOURCE_R NL
129     "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
130     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
131     "bcs .allChannelInitSumDone" NL
132 ".allChannelInitSum:" NL
        // Load one 32-bit pixel into lane 0 of PIXEL_D0 and advance by STRIDE_R.
133     "vld1.u32 " PIXEL_D00 ", [" INIT_SUM_R "], " STRIDE_R NL
        // Spread its four bytes over four 32-bit lanes. PIXEL_D1 has to be produced
        // first because the second vtbl overwrites PIXEL_D0, the table it reads from.
134     "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON2_Q NL
135     "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON1_Q NL
136     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
137     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
138     "bcc .allChannelInitSum" NL
139 ".allChannelInitSumDone:" NL
140
141     // Blurring.
        // SOURCE_END_R / DESTINATION_END_R: end addresses of the current line.
        // LEFT_R / RIGHT_R: addresses of the pixels that are subtracted from / added
        // to the sum after each output pixel.
142     "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
143     "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
144     "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
145     "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
146
147 ".allChannelBlur:" NL
        // Scale the sums: u32 -> f32, multiply by the inverted kernel size, f32 -> u32.
148     "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
149     "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
150     "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
        // Pack the low byte of each lane into one 32-bit pixel and store it,
        // advancing DESTINATION_R by STRIDE_R.
151     "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_NEON_ARM_Q NL
152     "vst1.u32 " PIXEL_D00 ", [" DESTINATION_R "], " STRIDE_R NL
153
        // Subtract the pixel at LEFT_R unless LEFT_R < SOURCE_R.
154     "cmp " LEFT_R ", " SOURCE_R NL
155     "bcc .allChannelSkipLeft" NL
156     "vld1.u32 " PIXEL_D00 ", [" LEFT_R "]" NL
157     "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON2_Q NL
158     "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON1_Q NL
159     "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
160 ".allChannelSkipLeft: " NL
161
        // Add the pixel at RIGHT_R unless RIGHT_R >= SOURCE_END_R.
162     "cmp " RIGHT_R ", " SOURCE_END_R NL
163     "bcs .allChannelSkipRight" NL
164     "vld1.u32 " PIXEL_D00 ", [" RIGHT_R "]" NL
165     "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON2_Q NL
166     "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON1_Q NL
167     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
168 ".allChannelSkipRight: " NL
169
        // Slide the window and continue while DESTINATION_R < DESTINATION_END_R.
170     "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
171     "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
172     "cmp " DESTINATION_R ", " DESTINATION_END_R NL
173     "bcc .allChannelBlur" NL
        // Step DESTINATION_R back by STRIDE_WIDTH_R (back to the start of the line,
        // provided STRIDE_WIDTH_R is a multiple of STRIDE_R).
174     "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
175
        // Advance both pointers by STRIDE_LINE_R and loop while
        // SOURCE_R < SOURCE_LINE_END_R.
176     "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R NL
177     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R NL
178     "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
179     "bcc .allChannelMainLoop" NL
180
        // Restore the saved registers; the saved lr is popped into pc, which returns.
181     "ldmia sp!, {r4-r8, r10, r11, pc}" NL
182 ); // NOLINT
183
// DATA_TRANSFER4(command, base): emits four single-lane transfers of `command`
// (a load or a store mnemonic) for the four 32-bit lanes of PIXEL_Q. Lane k is
// transferred at base + k * STRIDE_LINE_R; the final "sub" restores `base` to its
// original value by subtracting STRIDE_LINE_R << 2.
184 #define DATA_TRANSFER4(command, base) \
185     command " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
186     command " " PIXEL_D01 ", [" base "], " STRIDE_LINE_R NL \
187     command " " PIXEL_D10 ", [" base "], " STRIDE_LINE_R NL \
188     command " " PIXEL_D11 ", [" base "], " STRIDE_LINE_R NL \
189     "sub " base ", " base ", " STRIDE_LINE_R ", lsl #2" NL
190
191 // The number of transfers depends on REMAINING_STRIDES_R, but it is always >= 1 and <= 3
// CONDITIONAL_DATA_TRANSFER4(command1, command2, base): lane 0 is always
// transferred with `command1`; lane 1 (as PIXEL_S1) only if
// REMAINING_STRIDES_R >= 2 and lane 2 (as PIXEL_S2) only if
// REMAINING_STRIDES_R >= 3, both with the conditionally executed `command2`.
// Lane 3 is never transferred, and lanes that are skipped keep their previous
// contents. `base` is restored at the end (two steps of STRIDE_LINE_R back).
// The macro changes the condition flags because it contains cmp instructions.
192 #define CONDITIONAL_DATA_TRANSFER4(command1, command2, base) \
193     command1 " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
194     "cmp " REMAINING_STRIDES_R ", #2" NL \
195     command2 "cs " PIXEL_S1 ", [" base "]" NL \
196     "add " base ", " base ", " STRIDE_LINE_R NL \
197     "cmp " REMAINING_STRIDES_R ", #3" NL \
198     command2 "cs " PIXEL_S2 ", [" base "]" NL \
199     "sub " base ", " base ", " STRIDE_LINE_R ", lsl #1" NL
200
// neonDrawAlphaChannelGaussianBlur: hand-written ARM/NEON routine emitted as
// top-level assembly and exported with .globl.
//
// Register contract visible in the code:
//   r0 (SOURCE_R)      - address of the first source pixel.
//   r1 (DESTINATION_R) - address of the first destination pixel.
//   r2                 - address of the parameter block read via the *_OFFSET values.
// NOTE(review): the C declaration is not part of this file (presumably
// FEGaussianBlurNEON.h) - confirm that it passes its arguments in this order.
//
// Only the top 8 bits of every 32-bit pixel take part (vshr #24 on load,
// vshl #24 before store); the other 24 bits of each stored pixel are zero.
// Per the routine's name those top bits are the alpha channel.
//
// The four lanes of a NEON register hold the same pixel position of four
// consecutive strides (addresses STRIDE_LINE_R apart), so the first loop handles
// four strides per iteration. The second part handles the 1-3 strides that are
// left over, using conditionally executed transfers for lanes 1 and 2.
// All address comparisons are unsigned (bcs / bcc).
201 asm ( // NOLINT
202 ".globl " TOSTRING(neonDrawAlphaChannelGaussianBlur) NL
203 TOSTRING(neonDrawAlphaChannelGaussianBlur) ":" NL
        // Save r4-r8, r10, r11 and lr. r12 and the NEON registers are not saved.
204     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
        // Load the parameters while r2 still points at the parameter block.
205     "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
206     "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
207     "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
208     "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
209     "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
210     "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
211     "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
        // The remaining-strides count is parked in s12: its ARM register
        // (REMAINING_STRIDES_R) is the same register as SOURCE_LINE_END_R, which is
        // needed until the first loop has finished.
212     "vldr.u32 " REMAINING_STRIDES_S0 ", [r2, #" REMAINING_STRIDES_OFFSET "]" NL
213
214     // Initialize locals.
        // Scale both distances by the stride so they become byte offsets.
215     "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
216     "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
        // MAX_KERNEL_SIZE_R = min(DISTANCE_RIGHT_R, STRIDE_WIDTH_R), unsigned.
217     "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
218     "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
219     "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
        // Turn the loaded stride-line width into an absolute end address.
220     "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
        // Broadcast the 32-bit inverted kernel size to all four lanes; it is used as
        // an f32 multiplier in the blur loops.
221     "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
        // If the end address equals the start address (loaded width was zero), skip
        // the four-strides-at-a-time loop entirely.
222     "cmp " SOURCE_LINE_END_R ", " SOURCE_R NL
223     "beq .alphaChannelEarlyLeave" NL
224
225     // Processing 4 strides in parallel.
226
227 ".alphaChannelMainLoop:" NL
228
229     // Initialize the sum variable.
        // Sum the values in [SOURCE_R, SOURCE_R + MAX_KERNEL_SIZE_R), stepping by
        // STRIDE_R; the loop is skipped when that range is empty.
230     "vmov.u32 " SUM_Q ", #0" NL
231     "mov " INIT_SUM_R ", " SOURCE_R NL
232     "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
233     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
234     "bcs .alphaChannelInitSumDone" NL
235 ".alphaChannelInitSum:" NL
        // Load one pixel from each of the four strides, keep only the top byte.
236     DATA_TRANSFER4("vld1.u32", INIT_SUM_R)
237     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
238     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
        // DATA_TRANSFER4 leaves its base register unchanged, so advance explicitly.
239     "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
240     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
241     "bcc .alphaChannelInitSum" NL
242 ".alphaChannelInitSumDone:" NL
243
244     // Blurring.
        // SOURCE_END_R / DESTINATION_END_R: end addresses of the current stride.
        // LEFT_R / RIGHT_R: addresses of the pixels that are subtracted from / added
        // to the sum after each output pixel.
245     "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
246     "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
247     "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
248     "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
249
250 ".alphaChannelBlur:" NL
        // Scale the sums (u32 -> f32, multiply, f32 -> u32), move the result into
        // the top byte and store one pixel to each of the four strides.
251     "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
252     "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
253     "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
254     "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
255     DATA_TRANSFER4("vst1.u32", DESTINATION_R)
256
        // Subtract the values at LEFT_R unless LEFT_R < SOURCE_R.
257     "cmp " LEFT_R ", " SOURCE_R NL
258     "bcc .alphaChannelSkipLeft" NL
259     DATA_TRANSFER4("vld1.u32", LEFT_R)
260     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
261     "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
262 ".alphaChannelSkipLeft: " NL
263
        // Add the values at RIGHT_R unless RIGHT_R >= SOURCE_END_R.
264     "cmp " RIGHT_R ", " SOURCE_END_R NL
265     "bcs .alphaChannelSkipRight" NL
266     DATA_TRANSFER4("vld1.u32", RIGHT_R)
267     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
268     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
269 ".alphaChannelSkipRight: " NL
270
        // Advance to the next pixel and continue while
        // DESTINATION_R < DESTINATION_END_R.
271     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
272     "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
273     "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
274     "cmp " DESTINATION_R ", " DESTINATION_END_R NL
275     "bcc .alphaChannelBlur" NL
        // Step DESTINATION_R back by STRIDE_WIDTH_R (back to the start of the stride,
        // provided STRIDE_WIDTH_R is a multiple of STRIDE_R).
276     "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
277
        // Move on by four strides and loop while SOURCE_R < SOURCE_LINE_END_R.
278     "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R ", lsl #2" NL
279     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R ", lsl #2" NL
280     "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
281     "bcc .alphaChannelMainLoop" NL
282
283     // Processing the remaining strides (0 - 3).
284 ".alphaChannelEarlyLeave:" NL
        // SOURCE_LINE_END_R is no longer needed, so its register now receives the
        // remaining-strides count that was parked in s12.
285     "vmov.u32 " REMAINING_STRIDES_R ", " REMAINING_STRIDES_S0 NL
286     // Early return for 0 strides.
        // NOTE(review): "ldmeqia" is the pre-UAL spelling of a conditional ldmia
        // (condition before the addressing-mode suffix); confirm the toolchain
        // assembles this file in ARM mode with the divided syntax.
287     "cmp " REMAINING_STRIDES_R ", #0" NL
288     "ldmeqia sp!, {r4-r8, r10, r11, pc}" NL
289
290     // Initialize the sum variable.
        // Same summation as in the first part, but only the lanes selected by
        // REMAINING_STRIDES_R are loaded; the other lanes carry stale values that
        // are never stored.
291     "vmov.u32 " SUM_Q ", #0" NL
292     "mov " INIT_SUM_R ", " SOURCE_R NL
293     "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
294     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
295     "bcs .alphaChannelSecondInitSumDone" NL
296 ".alphaChannelSecondInitSum:" NL
297     CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", INIT_SUM_R)
298     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
299     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
300     "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
301     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
302     "bcc .alphaChannelSecondInitSum" NL
303 ".alphaChannelSecondInitSumDone:" NL
304
305     // Blurring.
306     "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
307     "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
308     "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
309     "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
310
311 ".alphaChannelSecondBlur:" NL
        // Scale, shift into the top byte and store to the remaining strides only.
312     "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
313     "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
314     "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
315     "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
316     CONDITIONAL_DATA_TRANSFER4("vst1.u32", "vstr", DESTINATION_R)
317
        // Subtract the values at LEFT_R unless LEFT_R < SOURCE_R.
318     "cmp " LEFT_R ", " SOURCE_R NL
319     "bcc .alphaChannelSecondSkipLeft" NL
320     CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LEFT_R)
321     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
322     "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
323 ".alphaChannelSecondSkipLeft: " NL
324
        // Add the values at RIGHT_R unless RIGHT_R >= SOURCE_END_R.
325     "cmp " RIGHT_R ", " SOURCE_END_R NL
326     "bcs .alphaChannelSecondSkipRight" NL
327     CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", RIGHT_R)
328     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
329     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
330 ".alphaChannelSecondSkipRight: " NL
331
        // Advance to the next pixel and continue while
        // DESTINATION_R < DESTINATION_END_R. The pointers are not rewound afterwards
        // because the routine returns right after this loop.
332     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
333     "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
334     "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
335     "cmp " DESTINATION_R ", " DESTINATION_END_R NL
336     "bcc .alphaChannelSecondBlur" NL
337
        // Restore the saved registers; the saved lr is popped into pc, which returns.
338     "ldmia sp!, {r4-r8, r10, r11, pc}" NL
339 ); // NOLINT
340
341 } // namespace WebCore
342
343 #endif // CPU(ARM_NEON) && COMPILER(GCC)