2 * Copyright (C) 2011 University of Szeged
3 * Copyright (C) 2011 Zoltan Herczeg
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "FEGaussianBlurNEON.h"
30 #if CPU(ARM_NEON) && COMPILER(GCC)
32 #include <wtf/Alignment.h>
// Byte-shuffle index tables for the NEON vtbl.8 instruction, kept in a 16-byte aligned array (24 bytes).
// The all-channel routine in this file loads 24 bytes into d16-d18 through a pointer taken from its
// parameter block; presumably that pointer refers to this table -- confirm against the caller.
// Index 16 is out of range for the 8- and 16-byte tables used with vtbl.8 in this file, so the
// corresponding result bytes are zero.
36 static WTF_ALIGNED(unsigned char, s_FEGaussianBlurConstantsForNeon[], 16) = {
37 // Mapping from ARM to NEON registers.
// The first 8 indices place source bytes 0 and 1 in the low byte of two 32-bit lanes; the next 8
// do the same for source bytes 2 and 3. All other result bytes are zero.
38 0, 16, 16, 16, 1, 16, 16, 16, 2, 16, 16, 16, 3, 16, 16, 16,
39 // Mapping from NEON to ARM registers.
// Gathers bytes 0, 4, 8 and 12 (the low byte of four 32-bit lanes) into the first four result bytes;
// the remaining four result bytes are zero.
40 0, 4, 8, 12, 16, 16, 16, 16
// Returns a pointer to the first byte of the vtbl.8 index tables stored in
// s_FEGaussianBlurConstantsForNeon.
43 unsigned char* feGaussianBlurConstantsForNeon()
45 return s_FEGaussianBlurConstantsForNeon;
// Two-step stringification: TOSTRING macro-expands its argument first, then ASSTRING turns the
// result into a string literal so it can be concatenated into the assembly text.
48 #define ASSTRING(str) #str
49 #define TOSTRING(value) ASSTRING(value)
// Byte offsets of the 32-bit fields in the parameter block that both assembly routines address
// through r2. Usage as seen in the routines below:
//   STRIDE              - added to the pixel pointers to step along the line being blurred.
//   STRIDE_WIDTH        - added to the line start to obtain the end of that line.
//   STRIDE_LINE         - added to the source/destination pointers to move to the next line.
//   STRIDE_LINE_WIDTH   - added to the source pointer to obtain the end address of the outer loop.
//   REMAINING_STRIDES   - read only by the alpha routine: lines left after the 4-at-a-time loop.
//   DISTANCE_LEFT/RIGHT - multiplied by STRIDE to get the byte distance to the window edges.
//   INVERTED_KERNEL_SIZE- 32-bit value duplicated into q0 and used as a float multiplier
//                         (presumably 1 / kernel size -- confirm against the caller).
//   PAINTING_CONSTANTS  - read only by the all-channel routine: pointer from which d16-d18 are loaded.
51 #define STRIDE_OFFSET TOSTRING(0)
52 #define STRIDE_WIDTH_OFFSET TOSTRING(4)
53 #define STRIDE_LINE_OFFSET TOSTRING(8)
54 #define STRIDE_LINE_WIDTH_OFFSET TOSTRING(12)
55 #define REMAINING_STRIDES_OFFSET TOSTRING(16)
56 #define DISTANCE_LEFT_OFFSET TOSTRING(20)
57 #define DISTANCE_RIGHT_OFFSET TOSTRING(24)
58 #define INVERTED_KERNEL_SIZE_OFFSET TOSTRING(28)
59 #define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
62 // Register allocation.
// Each macro is a string literal naming an ARM core register or a NEON/VFP register, so it can be
// concatenated directly into the instruction strings.
// NOTE(review): SOURCE_R, STRIDE_R, LEFT_R, RIGHT_R, SUM_Q, PIXEL_Q, PIXEL_D0, PIXEL_D1, PIXEL_S1 and
// NL are used by the code below but are not defined in the lines visible here -- confirm they are
// defined nearby or in an included header.
64 #define DESTINATION_R "r1"
67 #define SOURCE_END_R "r4"
68 #define DESTINATION_END_R "r5"
70 #define STRIDE_WIDTH_R "r7"
71 #define STRIDE_LINE_R "r8"
72 #define SOURCE_LINE_END_R "r10"
73 #define DISTANCE_LEFT_R "r11"
74 #define DISTANCE_RIGHT_R "r12"
75 #define MAX_KERNEL_SIZE_R "lr"
// Aliases that reuse a register during a phase in which its primary role is not live:
// the INIT_* names are consumed during setup / sum initialisation before the aliased register is
// given its loop value, and REMAINING_STRIDES_R takes over SOURCE_LINE_END_R once the
// 4-lines-at-a-time loop of the alpha routine has finished.
78 #define INIT_INVERTED_KERNEL_SIZE_R SOURCE_END_R
79 #define INIT_PAINTING_CONSTANTS_R DESTINATION_END_R
80 #define INIT_SUM_R LEFT_R
81 #define REMAINING_STRIDES_R SOURCE_LINE_END_R
// q0 holds the multiplier applied to the running sum, duplicated into all four lanes.
83 #define INVERTED_KERNEL_SIZE_Q "q0"
// Individual 32-bit lanes of d4/d5. s10 is the single-precision view of the low half of d5,
// i.e. the same storage as d5[0]; it is used where vldr/vstr need an S register.
88 #define PIXEL_D00 "d4[0]"
89 #define PIXEL_D01 "d4[1]"
91 #define PIXEL_D10 "d5[0]"
92 #define PIXEL_S2 "s10"
93 #define PIXEL_D11 "d5[1]"
// The alpha routine parks the remaining-strides count in s12 while r10 is busy as SOURCE_LINE_END_R.
94 #define REMAINING_STRIDES_S0 "s12"
// d16-d18 receive the 24 bytes of vtbl.8 index tables. Despite the _Q suffix, the three REMAP_*
// names below are 64-bit D registers.
96 #define READ_RANGE "d16-d18"
97 #define REMAP_ARM_NEON1_Q "d16"
98 #define REMAP_ARM_NEON2_Q "d17"
99 #define REMAP_NEON_ARM_Q "d18"
// neonDrawAllChannelGaussianBlur: sliding-window average over 32-bit pixels, handling all four byte
// channels of a pixel at once (one channel per 32-bit NEON lane).
// As far as the visible lines show, r1 is the destination pointer and r2 points to the parameter
// block described by the *_OFFSET constants. NOTE(review): SOURCE_R is not defined in the visible
// lines -- presumably r0; confirm.
102 ".globl " TOSTRING(neonDrawAllChannelGaussianBlur) NL
103 TOSTRING(neonDrawAllChannelGaussianBlur) ":" NL
// Save the core registers overwritten below plus the return address. r12 is used as well but is
// not saved (it is a scratch register under the ARM procedure call standard).
104 "stmdb sp!, {r4-r8, r10, r11, lr}" NL
// Load the parameters from the block addressed by r2.
105 "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
106 "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
107 "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
108 "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
109 "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
110 "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
111 "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
112 "ldr " INIT_PAINTING_CONSTANTS_R ", [r2, #" PAINTING_CONSTANTS_OFFSET "]" NL
114 // Initialize locals.
// Convert both window distances to byte distances.
115 "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
116 "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
// MAX_KERNEL_SIZE_R = unsigned min(DISTANCE_RIGHT_R, STRIDE_WIDTH_R): the initial sum never reads
// past the end of the line.
117 "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
118 "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
119 "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
// Turn the loaded span into the end address of the outer loop.
120 "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
// Replicate the multiplier into all four lanes of q0, then load the 24 bytes of vtbl index tables.
121 "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
122 "vld1.f32 { " READ_RANGE " }, [" INIT_PAINTING_CONSTANTS_R "]!" NL
// Outer loop: one iteration per line.
124 ".allChannelMainLoop:" NL
126 // Initialize the sum variable.
// Accumulate the pixels in [SOURCE_R, SOURCE_R + MAX_KERNEL_SIZE_R), stepping by STRIDE_R.
// The loop is skipped entirely when that range is empty.
127 "vmov.u32 " SUM_Q ", #0" NL
128 "mov " INIT_SUM_R ", " SOURCE_R NL
129 "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
130 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
131 "bcs .allChannelInitSumDone" NL
132 ".allChannelInitSum:" NL
// Load one 32-bit pixel (post-incrementing by STRIDE_R), spread its four bytes over four 32-bit
// lanes with the two ARM->NEON index tables, and add them to the per-channel sums.
133 "vld1.u32 " PIXEL_D00 ", [" INIT_SUM_R "], " STRIDE_R NL
134 "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON2_Q NL
135 "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON1_Q NL
136 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
137 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
138 "bcc .allChannelInitSum" NL
139 ".allChannelInitSumDone:" NL
// Set up the inner loop: end addresses of this line and the trailing (left) / leading (right)
// window edges. From here on r4/r5 hold their primary SOURCE_END_R / DESTINATION_END_R roles.
142 "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
143 "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
144 "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
145 "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
// Inner loop: one output pixel per iteration.
147 ".allChannelBlur:" NL
// Scale the integer sums in floating point, convert back to unsigned integers, pack the low byte
// of each lane into one 32-bit word and store it (DESTINATION_R post-increments by STRIDE_R).
148 "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
149 "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
150 "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
151 "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_NEON_ARM_Q NL
152 "vst1.u32 " PIXEL_D00 ", [" DESTINATION_R "], " STRIDE_R NL
// Remove the pixel leaving the window, but only once LEFT_R has reached the start of the line
// (unsigned LEFT_R >= SOURCE_R).
154 "cmp " LEFT_R ", " SOURCE_R NL
155 "bcc .allChannelSkipLeft" NL
156 "vld1.u32 " PIXEL_D00 ", [" LEFT_R "]" NL
157 "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON2_Q NL
158 "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON1_Q NL
159 "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
160 ".allChannelSkipLeft: " NL
// Add the pixel entering the window, but only while RIGHT_R is still inside the line
// (unsigned RIGHT_R < SOURCE_END_R).
162 "cmp " RIGHT_R ", " SOURCE_END_R NL
163 "bcs .allChannelSkipRight" NL
164 "vld1.u32 " PIXEL_D00 ", [" RIGHT_R "]" NL
165 "vtbl.8 " PIXEL_D1 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON2_Q NL
166 "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "}, " REMAP_ARM_NEON1_Q NL
167 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
168 ".allChannelSkipRight: " NL
// Slide the window and repeat until the destination line is complete.
170 "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
171 "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
172 "cmp " DESTINATION_R ", " DESTINATION_END_R NL
173 "bcc .allChannelBlur" NL
// Rewind DESTINATION_R to the start of the line it just wrote.
174 "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
// Advance both pointers to the next line and loop while SOURCE_R is below the end address.
176 "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R NL
177 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R NL
178 "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
179 "bcc .allChannelMainLoop" NL
// Restore the saved registers and return by loading the saved lr into pc.
181 "ldmia sp!, {r4-r8, r10, r11, pc}" NL
// Emits four single-lane transfers (`command` is the load or store mnemonic) between the lanes
// d4[0], d4[1], d5[0], d5[1] and four addresses spaced STRIDE_LINE_R bytes apart, starting at `base`.
// Each transfer post-increments `base`; the final sub rewinds it by 4 * STRIDE_LINE_R, so `base`
// has its original value afterwards.
// Keep // comments out of the definition itself: the line continuations would splice the following
// line into the comment.
184 #define DATA_TRANSFER4(command, base) \
185 command " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
186 command " " PIXEL_D01 ", [" base "], " STRIDE_LINE_R NL \
187 command " " PIXEL_D10 ", [" base "], " STRIDE_LINE_R NL \
188 command " " PIXEL_D11 ", [" base "], " STRIDE_LINE_R NL \
189 "sub " base ", " base ", " STRIDE_LINE_R ", lsl #2" NL
191 // The number of transfers depends on REMAINING_STRIDES_R, which is always >= 1 and <= 3 here.
// command1 transfers lane d4[0] unconditionally and post-increments `base` by STRIDE_LINE_R.
// command2 gets the "cs" condition suffix, so the second and third transfers happen only when
// REMAINING_STRIDES_R >= 2 and >= 3 respectively (unsigned compare). `base` is advanced twice in
// total and then rewound by 2 * STRIDE_LINE_R, so it has its original value afterwards.
// NOTE(review): PIXEL_S1 is not defined in the visible lines; PIXEL_S2 is s10 (same storage as
// d5[0]), so PIXEL_S1 is presumably the S register that aliases d4[1] -- confirm.
192 #define CONDITIONAL_DATA_TRANSFER4(command1, command2, base) \
193 command1 " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
194 "cmp " REMAINING_STRIDES_R ", #2" NL \
195 command2 "cs " PIXEL_S1 ", [" base "]" NL \
196 "add " base ", " base ", " STRIDE_LINE_R NL \
197 "cmp " REMAINING_STRIDES_R ", #3" NL \
198 command2 "cs " PIXEL_S2 ", [" base "]" NL \
199 "sub " base ", " base ", " STRIDE_LINE_R ", lsl #1" NL
// neonDrawAlphaChannelGaussianBlur: sliding-window average of the top byte (bits 24-31) of each
// 32-bit pixel. Four lines are processed in parallel, one per 32-bit NEON lane; up to three
// leftover lines are handled by a second copy of the loop that uses conditional transfers.
// Every stored word has its low 24 bits cleared (the result is shifted left by 24 before the store).
// As far as the visible lines show, r1 is the destination pointer and r2 points to the parameter
// block. NOTE(review): SOURCE_R is not defined in the visible lines -- presumably r0; confirm.
202 ".globl " TOSTRING(neonDrawAlphaChannelGaussianBlur) NL
203 TOSTRING(neonDrawAlphaChannelGaussianBlur) ":" NL
// Save the core registers overwritten below plus the return address (r12 is used but not saved).
204 "stmdb sp!, {r4-r8, r10, r11, lr}" NL
// Load the parameters from the block addressed by r2. The remaining-strides count goes to s12
// because r10 is needed as SOURCE_LINE_END_R until the main loop has finished.
205 "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
206 "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
207 "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
208 "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
209 "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
210 "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
211 "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
212 "vldr.u32 " REMAINING_STRIDES_S0 ", [r2, #" REMAINING_STRIDES_OFFSET "]" NL
214 // Initialize locals.
// Byte distances to the window edges, and MAX_KERNEL_SIZE_R = unsigned
// min(DISTANCE_RIGHT_R, STRIDE_WIDTH_R) so the initial sum stays inside the line.
215 "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
216 "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
217 "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
218 "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
219 "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
220 "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
221 "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
// Skip the 4-lines-at-a-time loop when its address range is empty.
222 "cmp " SOURCE_LINE_END_R ", " SOURCE_R NL
223 "beq .alphaChannelEarlyLeave" NL
225 // Processing 4 strides parallelly.
227 ".alphaChannelMainLoop:" NL
229 // Initialize the sum variable.
// Accumulate the top bytes of the pixels in [SOURCE_R, SOURCE_R + MAX_KERNEL_SIZE_R) for each of
// the four lines. DATA_TRANSFER4 leaves INIT_SUM_R unchanged, so it is advanced explicitly.
230 "vmov.u32 " SUM_Q ", #0" NL
231 "mov " INIT_SUM_R ", " SOURCE_R NL
232 "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
233 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
234 "bcs .alphaChannelInitSumDone" NL
235 ".alphaChannelInitSum:" NL
236 DATA_TRANSFER4("vld1.u32", INIT_SUM_R)
237 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
238 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
239 "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
240 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
241 "bcc .alphaChannelInitSum" NL
242 ".alphaChannelInitSumDone:" NL
// Set up the inner loop: line end addresses and the trailing / leading window edges.
245 "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
246 "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
247 "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
248 "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
// Inner loop: one output pixel per line per iteration.
250 ".alphaChannelBlur:" NL
// Scale the sums in floating point, convert back to unsigned integers, move the result into the
// top byte and store one word to each of the four destination lines.
251 "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
252 "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
253 "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
254 "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
255 DATA_TRANSFER4("vst1.u32", DESTINATION_R)
// Remove the pixels leaving the window once LEFT_R has reached the start of the line.
257 "cmp " LEFT_R ", " SOURCE_R NL
258 "bcc .alphaChannelSkipLeft" NL
259 DATA_TRANSFER4("vld1.u32", LEFT_R)
260 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
261 "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
262 ".alphaChannelSkipLeft: " NL
// Add the pixels entering the window while RIGHT_R is still inside the line.
264 "cmp " RIGHT_R ", " SOURCE_END_R NL
265 "bcs .alphaChannelSkipRight" NL
266 DATA_TRANSFER4("vld1.u32", RIGHT_R)
267 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
268 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
269 ".alphaChannelSkipRight: " NL
// Unlike the all-channel routine, the stores above do not advance DESTINATION_R, so it is stepped
// here together with the window edges.
271 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
272 "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
273 "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
274 "cmp " DESTINATION_R ", " DESTINATION_END_R NL
275 "bcc .alphaChannelBlur" NL
// Rewind DESTINATION_R to the start of the line, then move both pointers four lines ahead.
276 "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
278 "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R ", lsl #2" NL
279 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R ", lsl #2" NL
280 "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
281 "bcc .alphaChannelMainLoop" NL
283 // Processing the remaining strides (0 - 3).
284 ".alphaChannelEarlyLeave:" NL
// r10 is free now, so bring the leftover line count back from s12.
285 "vmov.u32 " REMAINING_STRIDES_R ", " REMAINING_STRIDES_S0 NL
286 // Early return for 0 strides.
287 "cmp " REMAINING_STRIDES_R ", #0" NL
288 "ldmeqia sp!, {r4-r8, r10, r11, pc}" NL
290 // Initialize the sum variable.
// Same structure as the main loop above, using conditional transfers. Lanes that are not loaded
// keep whatever value they held before; they still take part in the arithmetic, but the matching
// conditional stores never write them out.
291 "vmov.u32 " SUM_Q ", #0" NL
292 "mov " INIT_SUM_R ", " SOURCE_R NL
293 "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
294 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
295 "bcs .alphaChannelSecondInitSumDone" NL
296 ".alphaChannelSecondInitSum:" NL
297 CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", INIT_SUM_R)
298 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
299 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
300 "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
301 "cmp " INIT_SUM_R ", " SOURCE_END_R NL
302 "bcc .alphaChannelSecondInitSum" NL
303 ".alphaChannelSecondInitSumDone:" NL
// Set up the inner loop for the leftover lines.
306 "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
307 "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
308 "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
309 "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
311 ".alphaChannelSecondBlur:" NL
// Scale, convert, move into the top byte and store to the 1-3 leftover destination lines.
312 "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
313 "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
314 "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
315 "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
316 CONDITIONAL_DATA_TRANSFER4("vst1.u32", "vstr", DESTINATION_R)
// Remove the pixels leaving the window once LEFT_R has reached the start of the line.
318 "cmp " LEFT_R ", " SOURCE_R NL
319 "bcc .alphaChannelSecondSkipLeft" NL
320 CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LEFT_R)
321 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
322 "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
323 ".alphaChannelSecondSkipLeft: " NL
// Add the pixels entering the window while RIGHT_R is still inside the line.
325 "cmp " RIGHT_R ", " SOURCE_END_R NL
326 "bcs .alphaChannelSecondSkipRight" NL
327 CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", RIGHT_R)
328 "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
329 "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
330 ".alphaChannelSecondSkipRight: " NL
// Slide the window and repeat until the destination line is complete.
332 "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
333 "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
334 "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
335 "cmp " DESTINATION_R ", " DESTINATION_END_R NL
336 "bcc .alphaChannelSecondBlur" NL
// Restore the saved registers and return by loading the saved lr into pc.
338 "ldmia sp!, {r4-r8, r10, r11, pc}" NL
341 } // namespace WebCore
343 #endif // CPU(ARM_NEON) && COMPILER(GCC)