code.vuplus.com Git - vuplus_webkit/blob - Source/WebCore/platform/graphics/filters/arm/FELightingNEON.cpp

   1 /*
   2  * Copyright (C) 2011 University of Szeged
   3  * Copyright (C) 2011 Zoltan Herczeg
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
  15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
  18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include "config.h"
  28 #include "FELightingNEON.h"
  29
  30 #if CPU(ARM_NEON) && COMPILER(GCC)
  31
  32 #include <wtf/Alignment.h>
  33
  34 namespace WebCore {
  35
  36 // These constants are copied to the following SIMD registers:
  37 //   ALPHAX_Q ALPHAY_Q REMAPX_D REMAPY_D
  38
  39 static WTF_ALIGNED(short, s_FELightingConstantsForNeon[], 16) = {
         // Constant table handed to the assembly routine through
         // feLightingConstantsForNeon(). The assembly loads the first 16 shorts into
         // ALPHAX_Q/ALPHAY_Q and the following 8 shorts into REMAPX_D/REMAPY_D, so the
         // order and count of the entries must not change.
  40     // Alpha coefficients: first row is multiplied for the X component of the
         // normal vector, second row for the Y component.
  41     -2, 1, 0, -1, 2, 1, 0, -1,
  42     0, -1, -2, -1, 0, 1, 2, 1,
  43     // Remapping indices. Each 16-bit entry packs two byte indices that vtbl.8
         // uses to gather one 16-bit alpha value from the 24-byte table formed by
         // the TOP/MIDDLE/BOTTOM row registers.
         // NOTE(review): the pairing of low/high byte assumes little-endian storage - confirm.
  44     0x0f0e, 0x0302, 0x0504, 0x0706,
  45     0x0b0a, 0x1312, 0x1514, 0x1716,
  46 };
  47
  48 short* feLightingConstantsForNeon()
  49 {
  50     return s_FELightingConstantsForNeon;
  51 }
  52
  53 #if ENABLE(PARALLEL_JOBS)
     // Only compiled when PARALLEL_JOBS is enabled. Forwards the painting data,
     // unchanged, to neonDrawLighting (the routine emitted by the asm block in
     // this file).
     // NOTE(review): presumably used as the per-job worker callback - confirm against the caller.
  54 void FELighting::platformApplyNeonWorker(FELightingPaintingDataForNeon* parameters)
  55 {
  56     neonDrawLighting(parameters);
  57 }
  58 #endif
  59
  60 #define ASSTRING(str) #str
     // ASSTRING stringifies its argument as written; TOSTRING macro-expands the
     // argument first and then stringifies the result, so it can be used with
     // other macros (e.g. the FLAG_* names) as well as with literals.
  61 #define TOSTRING(value) ASSTRING(value)
  62
     // Byte offsets, as string literals, that the assembly adds to PAINTING_DATA_R
     // to read each field of the painting data.
     // NOTE(review): these presumably mirror the member layout of
     // FELightingPaintingDataForNeon with 4-byte fields - confirm against the header.
  63 #define PIXELS_OFFSET TOSTRING(0)
  64 #define YSTART_OFFSET TOSTRING(4)
  65 #define WIDTH_OFFSET TOSTRING(8)
  66 #define HEIGHT_OFFSET TOSTRING(12)
  67 #define FLAGS_OFFSET TOSTRING(16)
  68 #define SPECULAR_EXPONENT_OFFSET TOSTRING(20)
  69 #define CONE_EXPONENT_OFFSET TOSTRING(24)
  70 #define FLOAT_ARGUMENTS_OFFSET TOSTRING(28)
  71 #define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
     // Line terminator appended to every assembly statement string.
  72 #define NL "\n"
  73
  74 // Register allocation
     // Every macro below expands to the textual name of an ARM core, VFP or NEON
     // register, so the assembly strings can refer to registers by role.
  75 #define PAINTING_DATA_R       "r11"
     // Shares r11 with PAINTING_DATA_R: the assembly overwrites r11 with the row
     // width once every field of the painting data has been loaded.
  76 #define RESET_WIDTH_R         PAINTING_DATA_R
  77 #define PIXELS_R              "r4"
  78 #define WIDTH_R               "r5"
  79 #define HEIGHT_R              "r6"
  80 #define FLAGS_R               "r7"
  81 #define SPECULAR_EXPONENT_R   "r8"
  82 #define CONE_EXPONENT_R       "r10"
  83 #define SCANLINE_R            "r12"
  84
     // Three scratch quad registers. For each TMPn the _Q name, its two _D halves
     // and its four _S lanes alias the same storage (q0 = d0,d1 = s0..s3, and so
     // on), which is what lets the macros address a vector lane by lane.
  85 #define TMP1_Q                "q0"
  86 #define TMP1_D0               "d0"
  87 #define TMP1_S0               "s0"
  88 #define TMP1_S1               "s1"
  89 #define TMP1_D1               "d1"
  90 #define TMP1_S2               "s2"
  91 #define TMP1_S3               "s3"
  92 #define TMP2_Q                "q1"
  93 #define TMP2_D0               "d2"
  94 #define TMP2_S0               "s4"
  95 #define TMP2_S1               "s5"
  96 #define TMP2_D1               "d3"
  97 #define TMP2_S2               "s6"
  98 #define TMP2_S3               "s7"
  99 #define TMP3_Q                "q2"
 100 #define TMP3_D0               "d4"
 101 #define TMP3_S0               "s8"
 102 #define TMP3_S1               "s9"
 103 #define TMP3_D1               "d5"
 104 #define TMP3_S2               "s10"
 105 #define TMP3_S3               "s11"
 106
     // Scalars used by the spot light test and by the POWF macros.
 107 #define COSINE_OF_ANGLE       "s12"
 108 #define POWF_INT_S            "s13"
 109 #define POWF_FRAC_S           "s14"
     // Copy of COLOR_Q taken before the main loop; COLOR_Q itself is recomputed
     // from it for every spot light pixel.
 110 #define SPOT_COLOR_Q          "q4"
 111
 112 // The clamping is done with vmax/vmin on whole D registers, so CONST_ZERO_S
 113 // and CONST_ONE_S must occupy the same half (the upper lane: s23 of d11, s31
     // of d15) as the values being clamped (TMP2_S1 and TMP3_S1).
 114
 115 // Current pixel position
     // q5 = s20..s23: x, y, z of the pixel plus the constant zero in the top lane.
 116 #define POSITION_Q            "q5"
 117 #define POSITION_X_S          "s20"
 118 #define POSITION_Y_S          "s21"
 119 #define POSITION_Z_S          "s22"
 120 #define CONST_ZERO_HI_D       "d11"
 121 #define CONST_ZERO_S          "s23"
 122
 123 // -------------------------------
 124 //     Variable arguments
 125 // Misc arguments
     // The READn_RANGE register lists are filled by consecutive vld1 loads from the
     // float-argument block: READ1 covers s24..s31, READ2 covers q8-q9 and READ3
     // covers q10, so the order of the names below follows the memory layout.
 126 #define READ1_RANGE           "d12-d15"
 127 #define READ2_RANGE           "d16-d19"
 128 #define READ3_RANGE           "d20-d21"
 129
 130 #define SCALE_S               "s24"
 131 #define SCALE_DIV4_S          "s25"
 132 #define DIFFUSE_CONST_S       "s26"
 133
 134 // Spot light cone parameters, followed by the constant one
     // NOTE(review): CONST_ONE_S is loaded together with the other float arguments;
     // presumably the caller stores 1.0 there - confirm.
 135 #define CONE_CUT_OFF_S        "s28"
 136 #define CONE_FULL_LIGHT_S     "s29"
 137 #define CONE_CUT_OFF_RANGE_S  "s30"
 138 #define CONST_ONE_HI_D        "d15"
 139 #define CONST_ONE_S           "s31"
 140
     // Light source vector, spot light direction and light color.
 141 #define LIGHT_Q               "q8"
 142 #define DIRECTION_Q           "q9"
 143 #define COLOR_Q               "q10"
 144 // -------------------------------
 145 //    Constant coefficients
     // READ4 and READ5 are filled from the constant table: 16 shorts of alpha
     // coefficients (q11, q12) followed by 8 shorts of vtbl indices (d26, d27).
 146 #define READ4_RANGE           "d22-d25"
 147 #define READ5_RANGE           "d26-d27"
 148
 149 #define ALPHAX_Q              "q11"
 150 #define ALPHAY_Q              "q12"
 151 #define REMAPX_D              "d26"
 152 #define REMAPY_D              "d27"
 153 // -------------------------------
 154
     // Alpha values of the last three pixels of the row above, the current row and
     // the row below. ALL_ROWS_D is the register list used as the vtbl.8 table.
 155 #define ALL_ROWS_D            "{d28,d29,d30}"
 156 #define TOP_ROW_D             "d28"
 157 #define MIDDLE_ROW_D          "d29"
 158 #define BOTTOM_ROW_D          "d30"
 159
     // GET_LENGTH(source, temp): squares the lanes of source##_Q into temp##_Q, sums
     // lanes 0..2 and stores the square root of that sum in source##_S3. Lanes 0..2
     // of source are left untouched; temp is clobbered.
 160 #define GET_LENGTH(source, temp) \
 161     "vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
 162     "vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
 163     "vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
 164     "vsqrt.f32 " source##_S3 ", " source##_S3 NL
 165
 166 // DOT_PRODUCT(destination, source1, source2): multiplies the two vectors lane
     // by lane and accumulates lanes 0..2 into destination##_S0. Lane 3 is not part
     // of the sum, so when both sources keep their length in lane 3,
     // destination##_S3 ends up holding the product of the two lengths.
 167 #define DOT_PRODUCT(destination, source1, source2) \
 168     "vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
 169     "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
 170     "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL
 171
     // MULTIPLY_BY_DIFFUSE_CONST(a, b): writes the light strength to TMP2_S1.
     // When FLAG_DIFFUSE_CONST_IS_1 is clear the result is DIFFUSE_CONST_S * a / b,
     // otherwise the multiplication is skipped and the result is a / b.
     // Overwrites the condition flags.
 172 #define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
 173     "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
 174     "vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
 175     "vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
 176     "vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL
 177
     // POWF_SQR: one step of the integer part of POWF. If the exponent bit `current`
     // is set, value is multiplied by POWF_INT_S; if any of the higher bits in
     // `remaining` is set, POWF_INT_S is squared for the next step.
 178 #define POWF_SQR(value, exponent, current, remaining) \
 179     "tst " exponent ", #" ASSTRING(current) NL \
 180     "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
 181     "tst " exponent ", #" ASSTRING(remaining) NL \
 182     "vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL
 183
     // POWF_SQRT: one step of the fractional part of POWF. If any bit in `remaining`
     // (the current bit and the lower ones) is set, POWF_FRAC_S is replaced by its
     // square root; if the bit `current` is set, value is multiplied by it.
 184 #define POWF_SQRT(value, exponent, current, remaining) \
 185     "tst " exponent ", #" ASSTRING(remaining) NL \
 186     "vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
 187     "tst " exponent ", #" ASSTRING(current) NL \
 188     "vmulne.f32 " value ", " value ", " POWF_FRAC_S NL
 189
 190 // This simplified powf function is sufficiently accurate.
     // POWF(value, exponent): replaces the VFP register `value` by value raised to
     // the power encoded in the core register `exponent`. The encoding is the
     // 12-bit one produced by FELighting::getPowerCoefficients(): bits 0xfc0 select
     // the powers 1, 2, 4, 8, 16, 32 (obtained by repeated squaring) and bits 0x03f
     // select the powers 1/2, 1/4, ..., 1/64 (obtained by repeated square roots).
     // Clobbers POWF_INT_S, POWF_FRAC_S and the condition flags.
 191 #define POWF(value, exponent) \
 192     "tst " exponent ", #0xfc0" NL \
 193     "vmovne.f32 " POWF_INT_S ", " value NL \
 194     "tst " exponent ", #0x03f" NL \
 195     "vmovne.f32 " POWF_FRAC_S ", " value NL \
 196     "vmov.f32 " value ", " CONST_ONE_S NL \
 197     \
 198     POWF_SQR(value, exponent, 0x040, 0xf80) \
 199     POWF_SQR(value, exponent, 0x080, 0xf00) \
 200     POWF_SQR(value, exponent, 0x100, 0xe00) \
 201     POWF_SQR(value, exponent, 0x200, 0xc00) \
 202     POWF_SQR(value, exponent, 0x400, 0x800) \
 203     "tst " exponent ", #0x800" NL \
 204     "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
 205     \
 206     POWF_SQRT(value, exponent, 0x20, 0x3f) \
 207     POWF_SQRT(value, exponent, 0x10, 0x1f) \
 208     POWF_SQRT(value, exponent, 0x08, 0x0f) \
 209     POWF_SQRT(value, exponent, 0x04, 0x07) \
 210     POWF_SQRT(value, exponent, 0x02, 0x03) \
 211     POWF_SQRT(value, exponent, 0x01, 0x01)
 212
 213 // The following algorithm is an ARM NEON optimized version of
 214 // the main loop found in FELighting.cpp. Since the whole code
 215 // was redesigned to be as efficient as possible (ARM-specific
 216 // thinking), it is four times faster than its C++ counterpart.
 217
 218 asm ( // NOLINT
     // Emits the global symbol neonDrawLighting as hand-written ARM/NEON assembly.
     // On entry r0 holds the address of the painting data; its fields are read
     // through the *_OFFSET byte offsets. The routine walks the pixel buffer row by
     // row and overwrites the first three bytes of every visited pixel with the
     // computed light color.
 219 ".globl " TOSTRING(neonDrawLighting) NL
 220 TOSTRING(neonDrawLighting) ":" NL
 221     // Because of the clever register allocation, nothing is stored on the stack
 222     // except the saved registers.
 223     // Stack must be aligned to 8 bytes.
 224     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
 225     "vstmdb sp!, {d8-d15}" NL
 226     "mov " PAINTING_DATA_R ", r0" NL
 227
 228     // The following two arguments are loaded to SIMD registers.
 229     "ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
 230     "ldr r1, [" PAINTING_DATA_R ", #" PAINTING_CONSTANTS_OFFSET "]" NL
 231     "ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
 232     "vldr.f32 " POSITION_Y_S ", [" PAINTING_DATA_R ", #" YSTART_OFFSET "]"  NL
 233     "ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
 234     "ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
 235     "ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
 236     "ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
 237     "ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL
 238
 239     // Load all data to the SIMD registers with the least number of instructions.
         // r0 walks the float arguments (d12-d21), r1 walks the constant table (d22-d27).
 240     "vld1.f32 { " READ1_RANGE " }, [r0]!" NL
 241     "vld1.f32 { " READ2_RANGE " }, [r0]!" NL
 242     "vld1.f32 { " READ3_RANGE " }, [r0]!" NL
 243     "vld1.s16 {" READ4_RANGE "}, [r1]!" NL
 244     "vld1.s16 {" READ5_RANGE "}, [r1]!" NL
 245
 246     // Initializing local variables.
         // SCANLINE_R = WIDTH_R * 4 + 8 is the byte distance between two rows; PIXELS_R
         // is advanced by one row plus 3 bytes, i.e. to the byte that the .scanline
         // loop reads as the alpha value.
         // NOTE(review): the "+ 8" presumably accounts for one 4-byte border pixel on
         // each side of the row - confirm against the caller.
 247     "mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
 248     "add " SCANLINE_R ", " SCANLINE_R ", #8" NL
 249     "add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
 250     "add " PIXELS_R ", " PIXELS_R ", #3" NL
 251     "mov r0, #0" NL
 252     "vmov.f32 " CONST_ZERO_S ", r0" NL
         // NOTE(review): no conditional instruction reads the flags set by this tst
         // before the subs in .scanline overwrites them; the copy below is
         // unconditional. Confirm whether the tst is a leftover.
 253     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
 254     "vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
         // RESET_WIDTH_R is the same register as PAINTING_DATA_R; the painting data
         // pointer is not used after this point.
 255     "mov " RESET_WIDTH_R ", " WIDTH_R NL
 256
 257 ".mainLoop:" NL
         // Start of a row: three columns of alpha values are loaded before the
         // first pixel is processed, and the x position restarts at CONST_ONE_S.
 258     "mov r3, #3" NL
 259     "vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL
 260
 261 ".scanline:" NL
 262     // The ROW registers are storing the alpha channel of the last three pixels.
 263     // The alpha channel is stored as signed short (sint16) values. The fourth value
 264     // is garbage. The following instructions are shifting out the unnecessary alpha
 265     // values and load the next ones.
 266     "ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
 267     "ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
 268     "ldrb r2, [" PIXELS_R "], #4" NL
 269     "vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
 270     "vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
 271     "vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
 272     "vmov.s16 " TOP_ROW_D "[1], r0" NL
 273     "vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
 274     "vmov.s16 " BOTTOM_ROW_D "[1], r1" NL
 275
 276     // The two border pixels (rightmost and leftmost) are skipped when
 277     // the next scanline is reached. It also jumps, when the algorithm
 278     // is started, and the first free alpha values are loaded to each row.
 279     "subs r3, r3, #1" NL
 280     "bne .scanline" NL
 281
 282     // The light vector goes to TMP1_Q. It is constant in case of distant light.
 283     // The fourth value contains the length of the light vector.
 284     "tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT | FLAG_SPOT_LIGHT) NL
 285     "beq .distantLight" NL
 286
         // z position = alpha of the pixel held in lane 2 of the middle row
         // (the pixel being processed), scaled by SCALE_S.
 287     "vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
 288     "vmov.f32 " POSITION_Z_S ", r3" NL
 289     "vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
 290     "vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL
 291
 292     "vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
 293     GET_LENGTH(TMP1, TMP2)
 294
 295     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
 296     "bne .cosineOfAngle" NL
 297 ".visiblePixel:" NL
 298
 299     //     | -1  0  1 |      | -1 -2 -1 |
 300     // X = | -2  0  2 |  Y = |  0  0  0 |
 301     //     | -1  0  1 |      |  1  2  1 |
 302
 303     // Multiply the alpha values by the X and Y matrices.
 304
 305     // Moving the 8 alpha value to TMP3.
 306     "vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
 307     "vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL
 308
         // Each product vector is reduced to a single sum by three pairwise adds.
 309     "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
 310     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
 311     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
 312     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
 313     "vmov.s16 r0, " TMP2_D0 "[0]" NL
 314
 315     "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
 316     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
 317     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
 318     "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
 319     "vmov.s16 r1, " TMP2_D0 "[0]" NL
 320
 321     // r0 and r1 contains the X and Y coordinates of the
 322     // normal vector, respectively.
 323
 324     // Calculating the spot light strength.
 325     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
 326     "beq .endLight" NL
 327
         // TMP3_S1 accumulates the spot light strength, starting from the negated
         // cosine computed in .cosineOfAngle.
 328     "vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
 329     "tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
 330     "beq .coneExpPowf" NL
 331 ".coneExpPowfFinished:" NL
 332
 333     // Smoothing the cone edge if necessary.
 334     "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
 335     "fmstat" NL
 336     "bhi .cutOff" NL
 337 ".cutOffFinished:" NL
 338
         // Clamp the strength (upper lane of TMP3_D0) to at most CONST_ONE_S and
         // scale the saved light color by it.
 339     "vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
 340     "vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL
 341
 342 ".endLight:" NL
 343     // Summarize:
 344     // r0 and r1 contains the normalVector.
 345     // TMP1_Q contains the light vector and its length.
 346     // COLOR_Q contains the color of the light vector.
 347
 348     // Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
 349     "orrs r2, r0, r1" NL
 350     "bne .normalVectorIsNonZero" NL
 351
 352     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
 353     "bne .specularLight1" NL
 354
 355     // Calculate diffuse light strength.
 356     MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
 357     "b .lightStrengthCalculated" NL
 358
 359 ".specularLight1:" NL
 360     // Calculating specular light strength.
 361     "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
 362     GET_LENGTH(TMP1, TMP2)
 363
 364     // When the exponent is 1, we don't need to call an expensive powf function.
 365     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
 366     "vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
 367     "beq .specularExpPowf" NL
 368
 369     MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
 370     "b .lightStrengthCalculated" NL
 371
 372 ".normalVectorIsNonZero:" NL
 373     // Normal vector goes to TMP2, and its length is calculated as well.
 374     "vmov.s32 " TMP2_S0 ", r0" NL
 375     "vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
 376     "vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
 377     "vmov.s32 " TMP2_S1 ", r1" NL
 378     "vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
 379     "vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
 380     "vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
 381     GET_LENGTH(TMP2, TMP3)
 382
 383     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
 384     "bne .specularLight2" NL
 385
 386     // Calculating diffuse light strength.
 387     DOT_PRODUCT(TMP3, TMP2, TMP1)
 388     MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
 389     "b .lightStrengthCalculated" NL
 390
 391 ".specularLight2:" NL
 392     // Calculating specular light strength.
 393     "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
 394     GET_LENGTH(TMP1, TMP3)
 395     DOT_PRODUCT(TMP3, TMP2, TMP1)
 396
 397     // When the exponent is 1, we don't need to call an expensive powf function.
 398     "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
 399     "vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
 400     "beq .specularExpPowf" NL
 401     MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
 402
 403 ".lightStrengthCalculated:" NL
 404     // TMP2_S1 contains the light strength. Clamp it to [0, 1]
 405     "vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
 406     "vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
 407     "vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
 408     "vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
 409     "vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
 410     // The color values are stored in-place.
         // PIXELS_R was post-incremented by 4 after reading the alpha byte of the
         // pixel to the right, so it is 11 bytes past the first byte of the pixel
         // being processed: #-11, #-10 and #-9 address its first three bytes.
 411     "strb r2, [" PIXELS_R ", #-11]" NL
 412     "strb r3, [" PIXELS_R ", #-10]" NL
 413     "vmov.u32 r2, " TMP3_S2 NL
 414     "strb r2, [" PIXELS_R ", #-9]" NL
 415
 416     // Continue to the next pixel.
 417 ".blackPixel:" NL
         // r3 = 1: only one new column of alpha values is needed for the next pixel.
 418     "vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
 419     "mov r3, #1" NL
 420     "subs " WIDTH_R ", " WIDTH_R ", #1" NL
 421     "bne .scanline" NL
 422
 423     // If the end of the scanline is reached, we continue
 424     // to the next scanline.
 425     "vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
 426     "mov " WIDTH_R ", " RESET_WIDTH_R NL
 427     "subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
 428     "bne .mainLoop" NL
 429
 430     // Return.
 431     "vldmia sp!, {d8-d15}" NL
 432     "ldmia sp!, {r4-r8, r10, r11, pc}" NL
 433
 434 ".distantLight:" NL
 435     // In case of distant light, the light vector is constant,
 436     // we simply copy it.
 437     "vmov.f32 " TMP1_Q ", " LIGHT_Q NL
 438     "b .visiblePixel" NL
 439
 440 ".cosineOfAngle:" NL
 441     // If the pixel is outside of the cone angle, it is simply a black pixel.
 442     DOT_PRODUCT(TMP3, TMP1, DIRECTION)
 443     "vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
 444     "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
 445     "fmstat" NL
 446     "bls .visiblePixel" NL
         // Zero the same three bytes that .lightStrengthCalculated would write:
         // a halfword store for the first two and a byte store for the third.
 447     "mov r0, #0" NL
 448     "strh r0, [" PIXELS_R ", #-11]" NL
 449     "strb r0, [" PIXELS_R ", #-9]" NL
 450     "b .blackPixel" NL
 451
 452 ".cutOff:" NL
 453     // Smoothing the light strength on the cone edge.
 454     "vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
 455     "vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
 456     "vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
 457     "b .cutOffFinished" NL
 458
 459 ".coneExpPowf:" NL
 460     POWF(TMP3_S1, CONE_EXPONENT_R)
 461     "b .coneExpPowfFinished" NL
 462
 463 ".specularExpPowf:" NL
         // TMP2_S1 is raised to the specular exponent and then, unless
         // FLAG_DIFFUSE_CONST_IS_1 is set, multiplied by DIFFUSE_CONST_S.
 464     POWF(TMP2_S1, SPECULAR_EXPONENT_R)
 465     "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
 466     "vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
 467     "b .lightStrengthCalculated" NL
 468 ); // NOLINT
 469
 470 int FELighting::getPowerCoefficients(float exponent)
 471 {
 472     // Calling a powf function from the assembly code would require to save
 473     // and reload a lot of NEON registers. Since the base is in range [0..1]
 474     // and only 8 bit precision is required, we use our own powf function.
 475     // This is probably not the best, but it uses only a few registers and
 476     // gives us enough precision (modifying the exponent field directly would
 477     // also be possible).
 478
 479     // First, we limit the exponent to maximum of 64, which gives us enough
 480     // precision. We split the exponent to an integer and fraction part,
 481     // since a^x = (a^y)*(a^z) where x = y+z. The integer exponent of the
 482     // power is estimated by square, and the fraction exponent of the power
 483     // is estimated by square root assembly instructions.
 484     int i, result;
 485
 486     if (exponent < 0)
 487         exponent = 1 / (-exponent);
 488
 489     if (exponent > 63.99)
 490         exponent = 63.99;
 491
 492     exponent /= 64;
 493     result = 0;
 494     for (i = 11; i >= 0; --i) {
 495         exponent *= 2;
 496         if (exponent >= 1) {
 497             result |= 1 << i;
 498             exponent -= 1;
 499         }
 500     }
 501     return result;
 502 }
 503
 504 } // namespace WebCore
 505
 506 #endif // CPU(ARM_NEON) && COMPILER(GCC)