Ignore:
Timestamp:
May 5, 2011, 5:36:53 AM (15 years ago)
Author:
Dmitry A. Kuminov
Message:

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk

  • trunk/src/gui/painting/qdrawhelper_sse2.cpp

    r769 r846  
    11/****************************************************************************
    22**
    3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
     3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
    44** All rights reserved.
    55** Contact: Nokia Corporation ([email protected])
     
    4444#ifdef QT_HAVE_SSE2
    4545
     46
    4647#include <private/qpaintengine_raster_p.h>
    4748
    48 #ifdef QT_LINUXBASE
    49 // this is an evil hack - the posix_memalign declaration in LSB
    50 // is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431
    51 #  define posix_memalign _lsb_hack_posix_memalign
    52 #  include <emmintrin.h>
    53 #  undef posix_memalign
    54 #else
    55 #  include <emmintrin.h>
    56 #endif
    57 
    5849QT_BEGIN_NAMESPACE
    59 
    60 /*
    61  * Multiply the components of pixelVector by alphaChannel
    62  * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
    63  * colorMask must have 0x00ff00ff on each 32 bits component
    64  * half must have the value 128 (0x80) for each 32 bits compnent
    65  */
    66 #define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
    67 { \
    68     /* 1. separate the colors in 2 vectors so each color is on 16 bits \
    69        (in order to be multiplied by the alpha \
    70        each 32 bit of dstVectorAG are in the form 0x00AA00GG \
    71        each 32 bit of dstVectorRB are in the form 0x00RR00BB */\
    72     __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \
    73     __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \
    74  \
    75     /* 2. multiply the vectors by the alpha channel */\
    76     pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \
    77     pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \
    78  \
    79     /* 3. devide by 255, that's the tricky part. \
    80        we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \
    81     /** so first (X + X/256 + rounding) */\
    82     pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \
    83     pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \
    84     pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \
    85     pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \
    86  \
    87     /** second devide by 256 */\
    88     pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \
    89     /** for AG, we could >> 8 to divide followed by << 8 to put the \
    90         bytes in the correct position. By masking instead, we execute \
    91         only one instruction */\
    92     pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \
    93  \
    94     /* 4. combine the 2 pairs of colors */ \
    95     result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \
    96 }
    97 
    98 /*
    99  * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
    100  * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component
    101  * colorMask must have 0x00ff00ff on each 32 bits component
    102  * half must have the value 128 (0x80) for each 32 bits compnent
    103  */
    104 #define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
    105     /* interpolate AG */\
    106     __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \
    107     __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \
    108     __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \
    109     __m128i dstVectorAGoneMinusAlphalpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \
    110     __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); \
    111     finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \
    112     finalAG = _mm_add_epi16(finalAG, half); \
    113     finalAG = _mm_andnot_si128(colorMask, finalAG); \
    114  \
    115     /* interpolate RB */\
    116     __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \
    117     __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \
    118     __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \
    119     __m128i dstVectorRBoneMinusAlphalpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \
    120     __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); \
    121     finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \
    122     finalRB = _mm_add_epi16(finalRB, half); \
    123     finalRB = _mm_srli_epi16(finalRB, 8); \
    124  \
    125     /* combine */\
    126     result = _mm_or_si128(finalAG, finalRB); \
    127 }
    12850
    12951void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
     
    13355{
    13456    const quint32 *src = (const quint32 *) srcPixels;
    135     quint32 *dst = (uint *) destPixels;
     57    quint32 *dst = (quint32 *) destPixels;
    13658    if (const_alpha == 256) {
    13759        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
     
    14163        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
    14264        for (int y = 0; y < h; ++y) {
    143             int x = 0;
    144             for (; x < w-3; x += 4) {
    145                 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
    146                 const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
    147                 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
    148                     // all opaque
    149                     _mm_storeu_si128((__m128i *)&dst[x], srcVector);
    150                 } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
    151                     // not fully transparent
    152                     // result = s + d * (1-alpha)
    153 
    154                     // extract the alpha channel on 2 x 16 bits
    155                     // so we have room for the multiplication
    156                     // each 32 bits will be in the form 0x00AA00AA
    157                     // with A being the 1 - alpha
    158                     __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
    159                     alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
    160                     alphaChannel = _mm_sub_epi16(one, alphaChannel);
    161 
    162                     const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
    163                     __m128i destMultipliedByOneMinusAlpha;
    164                     BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
    165 
    166                     // result = s + d * (1-alpha)
    167                     const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
    168                     _mm_storeu_si128((__m128i *)&dst[x], result);
    169                 }
    170             }
    171             for (; x<w; ++x) {
    172                 uint s = src[x];
    173                 if (s >= 0xff000000)
    174                     dst[x] = s;
    175                 else if (s != 0)
    176                     dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
    177             }
     65            BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask);
    17866            dst = (quint32 *)(((uchar *) dst) + dbpl);
    17967            src = (const quint32 *)(((const uchar *) src) + sbpl);
     
    19078        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
    19179        for (int y = 0; y < h; ++y) {
    192             int x = 0;
    193             for (; x < w-3; x += 4) {
    194                 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
    195                 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
    196                     BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half);
    197 
    198                     __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
    199                     alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
    200                     alphaChannel = _mm_sub_epi16(one, alphaChannel);
    201 
    202                     const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
    203                     __m128i destMultipliedByOneMinusAlpha;
    204                     BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
    205 
    206                     const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
    207                     _mm_storeu_si128((__m128i *)&dst[x], result);
    208                 }
    209             }
    210             for (; x<w; ++x) {
    211                 quint32 s = src[x];
    212                 if (s != 0) {
    213                     s = BYTE_MUL(s, const_alpha);
    214                     dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
    215                 }
    216             }
     80            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
    21781            dst = (quint32 *)(((uchar *) dst) + dbpl);
    21882            src = (const quint32 *)(((const uchar *) src) + sbpl);
     
    23397{
    23498    const quint32 *src = (const quint32 *) srcPixels;
    235     quint32 *dst = (uint *) destPixels;
     99    quint32 *dst = (quint32 *) destPixels;
    236100    if (const_alpha != 256) {
    237101        if (const_alpha != 0) {
     
    246110            for (int y = 0; y < h; ++y) {
    247111                int x = 0;
     112
     113
     114
     115
     116
     117
     118
     119
    248120                for (; x < w-3; x += 4) {
    249121                    __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
    250122                    if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
    251                         const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
     123                        const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
    252124                        __m128i result;
    253125                        INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
    254                         _mm_storeu_si128((__m128i *)&dst[x], result);
     126                        _mm_store_si128((__m128i *)&dst[x], result);
    255127                    }
    256128                }
     
    269141}
    270142
     143
     144
     145
     146
     147
     148
     149
     150
     151
     152
     153
     154
     155
     156
     157
     158
     159
     160
     161
     162
     163
     164
     165
     166
     167
     168
     169
     170
     171
     172
     173
     174
     175
     176
     177
     178
     179
     180
     181
     182
     183
     184
     185
     186
     187
     188
     189
     190
     191
     192
     193
     194
     195
     196
     197
     198
     199
     200
     201
     202
     203
     204
     205
     206
     207
     208
     209
     210
     211
     212
     213
     214
     215
     216
     217
     218
     219
     220
     221
     222
     223
     224
     225
     226
     227
     228
     229
     230
     231
     232
     233
     234
     235
     236
     237
     238
     239
     240
     241
    271242void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
    272243{
     
    296267    int n = (count128 + 3) / 4;
    297268    switch (count128 & 0x3) {
    298     case 0: do { _mm_store_si128(dst128++, value128);
    299     case 3:      _mm_store_si128(dst128++, value128);
    300     case 2:      _mm_store_si128(dst128++, value128);
    301     case 1:      _mm_store_si128(dst128++, value128);
     269    case 0: do { _mm_stream_si128(dst128++, value128);
     270    case 3:      _mm_stream_si128(dst128++, value128);
     271    case 2:      _mm_stream_si128(dst128++, value128);
     272    case 1:      _mm_stream_si128(dst128++, value128);
    302273    } while (--n > 0);
    303274    }
     
    312283    }
    313284}
     285
     286
     287
     288
     289
     290
     291
     292
     293
     294
     295
     296
     297
     298
     299
     300
     301
     302
     303
     304
     305
     306
     307
     308
     309
     310
     311
     312
     313
     314
     315
     316
     317
     318
     319
     320
     321
     322
     323
     324
     325
     326
     327
     328
     329
     330
     331
     332
     333
     334
     335
     336
     337
     338
     339
     340
     341
     342
     343
     344
     345
     346
     347
     348
     349
     350
     351
     352
     353
     354
     355
     356
     357
     358
     359
     360
     361
     362
     363
     364
     365
     366
     367
     368
     369
     370
     371
     372
     373
     374
     375
     376
     377
     378
     379
     380
     381
     382
     383
     384
     385
     386
     387
    314388
    315389void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
Note: See TracChangeset for help on using the changeset viewer.