Changeset 846 for trunk/src/gui/painting/qdrawhelper_sse2.cpp
- Timestamp: May 5, 2011, 5:36:53 AM
- Location: trunk
- Files:
  - . (modified) (1 prop)
  - src/gui/painting/qdrawhelper_sse2.cpp (modified) (10 diffs)
Legend: in the diff below, lines prefixed with "-" were removed, lines prefixed with "+" were added, and unprefixed lines are unchanged context.
trunk
- Property svn:mergeinfo changed
  - /branches/vendor/nokia/qt/4.7.2 (added): merged 845
  - /branches/vendor/nokia/qt/current: merged 844
  - /branches/vendor/nokia/qt/4.6.3: removed
trunk/src/gui/painting/qdrawhelper_sse2.cpp (r769 → r846)

 /****************************************************************************
 **
-** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
+** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
 ** All rights reserved.
 ** Contact: Nokia Corporation (qt-info@nokia.com)
…
 #ifdef QT_HAVE_SSE2

+#include <private/qdrawingprimitive_sse2_p.h>
 #include <private/qpaintengine_raster_p.h>

-#ifdef QT_LINUXBASE
-// this is an evil hack - the posix_memalign declaration in LSB
-// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431
-#  define posix_memalign _lsb_hack_posix_memalign
-#  include <emmintrin.h>
-#  undef posix_memalign
-#else
-#  include <emmintrin.h>
-#endif
-
 QT_BEGIN_NAMESPACE

-/*
- * Multiply the components of pixelVector by alphaChannel
- * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
- * colorMask must have 0x00ff00ff on each 32 bits component
- * half must have the value 128 (0x80) for each 32 bits compnent
- */
-#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
-{ \
-    /* 1. separate the colors in 2 vectors so each color is on 16 bits \
-       (in order to be multiplied by the alpha \
-       each 32 bit of dstVectorAG are in the form 0x00AA00GG \
-       each 32 bit of dstVectorRB are in the form 0x00RR00BB */\
-    __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \
-    __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \
- \
-    /* 2. multiply the vectors by the alpha channel */\
-    pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \
-    pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \
- \
-    /* 3. devide by 255, that's the tricky part. \
-       we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \
-    /** so first (X + X/256 + rounding) */\
-    pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \
-    pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \
-    pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \
-    pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \
- \
-    /** second devide by 256 */\
-    pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \
-    /** for AG, we could >> 8 to divide followed by << 8 to put the \
-        bytes in the correct position. By masking instead, we execute \
-        only one instruction */\
-    pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \
- \
-    /* 4. combine the 2 pairs of colors */ \
-    result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \
-}
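The comment in the removed BYTE_MUL_SSE2 macro calls the division by 255 "the tricky part": it is approximated as X/255 ~= (X + X/256 + 128)/256, which only needs shifts and adds per 16-bit lane. The following standalone scalar sketch is not part of this changeset and not Qt code (mulDiv255 is an illustrative name); it shows the same trick on a single 8-bit channel and measures how far it strays from exact rounded division:

    // Scalar sketch of the /255 approximation used by the SSE2 macro above.
    #include <cstdio>

    // Multiply an 8-bit channel by an 8-bit alpha and scale back to 0..255.
    static unsigned mulDiv255(unsigned channel, unsigned alpha)
    {
        unsigned x = channel * alpha;   // 0 .. 255*255
        x += (x >> 8) + 0x80;           // X + X/256 + rounding
        return x >> 8;                  // divide by 256 instead of 255
    }

    int main()
    {
        // Compare against exact rounded division over all 8-bit inputs.
        unsigned worst = 0;
        for (unsigned c = 0; c <= 255; ++c) {
            for (unsigned a = 0; a <= 255; ++a) {
                unsigned exact = (c * a + 127) / 255;
                unsigned approx = mulDiv255(c, a);
                unsigned diff = exact > approx ? exact - approx : approx - exact;
                if (diff > worst)
                    worst = diff;
            }
        }
        std::printf("largest deviation from exact rounding: %u\n", worst);
        return 0;
    }

Over all 65,536 channel/alpha pairs the deviation never exceeds one, which is why the macro's comment is content with "~=".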
-/*
- * Each 32bits components of alphaChannel must be in the form 0x00AA00AA
- * oneMinusAlphaChannel must be 255 - alpha for each 32 bits component
- * colorMask must have 0x00ff00ff on each 32 bits component
- * half must have the value 128 (0x80) for each 32 bits compnent
- */
-#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
-    /* interpolate AG */\
-    __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \
-    __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \
-    __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \
-    __m128i dstVectorAGoneMinusAlphalpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \
-    __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha); \
-    finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \
-    finalAG = _mm_add_epi16(finalAG, half); \
-    finalAG = _mm_andnot_si128(colorMask, finalAG); \
- \
-    /* interpolate RB */\
-    __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \
-    __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \
-    __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \
-    __m128i dstVectorRBoneMinusAlphalpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \
-    __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha); \
-    finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \
-    finalRB = _mm_add_epi16(finalRB, half); \
-    finalRB = _mm_srli_epi16(finalRB, 8); \
- \
-    /* combine */\
-    result = _mm_or_si128(finalAG, finalRB); \
-}
-
 void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
…
 {
     const quint32 *src = (const quint32 *) srcPixels;
-    quint32 *dst = (uint *) destPixels;
+    quint32 *dst = (quint32 *) destPixels;
     if (const_alpha == 256) {
         const __m128i alphaMask = _mm_set1_epi32(0xff000000);
…
         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
         for (int y = 0; y < h; ++y) {
-            int x = 0;
-            for (; x < w-3; x += 4) {
-                const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
-                const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
-                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
-                    // all opaque
-                    _mm_storeu_si128((__m128i *)&dst[x], srcVector);
-                } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
-                    // not fully transparent
-                    // result = s + d * (1-alpha)
-
-                    // extract the alpha channel on 2 x 16 bits
-                    // so we have room for the multiplication
-                    // each 32 bits will be in the form 0x00AA00AA
-                    // with A being the 1 - alpha
-                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
-                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
-                    alphaChannel = _mm_sub_epi16(one, alphaChannel);
-
-                    const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
-                    __m128i destMultipliedByOneMinusAlpha;
-                    BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
-
-                    // result = s + d * (1-alpha)
-                    const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
-                    _mm_storeu_si128((__m128i *)&dst[x], result);
-                }
-            }
-            for (; x<w; ++x) {
-                uint s = src[x];
-                if (s >= 0xff000000)
-                    dst[x] = s;
-                else if (s != 0)
-                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
-            }
+            BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, w, nullVector, half, one, colorMask, alphaMask);
             dst = (quint32 *)(((uchar *) dst) + dbpl);
             src = (const quint32 *)(((const uchar *) src) + sbpl);
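The removed loop above implements source-over on premultiplied ARGB32, result = src + dst * (1 - src_alpha), with fast paths for fully opaque and fully transparent source pixels; the new BLEND_SOURCE_OVER_ARGB32_SSE2 macro presumably packages the same logic. Below is a minimal scalar sketch of that rule, not the Qt implementation: sourceOverSpan is an illustrative name, and byteMulPixel stands in for what Qt's BYTE_MUL helper does.

    // Scalar sketch of premultiplied source-over, assuming ARGB32 pixel layout.
    #include <cstdint>

    // Multiply all four 8-bit channels of a premultiplied ARGB32 pixel by an
    // 8-bit factor, two channels at a time (the same channel-pair trick the
    // SSE2 code applies on 16-bit lanes).
    static std::uint32_t byteMulPixel(std::uint32_t pixel, std::uint32_t factor)
    {
        std::uint32_t rb = (pixel & 0x00ff00ffu) * factor;          // red and blue
        rb = (rb + ((rb >> 8) & 0x00ff00ffu) + 0x00800080u) >> 8;
        std::uint32_t ag = ((pixel >> 8) & 0x00ff00ffu) * factor;   // alpha and green
        ag = ag + ((ag >> 8) & 0x00ff00ffu) + 0x00800080u;
        return (ag & 0xff00ff00u) | (rb & 0x00ff00ffu);
    }

    static void sourceOverSpan(std::uint32_t *dst, const std::uint32_t *src, int length)
    {
        for (int x = 0; x < length; ++x) {
            const std::uint32_t s = src[x];
            if (s >= 0xff000000u)          // source fully opaque: plain copy
                dst[x] = s;
            else if (s != 0)               // skip fully transparent sources
                dst[x] = s + byteMulPixel(dst[x], 255 - (s >> 24));
        }
    }

    int main()
    {
        std::uint32_t dst[2] = { 0xff0000ffu, 0xff00ff00u };        // opaque blue, opaque green
        const std::uint32_t src[2] = { 0x80400000u, 0x00000000u };  // half-covered red (premultiplied), transparent
        sourceOverSpan(dst, src, 2);                                // dst[1] is left untouched
        return 0;
    }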
…
         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
         for (int y = 0; y < h; ++y) {
-            int x = 0;
-            for (; x < w-3; x += 4) {
-                __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
-                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
-                    BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half);
-
-                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
-                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
-                    alphaChannel = _mm_sub_epi16(one, alphaChannel);
-
-                    const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
-                    __m128i destMultipliedByOneMinusAlpha;
-                    BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
-
-                    const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
-                    _mm_storeu_si128((__m128i *)&dst[x], result);
-                }
-            }
-            for (; x<w; ++x) {
-                quint32 s = src[x];
-                if (s != 0) {
-                    s = BYTE_MUL(s, const_alpha);
-                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
-                }
-            }
+            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
             dst = (quint32 *)(((uchar *) dst) + dbpl);
             src = (const quint32 *)(((const uchar *) src) + sbpl);
…
 {
     const quint32 *src = (const quint32 *) srcPixels;
-    quint32 *dst = (uint *) destPixels;
+    quint32 *dst = (quint32 *) destPixels;
     if (const_alpha != 256) {
         if (const_alpha != 0) {
…
             for (int y = 0; y < h; ++y) {
                 int x = 0;
…
                 for (; x < w-3; x += 4) {
                     __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
                     if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
-                        const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
+                        const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                         __m128i result;
                         INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
-                        _mm_storeu_si128((__m128i *)&dst[x], result);
+                        _mm_store_si128((__m128i *)&dst[x], result);
                     }
                 }
…
 }
…
 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
 {
…
     int n = (count128 + 3) / 4;
     switch (count128 & 0x3) {
-    case 0: do { _mm_store_si128(dst128++, value128);
-    case 3:      _mm_store_si128(dst128++, value128);
-    case 2:      _mm_store_si128(dst128++, value128);
-    case 1:      _mm_store_si128(dst128++, value128);
+    case 0: do { _mm_stream_si128(dst128++, value128);
+    case 3:      _mm_stream_si128(dst128++, value128);
+    case 2:      _mm_stream_si128(dst128++, value128);
+    case 1:      _mm_stream_si128(dst128++, value128);
     } while (--n > 0);
     }
…
     }
 }
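The qt_memfill32_sse2 hunk above changes the unrolled fill loop from _mm_store_si128 to what appears to be a streaming store, and the rgb32-on-rgb32 hunk switches the destination accesses from unaligned to aligned 128-bit loads and stores. The sketch below, which is not the Qt function (fill32 is an illustrative name and the alignment prologue is written as a plain loop rather than Qt's unrolled form), shows the underlying pattern: fill leading pixels one by one until the destination is 16-byte aligned, issue aligned 128-bit stores, then finish the tail scalar-wise.

    // Sketch of an aligned SSE2 fill, assuming dest is at least 4-byte aligned.
    #include <emmintrin.h>
    #include <cstdint>

    static void fill32(std::uint32_t *dest, std::uint32_t value, int count)
    {
        // Scalar prologue: advance until dest sits on a 16-byte boundary.
        while (count > 0 && (reinterpret_cast<std::uintptr_t>(dest) & 0xf)) {
            *dest++ = value;
            --count;
        }

        // Aligned 128-bit stores, four pixels at a time. A streaming store
        // (_mm_stream_si128) would bypass the cache, which can help for
        // large fills at the cost of the data not being cache-hot afterwards.
        const __m128i value128 = _mm_set1_epi32(int(value));
        __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
        for (; count >= 4; count -= 4)
            _mm_store_si128(dst128++, value128);

        // Scalar tail for the last 0..3 pixels.
        dest = reinterpret_cast<std::uint32_t *>(dst128);
        while (count-- > 0)
            *dest++ = value;
    }

    int main()
    {
        std::uint32_t buffer[64];
        fill32(buffer, 0xff336699u, 64);
        return buffer[0] == buffer[63] ? 0 : 1;
    }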
…

 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
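For reference, the removed INTERPOLATE_PIXEL_255_SSE2 macro computes result = (src * alpha + dst * (255 - alpha)) / 255 on the AG and RB channel pairs, which is the blend the const_alpha path of qt_blend_rgb32_on_rgb32_sse2 performs per pixel. Here is a standalone scalar sketch of that computation, again not Qt's own INTERPOLATE_PIXEL_255 (interpolate255 is an illustrative name), but using the same channel-pair layout and /255 approximation as above:

    // Scalar sketch of constant-alpha interpolation between two ARGB32 pixels.
    #include <cstdint>

    // alpha + oneMinusAlpha is expected to be 255.
    static std::uint32_t interpolate255(std::uint32_t src, unsigned alpha,
                                        std::uint32_t dst, unsigned oneMinusAlpha)
    {
        // RB pair: bits 0-7 (blue) and 16-23 (red).
        std::uint32_t rb = (src & 0x00ff00ffu) * alpha + (dst & 0x00ff00ffu) * oneMinusAlpha;
        rb += (rb >> 8) & 0x00ff00ffu;
        rb += 0x00800080u;
        rb = (rb >> 8) & 0x00ff00ffu;

        // AG pair: bits 8-15 (green) and 24-31 (alpha).
        std::uint32_t ag = ((src >> 8) & 0x00ff00ffu) * alpha
                         + ((dst >> 8) & 0x00ff00ffu) * oneMinusAlpha;
        ag += (ag >> 8) & 0x00ff00ffu;
        ag += 0x00800080u;
        ag &= 0xff00ff00u;

        return ag | rb;
    }

    int main()
    {
        // Blend a grey source over a blue destination with constant alpha 128.
        return interpolate255(0xff808080u, 128, 0xff0000ffu, 127) ? 0 : 1;
    }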