source: trunk/src/gui/painting/qdrawhelper_neon.cpp@ 1166

Last change on this file since 1166 was 846, checked in by Dmitry A. Kuminov, 14 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

  • Property svn:eol-style set to native
File size: 35.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation ([email protected])
6**
7** This file is part of the QtGui module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include <private/qdrawhelper_p.h>
43#include <private/qblendfunctions_p.h>
44#include <private/qmath_p.h>
45
46#ifdef QT_HAVE_NEON
47
48#include <private/qdrawhelper_neon_p.h>
49#include <private/qpaintengine_raster_p.h>
50#include <arm_neon.h>
51
52QT_BEGIN_NAMESPACE
53
54void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
55{
56 const int epilogueSize = count % 16;
57 if (count >= 16) {
58 quint32 *const neonEnd = dest + count - epilogueSize;
59 register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
60 register uint32x4_t valueVector2 asm ("q1") = valueVector1;
61 while (dest != neonEnd) {
62 asm volatile (
63 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
64 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
65 : [DST]"+r" (dest)
66 : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
67 : "memory"
68 );
69 }
70 }
71
72 switch (epilogueSize)
73 {
74 case 15: *dest++ = value;
75 case 14: *dest++ = value;
76 case 13: *dest++ = value;
77 case 12: *dest++ = value;
78 case 11: *dest++ = value;
79 case 10: *dest++ = value;
80 case 9: *dest++ = value;
81 case 8: *dest++ = value;
82 case 7: *dest++ = value;
83 case 6: *dest++ = value;
84 case 5: *dest++ = value;
85 case 4: *dest++ = value;
86 case 3: *dest++ = value;
87 case 2: *dest++ = value;
88 case 1: *dest++ = value;
89 }
90}
91
92static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
93{
94 // result = (x + (x >> 8) + 0x80) >> 8
95
96 const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97 const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98 const uint16x8_t sum = vaddq_u16(temp, sum_part);
99
100 return vshrq_n_u16(sum, 8);
101}
102
103static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
104{
105 // t = qRound(x * alpha / 255.0)
106
107 const uint16x8_t t = vmulq_u16(x, alpha); // t
108 return qvdiv_255_u16(t, half);
109}
110
111static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
112{
113 // t = x * a + y * b
114
115 const uint16x8_t ta = vmulq_u16(x, a);
116 const uint16x8_t tb = vmulq_u16(y, b);
117
118 return qvdiv_255_u16(vaddq_u16(ta, tb), half);
119}
120
121static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
122{
123 const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
124 const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
125
126 const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
127
128 return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
129}
130
131extern "C" void
132pixman_composite_over_8888_0565_asm_neon (int32_t w,
133 int32_t h,
134 uint16_t *dst,
135 int32_t dst_stride,
136 uint32_t *src,
137 int32_t src_stride);
138
139extern "C" void
140pixman_composite_over_8888_8888_asm_neon (int32_t w,
141 int32_t h,
142 uint32_t *dst,
143 int32_t dst_stride,
144 uint32_t *src,
145 int32_t src_stride);
146
147extern "C" void
148pixman_composite_src_0565_8888_asm_neon (int32_t w,
149 int32_t h,
150 uint32_t *dst,
151 int32_t dst_stride,
152 uint16_t *src,
153 int32_t src_stride);
154
155extern "C" void
156pixman_composite_over_n_8_0565_asm_neon (int32_t w,
157 int32_t h,
158 uint16_t *dst,
159 int32_t dst_stride,
160 uint32_t src,
161 int32_t unused,
162 uint8_t *mask,
163 int32_t mask_stride);
164
165extern "C" void
166pixman_composite_scanline_over_asm_neon (int32_t w,
167 const uint32_t *dst,
168 const uint32_t *src);
169
170extern "C" void
171pixman_composite_src_0565_0565_asm_neon (int32_t w,
172 int32_t h,
173 uint16_t *dst,
174 int32_t dst_stride,
175 uint16_t *src,
176 int32_t src_stride);
177
178// qblendfunctions.cpp
179void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
180 const uchar *srcPixels, int sbpl,
181 int w, int h,
182 int const_alpha);
183
184void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
185 const uchar *srcPixels, int sbpl,
186 int w, int h,
187 int const_alpha)
188{
189 dbpl /= 4;
190 sbpl /= 2;
191
192 quint32 *dst = (quint32 *) destPixels;
193 quint16 *src = (quint16 *) srcPixels;
194
195 if (const_alpha != 256) {
196 quint8 a = (255 * const_alpha) >> 8;
197 quint8 ia = 255 - a;
198
199 while (h--) {
200 for (int x=0; x<w; ++x)
201 dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);
202 dst += dbpl;
203 src += sbpl;
204 }
205 return;
206 }
207
208 pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
209}
210
211// qblendfunctions.cpp
212void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
213 const uchar *src, int sbpl,
214 int w, int h,
215 int const_alpha);
216
217template <int N>
218static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
219{
220 if (N >= 2) {
221 ((quint32 *)dst)[0] = ((quint32 *)src)[0];
222 __builtin_prefetch(dst + dstride, 1, 0);
223 }
224 for (int i = 1; i < N/2; ++i)
225 ((quint32 *)dst)[i] = ((quint32 *)src)[i];
226 if (N & 1)
227 dst[N-1] = src[N-1];
228}
229
230template <int Width>
231static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
232{
233 union {
234 quintptr address;
235 quint16 *pointer;
236 } u;
237
238 u.pointer = dst;
239
240 if (u.address & 2) {
241 while (h--) {
242 // align dst
243 dst[0] = src[0];
244 if (Width > 1)
245 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
246 dst += dstride;
247 src += sstride;
248 }
249 } else {
250 while (h--) {
251 scanLineBlit16<Width>(dst, src, dstride);
252
253 dst += dstride;
254 src += sstride;
255 }
256 }
257}
258
259void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
260 const uchar *srcPixels, int sbpl,
261 int w, int h,
262 int const_alpha)
263{
264 // testing show that the default memcpy is faster for widths 150 and up
265 if (const_alpha != 256 || w >= 150) {
266 qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
267 return;
268 }
269
270 int dstride = dbpl / 2;
271 int sstride = sbpl / 2;
272
273 quint16 *dst = (quint16 *) destPixels;
274 quint16 *src = (quint16 *) srcPixels;
275
276 switch (w) {
277#define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
278 BLOCKBLIT(1);
279 BLOCKBLIT(2);
280 BLOCKBLIT(3);
281 BLOCKBLIT(4);
282 BLOCKBLIT(5);
283 BLOCKBLIT(6);
284 BLOCKBLIT(7);
285 BLOCKBLIT(8);
286 BLOCKBLIT(9);
287 BLOCKBLIT(10);
288 BLOCKBLIT(11);
289 BLOCKBLIT(12);
290 BLOCKBLIT(13);
291 BLOCKBLIT(14);
292 BLOCKBLIT(15);
293#undef BLOCKBLIT
294 default:
295 break;
296 }
297
298 pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
299}
300
301extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
302
303void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
304 const uchar *srcPixels, int sbpl,
305 int w, int h,
306 int const_alpha)
307{
308 quint16 *dst = (quint16 *) destPixels;
309 quint32 *src = (quint32 *) srcPixels;
310
311 if (const_alpha != 256) {
312 for (int y=0; y<h; ++y) {
313 int i = 0;
314 for (; i < w-7; i += 8)
315 blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
316
317 if (i < w) {
318 int tail = w - i;
319
320 quint16 dstBuffer[8];
321 quint32 srcBuffer[8];
322
323 for (int j = 0; j < tail; ++j) {
324 dstBuffer[j] = dst[i + j];
325 srcBuffer[j] = src[i + j];
326 }
327
328 blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
329
330 for (int j = 0; j < tail; ++j) {
331 dst[i + j] = dstBuffer[j];
332 src[i + j] = srcBuffer[j];
333 }
334 }
335
336 dst = (quint16 *)(((uchar *) dst) + dbpl);
337 src = (quint32 *)(((uchar *) src) + sbpl);
338 }
339 return;
340 }
341
342 pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
343}
344
345void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
346{
347 if (const_alpha == 255) {
348 pixman_composite_scanline_over_asm_neon(length, dest, src);
349 } else {
350 qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
351 }
352}
353
354void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
355 const uchar *srcPixels, int sbpl,
356 int w, int h,
357 int const_alpha)
358{
359 const uint *src = (const uint *) srcPixels;
360 uint *dst = (uint *) destPixels;
361 uint16x8_t half = vdupq_n_u16(0x80);
362 uint16x8_t full = vdupq_n_u16(0xff);
363 if (const_alpha == 256) {
364 pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
365 } else if (const_alpha != 0) {
366 const_alpha = (const_alpha * 255) >> 8;
367 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
368 for (int y = 0; y < h; ++y) {
369 int x = 0;
370 for (; x < w-3; x += 4) {
371 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
372 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
373 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
374
375 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
376 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
377
378 const uint8x8_t src8_low = vget_low_u8(src8);
379 const uint8x8_t dst8_low = vget_low_u8(dst8);
380
381 const uint8x8_t src8_high = vget_high_u8(src8);
382 const uint8x8_t dst8_high = vget_high_u8(dst8);
383
384 const uint16x8_t src16_low = vmovl_u8(src8_low);
385 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
386
387 const uint16x8_t src16_high = vmovl_u8(src8_high);
388 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
389
390 const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
391 const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
392
393 const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
394 const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
395
396 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
397 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
398
399 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
400 }
401 }
402 for (; x<w; ++x) {
403 uint s = src[x];
404 if (s != 0) {
405 s = BYTE_MUL(s, const_alpha);
406 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
407 }
408 }
409 dst = (quint32 *)(((uchar *) dst) + dbpl);
410 src = (const quint32 *)(((const uchar *) src) + sbpl);
411 }
412 }
413}
414
415// qblendfunctions.cpp
416void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
417 const uchar *srcPixels, int sbpl,
418 int w, int h,
419 int const_alpha);
420
421void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
422 const uchar *srcPixels, int sbpl,
423 int w, int h,
424 int const_alpha)
425{
426 if (const_alpha != 256) {
427 if (const_alpha != 0) {
428 const uint *src = (const uint *) srcPixels;
429 uint *dst = (uint *) destPixels;
430 uint16x8_t half = vdupq_n_u16(0x80);
431 const_alpha = (const_alpha * 255) >> 8;
432 int one_minus_const_alpha = 255 - const_alpha;
433 uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
434 uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
435 for (int y = 0; y < h; ++y) {
436 int x = 0;
437 for (; x < w-3; x += 4) {
438 uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
439 uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
440
441 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
442 const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
443
444 const uint8x8_t src8_low = vget_low_u8(src8);
445 const uint8x8_t dst8_low = vget_low_u8(dst8);
446
447 const uint8x8_t src8_high = vget_high_u8(src8);
448 const uint8x8_t dst8_high = vget_high_u8(dst8);
449
450 const uint16x8_t src16_low = vmovl_u8(src8_low);
451 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
452
453 const uint16x8_t src16_high = vmovl_u8(src8_high);
454 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
455
456 const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
457 const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
458
459 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
460 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
461
462 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
463 }
464 for (; x<w; ++x) {
465 uint s = src[x];
466 s = BYTE_MUL(s, const_alpha);
467 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
468 }
469 dst = (quint32 *)(((uchar *) dst) + dbpl);
470 src = (const quint32 *)(((const uchar *) src) + sbpl);
471 }
472 }
473 } else {
474 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
475 }
476}
477
478void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
479 int x, int y, quint32 color,
480 const uchar *bitmap,
481 int mapWidth, int mapHeight, int mapStride,
482 const QClipData *)
483{
484 quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
485 const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
486
487 uchar *mask = const_cast<uchar *>(bitmap);
488
489 pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
490}
491
492extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
493
494template <typename SRC, typename BlendFunc>
495struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
496 Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
497 : m_index(0)
498 , m_blender(blender)
499 , m_const_alpha(const_alpha)
500 {
501 }
502
503 inline void write(quint16 *dst, quint32 src)
504 {
505 srcBuffer[m_index++] = src;
506
507 if (m_index == 8) {
508 m_blender(dst - 7, srcBuffer, m_const_alpha);
509 m_index = 0;
510 }
511 }
512
513 inline void flush(quint16 *dst)
514 {
515 if (m_index > 0) {
516 quint16 dstBuffer[8];
517 for (int i = 0; i < m_index; ++i)
518 dstBuffer[i] = dst[i - m_index];
519
520 m_blender(dstBuffer, srcBuffer, m_const_alpha);
521
522 for (int i = 0; i < m_index; ++i)
523 dst[i - m_index] = dstBuffer[i];
524
525 m_index = 0;
526 }
527 }
528
529 SRC srcBuffer[8];
530
531 int m_index;
532 BlendFunc m_blender;
533 int m_const_alpha;
534};
535
536template <typename SRC, typename BlendFunc>
537Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
538Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
539{
540 return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
541}
542
543void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
544 const uchar *srcPixels, int sbpl,
545 const QRectF &targetRect,
546 const QRectF &sourceRect,
547 const QRect &clip,
548 int const_alpha)
549{
550 if (const_alpha == 0)
551 return;
552
553 qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
554 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
555}
556
557void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
558 const uchar *srcPixels, int sbpl,
559 const QRectF &targetRect,
560 const QRectF &sourceRect,
561 const QRect &clip,
562 int const_alpha);
563
564void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
565 const uchar *srcPixels, int sbpl,
566 const QRectF &targetRect,
567 const QRectF &sourceRect,
568 const QRect &clip,
569 int const_alpha)
570{
571 if (const_alpha == 0)
572 return;
573
574 if (const_alpha == 256) {
575 qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
576 return;
577 }
578
579 qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
580 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
581}
582
583extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
584 const uchar *srcPixels, int sbpl,
585 const QRectF &targetRect,
586 const QRectF &sourceRect,
587 const QRect &clip,
588 const QTransform &targetRectTransform,
589 int const_alpha);
590
591void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
592 const uchar *srcPixels, int sbpl,
593 const QRectF &targetRect,
594 const QRectF &sourceRect,
595 const QRect &clip,
596 const QTransform &targetRectTransform,
597 int const_alpha)
598{
599 if (const_alpha == 0)
600 return;
601
602 if (const_alpha == 256) {
603 qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
604 return;
605 }
606
607 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
608 reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
609 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
610}
611
612void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
613 const uchar *srcPixels, int sbpl,
614 const QRectF &targetRect,
615 const QRectF &sourceRect,
616 const QRect &clip,
617 const QTransform &targetRectTransform,
618 int const_alpha)
619{
620 if (const_alpha == 0)
621 return;
622
623 qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
624 reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
625 Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
626}
627
// Converts eight r5g6b5 pixels read from src into eight 32-bit pixels
// written to dst, with the alpha byte of every output pixel set to 0xff.
//
// Exactly eight pixels are always read and written, regardless of how many
// the caller actually needs, so both buffers must hold at least eight
// elements. The asm uses r2 and d0-d5 as scratch; all are listed as clobbers.
628static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
629{
630 asm volatile (
// load the eight 16-bit source pixels into d0/d1 (together: q0)
631 "vld1.16 { d0, d1 }, [%[SRC]]\n\t"
632
633 /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
634 and put data into d4 - red, d3 - green, d2 - blue */
635 "vshrn.u16 d4, q0, #8\n\t"
636 "vshrn.u16 d3, q0, #3\n\t"
637 "vsli.u16 q0, q0, #5\n\t"
638 "vsri.u8 d4, d4, #5\n\t"
639 "vsri.u8 d3, d3, #6\n\t"
640 "vshrn.u16 d2, q0, #2\n\t"
641
642 /* fill d5 - alpha with 0xff */
643 "mov r2, #255\n\t"
644 "vdup.8 d5, r2\n\t"
645
// interleaved store: each output pixel is the bytes d2, d3, d4, d5
// (blue, green, red, alpha) in that memory order
646 "vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
// no output operands: dst is written through memory, which the "memory"
// clobber accounts for
647 : : [DST]"r" (dst), [SRC]"r" (src)
648 : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
649 );
650}
651
652uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
653{
654 const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
655
656 int i = 0;
657 for (; i < length - 7; i += 8)
658 convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
659
660 if (i < length) {
661 quint16 srcBuffer[8];
662 quint32 dstBuffer[8];
663
664 int tail = length - i;
665 for (int j = 0; j < tail; ++j)
666 srcBuffer[j] = data[i + j];
667
668 convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
669
670 for (int j = 0; j < tail; ++j)
671 buffer[i + j] = dstBuffer[j];
672 }
673
674 return buffer;
675}
676
// Converts eight 32-bit pixels read from src into eight r5g6b5 pixels
// written to dst.
//
// Exactly eight pixels are always read and written, so both buffers must
// hold at least eight elements. d0-d3, q8, q9 and q14 are used as scratch;
// the clobber list names them through their d-register halves (d16-d19,
// d28, d29).
677static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
678{
679 asm volatile (
// de-interleaving load: byte 0 of every pixel goes to d0, byte 1 to d1,
// byte 2 to d2 and byte 3 to d3; d3 is not used afterwards
// NOTE(review): presumably d0/d1/d2 are blue/green/red, mirroring
// convert_8_pixels_rgb16_to_argb32 - confirm
680 "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"
681
682 /* convert to r5g6b5 and store it into {d28, d29} */
683 "vshll.u8 q14, d2, #8\n\t"
684 "vshll.u8 q8, d1, #8\n\t"
685 "vshll.u8 q9, d0, #8\n\t"
686 "vsri.u16 q14, q8, #5\n\t"
687 "vsri.u16 q14, q9, #11\n\t"
688
689 "vst1.16 { d28, d29 }, [%[DST]]"
// no output operands: dst is written through memory, which the "memory"
// clobber accounts for
690 : : [DST]"r" (dst), [SRC]"r" (src)
691 : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
692 );
693}
694
695void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
696{
697 quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
698
699 int i = 0;
700 for (; i < length - 7; i += 8)
701 convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
702
703 if (i < length) {
704 quint32 srcBuffer[8];
705 quint16 dstBuffer[8];
706
707 int tail = length - i;
708 for (int j = 0; j < tail; ++j)
709 srcBuffer[j] = buffer[i + j];
710
711 convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
712
713 for (int j = 0; j < tail; ++j)
714 data[i + j] = dstBuffer[j];
715 }
716}
717
718void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
719{
720 if ((const_alpha & qAlpha(color)) == 255) {
721 QT_MEMFILL_UINT(destPixels, length, color);
722 } else {
723 if (const_alpha != 255)
724 color = BYTE_MUL(color, const_alpha);
725
726 const quint32 minusAlphaOfColor = qAlpha(~color);
727 int x = 0;
728
729 uint32_t *dst = (uint32_t *) destPixels;
730 const uint32x4_t colorVector = vdupq_n_u32(color);
731 uint16x8_t half = vdupq_n_u16(0x80);
732 const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
733
734 for (; x < length-3; x += 4) {
735 uint32x4_t dstVector = vld1q_u32(&dst[x]);
736
737 const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
738
739 const uint8x8_t dst8_low = vget_low_u8(dst8);
740 const uint8x8_t dst8_high = vget_high_u8(dst8);
741
742 const uint16x8_t dst16_low = vmovl_u8(dst8_low);
743 const uint16x8_t dst16_high = vmovl_u8(dst8_high);
744
745 const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
746 const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
747
748 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
749 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
750
751 uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
752 uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
753 vst1q_u32(&dst[x], colorPlusBlendedPixels);
754 }
755
756 for (;x < length; ++x)
757 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
758 }
759}
760
761void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
762{
763 if (const_alpha == 255) {
764 uint *const end = dst + length;
765 uint *const neonEnd = end - 3;
766
767 while (dst < neonEnd) {
768 asm volatile (
769 "vld2.8 { d0, d1 }, [%[SRC]] !\n\t"
770 "vld2.8 { d2, d3 }, [%[DST]]\n\t"
771 "vqadd.u8 q0, q0, q1\n\t"
772 "vst2.8 { d0, d1 }, [%[DST]] !\n\t"
773 : [DST]"+r" (dst), [SRC]"+r" (src)
774 :
775 : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
776 );
777 }
778
779 while (dst != end) {
780 *dst = comp_func_Plus_one_pixel(*dst, *src);
781 ++dst;
782 ++src;
783 }
784 } else {
785 int x = 0;
786 const int one_minus_const_alpha = 255 - const_alpha;
787 const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
788 const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
789
790 const uint16x8_t half = vdupq_n_u16(0x80);
791 for (; x < length - 3; x += 4) {
792 const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
793 const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
794 uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
795 uint8x16_t result = vqaddq_u8(dst8, src8);
796
797 uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
798 uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
799
800 uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
801 uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
802
803 result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
804 result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
805
806 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
807 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
808 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
809 }
810
811 for (; x < length; ++x)
812 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
813 }
814}
815
816static const int tileSize = 32;
817
818extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
819
820void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
821{
822 const ushort *src = (const ushort *)srcPixels;
823 ushort *dest = (ushort *)destPixels;
824
825 sstride /= sizeof(ushort);
826 dstride /= sizeof(ushort);
827
828 const int pack = sizeof(quint32) / sizeof(ushort);
829 const int unaligned =
830 qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
831 const int restX = w % tileSize;
832 const int restY = (h - unaligned) % tileSize;
833 const int unoptimizedY = restY % pack;
834 const int numTilesX = w / tileSize + (restX > 0);
835 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
836
837 for (int tx = 0; tx < numTilesX; ++tx) {
838 const int startx = w - tx * tileSize - 1;
839 const int stopx = qMax(startx - tileSize, 0);
840
841 if (unaligned) {
842 for (int x = startx; x >= stopx; --x) {
843 ushort *d = dest + (w - x - 1) * dstride;
844 for (int y = 0; y < unaligned; ++y) {
845 *d++ = src[y * sstride + x];
846 }
847 }
848 }
849
850 for (int ty = 0; ty < numTilesY; ++ty) {
851 const int starty = ty * tileSize + unaligned;
852 const int stopy = qMin(starty + tileSize, h - unoptimizedY);
853
854 int x = startx;
855 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
856 for (; x >= stopx + 7; x -= 8) {
857 ushort *d = dest + (w - x - 1) * dstride + starty;
858 const ushort *s = &src[starty * sstride + x - 7];
859 qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
860 }
861
862 for (; x >= stopx; --x) {
863 quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
864 for (int y = starty; y < stopy; y += pack) {
865 quint32 c = src[y * sstride + x];
866 for (int i = 1; i < pack; ++i) {
867 const int shift = (sizeof(int) * 8 / pack * i);
868 const ushort color = src[(y + i) * sstride + x];
869 c |= color << shift;
870 }
871 *d++ = c;
872 }
873 }
874 }
875
876 if (unoptimizedY) {
877 const int starty = h - unoptimizedY;
878 for (int x = startx; x >= stopx; --x) {
879 ushort *d = dest + (w - x - 1) * dstride + starty;
880 for (int y = starty; y < h; ++y) {
881 *d++ = src[y * sstride + x];
882 }
883 }
884 }
885 }
886}
887
888extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
889
890void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
891 int sstride,
892 uchar *destPixels, int dstride)
893{
894 const ushort *src = (const ushort *)srcPixels;
895 ushort *dest = (ushort *)destPixels;
896
897 sstride /= sizeof(ushort);
898 dstride /= sizeof(ushort);
899
900 const int pack = sizeof(quint32) / sizeof(ushort);
901 const int unaligned =
902 qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
903 const int restX = w % tileSize;
904 const int restY = (h - unaligned) % tileSize;
905 const int unoptimizedY = restY % pack;
906 const int numTilesX = w / tileSize + (restX > 0);
907 const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
908
909 for (int tx = 0; tx < numTilesX; ++tx) {
910 const int startx = tx * tileSize;
911 const int stopx = qMin(startx + tileSize, w);
912
913 if (unaligned) {
914 for (int x = startx; x < stopx; ++x) {
915 ushort *d = dest + x * dstride;
916 for (int y = h - 1; y >= h - unaligned; --y) {
917 *d++ = src[y * sstride + x];
918 }
919 }
920 }
921
922 for (int ty = 0; ty < numTilesY; ++ty) {
923 const int starty = h - 1 - unaligned - ty * tileSize;
924 const int stopy = qMax(starty - tileSize, unoptimizedY);
925
926 int x = startx;
927 // qt_rotate90_16_neon writes to eight rows, four pixels at a time
928 for (; x < stopx - 7; x += 8) {
929 ushort *d = dest + x * dstride + h - 1 - starty;
930 const ushort *s = &src[starty * sstride + x];
931 qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
932 }
933
934 for (; x < stopx; ++x) {
935 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
936 + h - 1 - starty);
937 for (int y = starty; y > stopy; y -= pack) {
938 quint32 c = src[y * sstride + x];
939 for (int i = 1; i < pack; ++i) {
940 const int shift = (sizeof(int) * 8 / pack * i);
941 const ushort color = src[(y - i) * sstride + x];
942 c |= color << shift;
943 }
944 *d++ = c;
945 }
946 }
947 }
948 if (unoptimizedY) {
949 const int starty = unoptimizedY - 1;
950 for (int x = startx; x < stopx; ++x) {
951 ushort *d = dest + x * dstride + h - 1 - starty;
952 for (int y = starty; y >= 0; --y) {
953 *d++ = src[y * sstride + x];
954 }
955 }
956 }
957 }
958}
959
960QT_END_NAMESPACE
961
962#endif // QT_HAVE_NEON
963
Note: See TracBrowser for help on using the repository browser.