Context Navigation

qdrawhelper_neon.cpp@ 1166

Last change on this file since 1166 was 846, checked in by Dmitry A. Kuminov, 14 years ago
trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.
Property svn:eol-style set to `native`
File size: 35.7 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4	** All rights reserved.
5	** Contact: Nokia Corporation ([email protected])
6	**
7	** This file is part of the QtGui module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial Usage
11	** Licensees holding valid Qt Commercial licenses may use this file in
12	** accordance with the Qt Commercial License Agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and Nokia.
15	**
16	** GNU Lesser General Public License Usage
17	** Alternatively, this file may be used under the terms of the GNU Lesser
18	** General Public License version 2.1 as published by the Free Software
19	** Foundation and appearing in the file LICENSE.LGPL included in the
20	** packaging of this file. Please review the following information to
21	** ensure the GNU Lesser General Public License version 2.1 requirements
22	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23	**
24	** In addition, as a special exception, Nokia gives you certain additional
25	** rights. These rights are described in the Nokia Qt LGPL Exception
26	** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you have questions regarding the use of this file, please contact
37	** Nokia at [email protected].
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include <private/qdrawhelper_p.h>
43	#include <private/qblendfunctions_p.h>
44	#include <private/qmath_p.h>
45
46	#ifdef QT_HAVE_NEON
47
48	#include <private/qdrawhelper_neon_p.h>
49	#include <private/qpaintengine_raster_p.h>
50	#include <arm_neon.h>
51
52	QT_BEGIN_NAMESPACE
53
54	void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
55	{
56	const int epilogueSize = count % 16;
57	if (count >= 16) {
58	quint32 *const neonEnd = dest + count - epilogueSize;
59	register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
60	register uint32x4_t valueVector2 asm ("q1") = valueVector1;
61	while (dest != neonEnd) {
62	asm volatile (
63	"vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
64	"vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
65	: [DST]"+r" (dest)
66	: [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
67	: "memory"
68	);
69	}
70	}
71
72	switch (epilogueSize)
73	{
74	case 15: *dest++ = value;
75	case 14: *dest++ = value;
76	case 13: *dest++ = value;
77	case 12: *dest++ = value;
78	case 11: *dest++ = value;
79	case 10: *dest++ = value;
80	case 9: *dest++ = value;
81	case 8: *dest++ = value;
82	case 7: *dest++ = value;
83	case 6: *dest++ = value;
84	case 5: *dest++ = value;
85	case 4: *dest++ = value;
86	case 3: *dest++ = value;
87	case 2: *dest++ = value;
88	case 1: *dest++ = value;
89	}
90	}
91
92	static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
93	{
94	// result = (x + (x >> 8) + 0x80) >> 8
95
96	const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97	const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98	const uint16x8_t sum = vaddq_u16(temp, sum_part);
99
100	return vshrq_n_u16(sum, 8);
101	}
102
103	static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
104	{
105	// t = qRound(x * alpha / 255.0)
106
107	const uint16x8_t t = vmulq_u16(x, alpha); // t
108	return qvdiv_255_u16(t, half);
109	}
110
111	static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
112	{
113	// t = x * a + y * b
114
115	const uint16x8_t ta = vmulq_u16(x, a);
116	const uint16x8_t tb = vmulq_u16(y, b);
117
118	return qvdiv_255_u16(vaddq_u16(ta, tb), half);
119	}
120
121	static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
122	{
123	const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
124	const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
125
126	const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
127
128	return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
129	}
130
// Prototypes of the pixman NEON routines called from this file. They have C
// linkage and are not defined here.
extern "C" {

void pixman_composite_over_8888_0565_asm_neon(int32_t w, int32_t h,
                                              uint16_t *dst, int32_t dst_stride,
                                              uint32_t *src, int32_t src_stride);

void pixman_composite_over_8888_8888_asm_neon(int32_t w, int32_t h,
                                              uint32_t *dst, int32_t dst_stride,
                                              uint32_t *src, int32_t src_stride);

void pixman_composite_src_0565_8888_asm_neon(int32_t w, int32_t h,
                                             uint32_t *dst, int32_t dst_stride,
                                             uint16_t *src, int32_t src_stride);

void pixman_composite_over_n_8_0565_asm_neon(int32_t w, int32_t h,
                                             uint16_t *dst, int32_t dst_stride,
                                             uint32_t src, int32_t unused,
                                             uint8_t *mask, int32_t mask_stride);

void pixman_composite_scanline_over_asm_neon(int32_t w,
                                             const uint32_t *dst,
                                             const uint32_t *src);

void pixman_composite_src_0565_0565_asm_neon(int32_t w, int32_t h,
                                             uint16_t *dst, int32_t dst_stride,
                                             uint16_t *src, int32_t src_stride);

}
177
178	// qblendfunctions.cpp
179	void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
180	const uchar *srcPixels, int sbpl,
181	int w, int h,
182	int const_alpha);
183
184	void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
185	const uchar *srcPixels, int sbpl,
186	int w, int h,
187	int const_alpha)
188	{
189	dbpl /= 4;
190	sbpl /= 2;
191
192	quint32 dst = (quint32 ) destPixels;
193	quint16 src = (quint16 ) srcPixels;
194
195	if (const_alpha != 256) {
196	quint8 a = (255 * const_alpha) >> 8;
197	quint8 ia = 255 - a;
198
199	while (h--) {
200	for (int x=0; x<w; ++x)
201	dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);
202	dst += dbpl;
203	src += sbpl;
204	}
205	return;
206	}
207
208	pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
209	}
210
211	// qblendfunctions.cpp
212	void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
213	const uchar *src, int sbpl,
214	int w, int h,
215	int const_alpha);
216
217	template <int N>
218	static inline void scanLineBlit16(quint16 dst, quint16 src, int dstride)
219	{
220	if (N >= 2) {
221	((quint32 )dst)[0] = ((quint32 )src)[0];
222	__builtin_prefetch(dst + dstride, 1, 0);
223	}
224	for (int i = 1; i < N/2; ++i)
225	((quint32 )dst)[i] = ((quint32 )src)[i];
226	if (N & 1)
227	dst[N-1] = src[N-1];
228	}
229
230	template <int Width>
231	static inline void blockBlit16(quint16 dst, quint16 src, int dstride, int sstride, int h)
232	{
233	union {
234	quintptr address;
235	quint16 *pointer;
236	} u;
237
238	u.pointer = dst;
239
240	if (u.address & 2) {
241	while (h--) {
242	// align dst
243	dst[0] = src[0];
244	if (Width > 1)
245	scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
246	dst += dstride;
247	src += sstride;
248	}
249	} else {
250	while (h--) {
251	scanLineBlit16<Width>(dst, src, dstride);
252
253	dst += dstride;
254	src += sstride;
255	}
256	}
257	}
258
259	void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
260	const uchar *srcPixels, int sbpl,
261	int w, int h,
262	int const_alpha)
263	{
264	// testing show that the default memcpy is faster for widths 150 and up
265	if (const_alpha != 256 \|\| w >= 150) {
266	qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
267	return;
268	}
269
270	int dstride = dbpl / 2;
271	int sstride = sbpl / 2;
272
273	quint16 dst = (quint16 ) destPixels;
274	quint16 src = (quint16 ) srcPixels;
275
276	switch (w) {
277	#define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
278	BLOCKBLIT(1);
279	BLOCKBLIT(2);
280	BLOCKBLIT(3);
281	BLOCKBLIT(4);
282	BLOCKBLIT(5);
283	BLOCKBLIT(6);
284	BLOCKBLIT(7);
285	BLOCKBLIT(8);
286	BLOCKBLIT(9);
287	BLOCKBLIT(10);
288	BLOCKBLIT(11);
289	BLOCKBLIT(12);
290	BLOCKBLIT(13);
291	BLOCKBLIT(14);
292	BLOCKBLIT(15);
293	#undef BLOCKBLIT
294	default:
295	break;
296	}
297
298	pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
299	}
300
301	extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 dst, const quint32 src, int const_alpha);
302
303	void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
304	const uchar *srcPixels, int sbpl,
305	int w, int h,
306	int const_alpha)
307	{
308	quint16 dst = (quint16 ) destPixels;
309	quint32 src = (quint32 ) srcPixels;
310
311	if (const_alpha != 256) {
312	for (int y=0; y<h; ++y) {
313	int i = 0;
314	for (; i < w-7; i += 8)
315	blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
316
317	if (i < w) {
318	int tail = w - i;
319
320	quint16 dstBuffer[8];
321	quint32 srcBuffer[8];
322
323	for (int j = 0; j < tail; ++j) {
324	dstBuffer[j] = dst[i + j];
325	srcBuffer[j] = src[i + j];
326	}
327
328	blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
329
330	for (int j = 0; j < tail; ++j) {
331	dst[i + j] = dstBuffer[j];
332	src[i + j] = srcBuffer[j];
333	}
334	}
335
336	dst = (quint16 )(((uchar ) dst) + dbpl);
337	src = (quint32 )(((uchar ) src) + sbpl);
338	}
339	return;
340	}
341
342	pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
343	}
344
345	void qt_blend_argb32_on_argb32_scanline_neon(uint dest, const uint src, int length, uint const_alpha)
346	{
347	if (const_alpha == 255) {
348	pixman_composite_scanline_over_asm_neon(length, dest, src);
349	} else {
350	qt_blend_argb32_on_argb32_neon((uchar )dest, 4 length, (uchar )src, 4 length, length, 1, (const_alpha * 256) / 255);
351	}
352	}
353
354	void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
355	const uchar *srcPixels, int sbpl,
356	int w, int h,
357	int const_alpha)
358	{
359	const uint src = (const uint ) srcPixels;
360	uint dst = (uint ) destPixels;
361	uint16x8_t half = vdupq_n_u16(0x80);
362	uint16x8_t full = vdupq_n_u16(0xff);
363	if (const_alpha == 256) {
364	pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t )destPixels, dbpl / 4, (uint32_t )srcPixels, sbpl / 4);
365	} else if (const_alpha != 0) {
366	const_alpha = (const_alpha * 255) >> 8;
367	uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
368	for (int y = 0; y < h; ++y) {
369	int x = 0;
370	for (; x < w-3; x += 4) {
371	if (src[x] \| src[x+1] \| src[x+2] \| src[x+3]) {
372	uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
373	uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
374
375	const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
376	const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
377
378	const uint8x8_t src8_low = vget_low_u8(src8);
379	const uint8x8_t dst8_low = vget_low_u8(dst8);
380
381	const uint8x8_t src8_high = vget_high_u8(src8);
382	const uint8x8_t dst8_high = vget_high_u8(dst8);
383
384	const uint16x8_t src16_low = vmovl_u8(src8_low);
385	const uint16x8_t dst16_low = vmovl_u8(dst8_low);
386
387	const uint16x8_t src16_high = vmovl_u8(src8_high);
388	const uint16x8_t dst16_high = vmovl_u8(dst8_high);
389
390	const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
391	const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
392
393	const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
394	const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
395
396	const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
397	const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
398
399	vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
400	}
401	}
402	for (; x<w; ++x) {
403	uint s = src[x];
404	if (s != 0) {
405	s = BYTE_MUL(s, const_alpha);
406	dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
407	}
408	}
409	dst = (quint32 )(((uchar ) dst) + dbpl);
410	src = (const quint32 )(((const uchar ) src) + sbpl);
411	}
412	}
413	}
414
415	// qblendfunctions.cpp
416	void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
417	const uchar *srcPixels, int sbpl,
418	int w, int h,
419	int const_alpha);
420
421	void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
422	const uchar *srcPixels, int sbpl,
423	int w, int h,
424	int const_alpha)
425	{
426	if (const_alpha != 256) {
427	if (const_alpha != 0) {
428	const uint src = (const uint ) srcPixels;
429	uint dst = (uint ) destPixels;
430	uint16x8_t half = vdupq_n_u16(0x80);
431	const_alpha = (const_alpha * 255) >> 8;
432	int one_minus_const_alpha = 255 - const_alpha;
433	uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
434	uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
435	for (int y = 0; y < h; ++y) {
436	int x = 0;
437	for (; x < w-3; x += 4) {
438	uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
439	uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
440
441	const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
442	const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
443
444	const uint8x8_t src8_low = vget_low_u8(src8);
445	const uint8x8_t dst8_low = vget_low_u8(dst8);
446
447	const uint8x8_t src8_high = vget_high_u8(src8);
448	const uint8x8_t dst8_high = vget_high_u8(dst8);
449
450	const uint16x8_t src16_low = vmovl_u8(src8_low);
451	const uint16x8_t dst16_low = vmovl_u8(dst8_low);
452
453	const uint16x8_t src16_high = vmovl_u8(src8_high);
454	const uint16x8_t dst16_high = vmovl_u8(dst8_high);
455
456	const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
457	const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
458
459	const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
460	const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
461
462	vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
463	}
464	for (; x<w; ++x) {
465	uint s = src[x];
466	s = BYTE_MUL(s, const_alpha);
467	dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
468	}
469	dst = (quint32 )(((uchar ) dst) + dbpl);
470	src = (const quint32 )(((const uchar ) src) + sbpl);
471	}
472	}
473	} else {
474	qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
475	}
476	}
477
478	void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
479	int x, int y, quint32 color,
480	const uchar *bitmap,
481	int mapWidth, int mapHeight, int mapStride,
482	const QClipData *)
483	{
484	quint16 dest = reinterpret_cast<quint16>(rasterBuffer->scanLine(y)) + x;
485	const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
486
487	uchar mask = const_cast<uchar >(bitmap);
488
489	pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
490	}
491
492	extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 dst, const quint16 src, int const_alpha);
493
494	template <typename SRC, typename BlendFunc>
495	struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
496	Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
497	: m_index(0)
498	, m_blender(blender)
499	, m_const_alpha(const_alpha)
500	{
501	}
502
503	inline void write(quint16 *dst, quint32 src)
504	{
505	srcBuffer[m_index++] = src;
506
507	if (m_index == 8) {
508	m_blender(dst - 7, srcBuffer, m_const_alpha);
509	m_index = 0;
510	}
511	}
512
513	inline void flush(quint16 *dst)
514	{
515	if (m_index > 0) {
516	quint16 dstBuffer[8];
517	for (int i = 0; i < m_index; ++i)
518	dstBuffer[i] = dst[i - m_index];
519
520	m_blender(dstBuffer, srcBuffer, m_const_alpha);
521
522	for (int i = 0; i < m_index; ++i)
523	dst[i - m_index] = dstBuffer[i];
524
525	m_index = 0;
526	}
527	}
528
529	SRC srcBuffer[8];
530
531	int m_index;
532	BlendFunc m_blender;
533	int m_const_alpha;
534	};
535
536	template <typename SRC, typename BlendFunc>
537	Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
538	Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
539	{
540	return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
541	}
542
543	void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
544	const uchar *srcPixels, int sbpl,
545	const QRectF &targetRect,
546	const QRectF &sourceRect,
547	const QRect &clip,
548	int const_alpha)
549	{
550	if (const_alpha == 0)
551	return;
552
553	qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
554	Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
555	}
556
557	void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
558	const uchar *srcPixels, int sbpl,
559	const QRectF &targetRect,
560	const QRectF &sourceRect,
561	const QRect &clip,
562	int const_alpha);
563
564	void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
565	const uchar *srcPixels, int sbpl,
566	const QRectF &targetRect,
567	const QRectF &sourceRect,
568	const QRect &clip,
569	int const_alpha)
570	{
571	if (const_alpha == 0)
572	return;
573
574	if (const_alpha == 256) {
575	qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
576	return;
577	}
578
579	qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip,
580	Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
581	}
582
583	extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
584	const uchar *srcPixels, int sbpl,
585	const QRectF &targetRect,
586	const QRectF &sourceRect,
587	const QRect &clip,
588	const QTransform &targetRectTransform,
589	int const_alpha);
590
591	void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
592	const uchar *srcPixels, int sbpl,
593	const QRectF &targetRect,
594	const QRectF &sourceRect,
595	const QRect &clip,
596	const QTransform &targetRectTransform,
597	int const_alpha)
598	{
599	if (const_alpha == 0)
600	return;
601
602	if (const_alpha == 256) {
603	qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
604	return;
605	}
606
607	qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
608	reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
609	Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
610	}
611
612	void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
613	const uchar *srcPixels, int sbpl,
614	const QRectF &targetRect,
615	const QRectF &sourceRect,
616	const QRect &clip,
617	const QTransform &targetRectTransform,
618	int const_alpha)
619	{
620	if (const_alpha == 0)
621	return;
622
623	qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
624	reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
625	Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
626	}
627
628	static inline void convert_8_pixels_rgb16_to_argb32(quint32 dst, const quint16 src)
629	{
630	asm volatile (
631	"vld1.16 { d0, d1 }, [%[SRC]]\n\t"
632
633	/* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
634	and put data into d4 - red, d3 - green, d2 - blue */
635	"vshrn.u16 d4, q0, #8\n\t"
636	"vshrn.u16 d3, q0, #3\n\t"
637	"vsli.u16 q0, q0, #5\n\t"
638	"vsri.u8 d4, d4, #5\n\t"
639	"vsri.u8 d3, d3, #6\n\t"
640	"vshrn.u16 d2, q0, #2\n\t"
641
642	/* fill d5 - alpha with 0xff */
643	"mov r2, #255\n\t"
644	"vdup.8 d5, r2\n\t"
645
646	"vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
647	: : [DST]"r" (dst), [SRC]"r" (src)
648	: "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
649	);
650	}
651
652	uint * QT_FASTCALL qt_destFetchRGB16_neon(uint buffer, QRasterBuffer rasterBuffer, int x, int y, int length)
653	{
654	const ushort data = (const ushort )rasterBuffer->scanLine(y) + x;
655
656	int i = 0;
657	for (; i < length - 7; i += 8)
658	convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
659
660	if (i < length) {
661	quint16 srcBuffer[8];
662	quint32 dstBuffer[8];
663
664	int tail = length - i;
665	for (int j = 0; j < tail; ++j)
666	srcBuffer[j] = data[i + j];
667
668	convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
669
670	for (int j = 0; j < tail; ++j)
671	buffer[i + j] = dstBuffer[j];
672	}
673
674	return buffer;
675	}
676
677	static inline void convert_8_pixels_argb32_to_rgb16(quint16 dst, const quint32 src)
678	{
679	asm volatile (
680	"vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"
681
682	/* convert to r5g6b5 and store it into {d28, d29} */
683	"vshll.u8 q14, d2, #8\n\t"
684	"vshll.u8 q8, d1, #8\n\t"
685	"vshll.u8 q9, d0, #8\n\t"
686	"vsri.u16 q14, q8, #5\n\t"
687	"vsri.u16 q14, q9, #11\n\t"
688
689	"vst1.16 { d28, d29 }, [%[DST]]"
690	: : [DST]"r" (dst), [SRC]"r" (src)
691	: "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
692	);
693	}
694
695	void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer rasterBuffer, int x, int y, const uint buffer, int length)
696	{
697	quint16 data = (quint16)rasterBuffer->scanLine(y) + x;
698
699	int i = 0;
700	for (; i < length - 7; i += 8)
701	convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
702
703	if (i < length) {
704	quint32 srcBuffer[8];
705	quint16 dstBuffer[8];
706
707	int tail = length - i;
708	for (int j = 0; j < tail; ++j)
709	srcBuffer[j] = buffer[i + j];
710
711	convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
712
713	for (int j = 0; j < tail; ++j)
714	data[i + j] = dstBuffer[j];
715	}
716	}
717
718	void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
719	{
720	if ((const_alpha & qAlpha(color)) == 255) {
721	QT_MEMFILL_UINT(destPixels, length, color);
722	} else {
723	if (const_alpha != 255)
724	color = BYTE_MUL(color, const_alpha);
725
726	const quint32 minusAlphaOfColor = qAlpha(~color);
727	int x = 0;
728
729	uint32_t dst = (uint32_t ) destPixels;
730	const uint32x4_t colorVector = vdupq_n_u32(color);
731	uint16x8_t half = vdupq_n_u16(0x80);
732	const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
733
734	for (; x < length-3; x += 4) {
735	uint32x4_t dstVector = vld1q_u32(&dst[x]);
736
737	const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
738
739	const uint8x8_t dst8_low = vget_low_u8(dst8);
740	const uint8x8_t dst8_high = vget_high_u8(dst8);
741
742	const uint16x8_t dst16_low = vmovl_u8(dst8_low);
743	const uint16x8_t dst16_high = vmovl_u8(dst8_high);
744
745	const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
746	const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
747
748	const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
749	const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
750
751	uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
752	uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
753	vst1q_u32(&dst[x], colorPlusBlendedPixels);
754	}
755
756	for (;x < length; ++x)
757	destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
758	}
759	}
760
761	void QT_FASTCALL comp_func_Plus_neon(uint dst, const uint src, int length, uint const_alpha)
762	{
763	if (const_alpha == 255) {
764	uint *const end = dst + length;
765	uint *const neonEnd = end - 3;
766
767	while (dst < neonEnd) {
768	asm volatile (
769	"vld2.8 { d0, d1 }, [%[SRC]] !\n\t"
770	"vld2.8 { d2, d3 }, [%[DST]]\n\t"
771	"vqadd.u8 q0, q0, q1\n\t"
772	"vst2.8 { d0, d1 }, [%[DST]] !\n\t"
773	: [DST]"+r" (dst), [SRC]"+r" (src)
774	:
775	: "memory", "d0", "d1", "d2", "d3", "q0", "q1"
776	);
777	}
778
779	while (dst != end) {
780	dst = comp_func_Plus_one_pixel(dst, *src);
781	++dst;
782	++src;
783	}
784	} else {
785	int x = 0;
786	const int one_minus_const_alpha = 255 - const_alpha;
787	const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
788	const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
789
790	const uint16x8_t half = vdupq_n_u16(0x80);
791	for (; x < length - 3; x += 4) {
792	const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
793	const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
794	uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
795	uint8x16_t result = vqaddq_u8(dst8, src8);
796
797	uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
798	uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
799
800	uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
801	uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
802
803	result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
804	result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
805
806	const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
807	const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
808	vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
809	}
810
811	for (; x < length; ++x)
812	dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
813	}
814	}
815
816	static const int tileSize = 32;
817
818	extern "C" void qt_rotate90_16_neon(quint16 dst, const quint16 src, int sstride, int dstride, int count);
819
820	void qt_memrotate90_16_neon(const uchar srcPixels, int w, int h, int sstride, uchar destPixels, int dstride)
821	{
822	const ushort src = (const ushort )srcPixels;
823	ushort dest = (ushort )destPixels;
824
825	sstride /= sizeof(ushort);
826	dstride /= sizeof(ushort);
827
828	const int pack = sizeof(quint32) / sizeof(ushort);
829	const int unaligned =
830	qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
831	const int restX = w % tileSize;
832	const int restY = (h - unaligned) % tileSize;
833	const int unoptimizedY = restY % pack;
834	const int numTilesX = w / tileSize + (restX > 0);
835	const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
836
837	for (int tx = 0; tx < numTilesX; ++tx) {
838	const int startx = w - tx * tileSize - 1;
839	const int stopx = qMax(startx - tileSize, 0);
840
841	if (unaligned) {
842	for (int x = startx; x >= stopx; --x) {
843	ushort d = dest + (w - x - 1) dstride;
844	for (int y = 0; y < unaligned; ++y) {
845	d++ = src[y sstride + x];
846	}
847	}
848	}
849
850	for (int ty = 0; ty < numTilesY; ++ty) {
851	const int starty = ty * tileSize + unaligned;
852	const int stopy = qMin(starty + tileSize, h - unoptimizedY);
853
854	int x = startx;
855	// qt_rotate90_16_neon writes to eight rows, four pixels at a time
856	for (; x >= stopx + 7; x -= 8) {
857	ushort d = dest + (w - x - 1) dstride + starty;
858	const ushort s = &src[starty sstride + x - 7];
859	qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
860	}
861
862	for (; x >= stopx; --x) {
863	quint32 d = reinterpret_cast<quint32>(dest + (w - x - 1) * dstride + starty);
864	for (int y = starty; y < stopy; y += pack) {
865	quint32 c = src[y * sstride + x];
866	for (int i = 1; i < pack; ++i) {
867	const int shift = (sizeof(int) * 8 / pack * i);
868	const ushort color = src[(y + i) * sstride + x];
869	c \|= color << shift;
870	}
871	*d++ = c;
872	}
873	}
874	}
875
876	if (unoptimizedY) {
877	const int starty = h - unoptimizedY;
878	for (int x = startx; x >= stopx; --x) {
879	ushort d = dest + (w - x - 1) dstride + starty;
880	for (int y = starty; y < h; ++y) {
881	d++ = src[y sstride + x];
882	}
883	}
884	}
885	}
886	}
887
888	extern "C" void qt_rotate270_16_neon(quint16 dst, const quint16 src, int sstride, int dstride, int count);
889
890	void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
891	int sstride,
892	uchar *destPixels, int dstride)
893	{
894	const ushort src = (const ushort )srcPixels;
895	ushort dest = (ushort )destPixels;
896
897	sstride /= sizeof(ushort);
898	dstride /= sizeof(ushort);
899
900	const int pack = sizeof(quint32) / sizeof(ushort);
901	const int unaligned =
902	qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
903	const int restX = w % tileSize;
904	const int restY = (h - unaligned) % tileSize;
905	const int unoptimizedY = restY % pack;
906	const int numTilesX = w / tileSize + (restX > 0);
907	const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
908
909	for (int tx = 0; tx < numTilesX; ++tx) {
910	const int startx = tx * tileSize;
911	const int stopx = qMin(startx + tileSize, w);
912
913	if (unaligned) {
914	for (int x = startx; x < stopx; ++x) {
915	ushort d = dest + x dstride;
916	for (int y = h - 1; y >= h - unaligned; --y) {
917	d++ = src[y sstride + x];
918	}
919	}
920	}
921
922	for (int ty = 0; ty < numTilesY; ++ty) {
923	const int starty = h - 1 - unaligned - ty * tileSize;
924	const int stopy = qMax(starty - tileSize, unoptimizedY);
925
926	int x = startx;
927	// qt_rotate90_16_neon writes to eight rows, four pixels at a time
928	for (; x < stopx - 7; x += 8) {
929	ushort d = dest + x dstride + h - 1 - starty;
930	const ushort s = &src[starty sstride + x];
931	qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
932	}
933
934	for (; x < stopx; ++x) {
935	quint32 d = reinterpret_cast<quint32>(dest + x * dstride
936	+ h - 1 - starty);
937	for (int y = starty; y > stopy; y -= pack) {
938	quint32 c = src[y * sstride + x];
939	for (int i = 1; i < pack; ++i) {
940	const int shift = (sizeof(int) * 8 / pack * i);
941	const ushort color = src[(y - i) * sstride + x];
942	c \|= color << shift;
943	}
944	*d++ = c;
945	}
946	}
947	}
948	if (unoptimizedY) {
949	const int starty = unoptimizedY - 1;
950	for (int x = startx; x < stopx; ++x) {
951	ushort d = dest + x dstride + h - 1 - starty;
952	for (int y = starty; y >= 0; --y) {
953	d++ = src[y sstride + x];
954	}
955	}
956	}
957	}
958	}
959
960	QT_END_NAMESPACE
961
962	#endif // QT_HAVE_NEON
963

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/gui/painting/qdrawhelper_neon.cpp@ 1166

Download in other formats: