Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

simtexth.cpp@ 239

Last change on this file since 239 was 2, checked in by Dmitry A. Kuminov, 16 years ago
Initially imported qt-all-opensource-src-4.5.1 from Trolltech.
File size: 9.7 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4	** Contact: Qt Software Information ([email protected])
5	**
6	** This file is part of the Qt Linguist of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial Usage
10	** Licensees holding valid Qt Commercial licenses may use this file in
11	** accordance with the Qt Commercial License Agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and Nokia.
14	**
15	** GNU Lesser General Public License Usage
16	** Alternatively, this file may be used under the terms of the GNU Lesser
17	** General Public License version 2.1 as published by the Free Software
18	** Foundation and appearing in the file LICENSE.LGPL included in the
19	** packaging of this file. Please review the following information to
20	** ensure the GNU Lesser General Public License version 2.1 requirements
21	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22	**
23	** In addition, as a special exception, Nokia gives you certain
24	** additional rights. These rights are described in the Nokia Qt LGPL
25	** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26	** package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you are unsure which license is appropriate for your use, please
37	** contact the sales department at [email protected].
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include "simtexth.h"
43	#include "translator.h"
44
45	#include <QtCore/QByteArray>
46	#include <QtCore/QString>
47	#include <QtCore/QList>
48
49
50	QT_BEGIN_NAMESPACE
51
52	typedef QList<TranslatorMessage> TML;
53
54	/*
55	How similar are two texts? The approach used here relies on co-occurrence
56	matrices and is very efficient.
57
58	Let's see with an example: how similar are "here" and "hither"? The
59	co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
60	elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
61	N[h,e] = 1, N[e,r] = 1, and 0 elsewhere. The union U of both matrices is the
62	matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
63	V[i,j] = min { M[i,j], N[i,j] }. The score for a pair of texts is
64
65	score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
66
67	a formula suggested by Arnt Gulbrandsen. Here we have
68
69	score = 2 / 6,
70
71	or one third.
72
73	The implementation differs from this in a few details. Most importantly,
74	repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
75	*/
76
77	/*
78	Every character is assigned to one of 20 buckets so that the co-occurrence
79	matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
80	more if we want the whole Unicode. Which character falls in which bucket is
81	arbitrary.
82
83	The second half of the table is a replica of the first half, because of
84	laziness.
85	*/
86	static const int indexOf[256] = {
87	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89	// ! " # $ % & ' ( ) * + , - . /
90	0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
91	// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
92	1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
93	// @ A B C D E F G H I J K L M N O
94	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
95	// P Q R S T U V W X Y Z [ \ ] ^ _
96	15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
97	// ` a b c d e f g h i j k l m n o
98	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
99	// p q r s t u v w x y z { \| } ~
100	15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
101
102	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104	0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
105	1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
106	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
107	15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
108	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
109	15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0
110	};
111
112	/*
113	The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
114	represent i in binary.
115	*/
116	static const int bitCount[256] = {
117	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
118	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
119	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
120	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
121	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
122	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
123	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
124	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
125	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
126	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
127	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
128	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
129	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
130	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
131	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
132	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
133	};
134
135	struct CoMatrix
136	{
137	/*
138	The matrix has 20 * 20 = 400 entries. This requires 50 bytes, or 13
139	words. Some operations are performed on words for more efficiency.
140	*/
141	union {
142	quint8 b[52];
143	quint32 w[13];
144	};
145
146	CoMatrix() { memset( b, 0, 52 ); }
147
148	CoMatrix(const QString &str)
149	{
150	QByteArray ba = str.toUtf8();
151	const char *text = ba.constData();
152	char c = '\0', d;
153	memset( b, 0, 52 );
154	/*
155	The Knuth books are not in the office only for show; they help make
156	loops 30% faster and 20% as readable.
157	*/
158	while ( (d = *text) != '\0' ) {
159	setCoOccurence( c, d );
160	if ( (c = *++text) != '\0' ) {
161	setCoOccurence( d, c );
162	text++;
163	}
164	}
165	}
166
167	void setCoOccurence( char c, char d ) {
168	int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
169	b[k >> 3] \|= (1 << (k & 0x7));
170	}
171
172	int worth() const {
173	int w = 0;
174	for ( int i = 0; i < 50; i++ )
175	w += bitCount[b[i]];
176	return w;
177	}
178	};
179
180	static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
181	{
182	CoMatrix p;
183	for (int i = 0; i < 13; ++i)
184	p.w[i] = m.w[i] \| n.w[i];
185	return p;
186	}
187
188	static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
189	{
190	CoMatrix p;
191	for (int i = 0; i < 13; ++i)
192	p.w[i] = m.w[i] & n.w[i];
193	return p;
194	}
195
196	StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
197	{
198	m_cm = new CoMatrix(stringToMatch);
199	m_length = stringToMatch.length();
200	}
201
202	int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
203	{
204	CoMatrix cmTarget(strCandidate);
205	int delta = qAbs(m_length - strCandidate.size());
206	int score = ( (intersection(*m_cm, cmTarget).worth() + 1) << 10 ) /
207	( reunion(*m_cm, cmTarget).worth() + (delta << 1) + 1 );
208	return score;
209	}
210
211	StringSimilarityMatcher::~StringSimilarityMatcher()
212	{
213	delete m_cm;
214	}
215
216	/**
217	* Checks how similar two strings are.
218	* The return value is the score, and a higher score is more similar
219	* than one with a low score.
220	* Linguist considers a score over 190 to be a good match.
221	* \sa StringSimilarityMatcher
222	*/
223	int getSimilarityScore(const QString &str1, const QString &str2)
224	{
225	CoMatrix cmTarget(str2);
226	CoMatrix cm(str1);
227	int delta = qAbs(str1.size() - str2.size());
228
229	int score = ( (intersection(cm, cmTarget).worth() + 1) << 10 )
230	/ ( reunion(cm, cmTarget).worth() + (delta << 1) + 1 );
231
232	return score;
233	}
234
235	CandidateList similarTextHeuristicCandidates(const Translator *tor,
236	const QString &text, int maxCandidates)
237	{
238	QList<int> scores;
239	CandidateList candidates;
240
241	TML all = tor->translatedMessages();
242
243	foreach (const TranslatorMessage &mtm, all) {
244	if (mtm.type() == TranslatorMessage::Unfinished
245	\|\| mtm.translation().isEmpty())
246	continue;
247
248	QString s = mtm.sourceText();
249	int score = getSimilarityScore(s, text);
250
251	if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
252	candidates.removeLast();
253
254	if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
255	Candidate cand( s, mtm.translation() );
256
257	int i;
258	for (i = 0; i < candidates.size(); i++) {
259	if (score >= scores.at(i)) {
260	if (score == scores.at(i)) {
261	if (candidates.at(i) == cand)
262	goto continue_outer_loop;
263	} else {
264	break;
265	}
266	}
267	}
268	scores.insert(i, score);
269	candidates.insert(i, cand);
270	}
271	continue_outer_loop:
272	;
273	}
274	return candidates;
275	}
276
277	QT_END_NAMESPACE

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/tools/linguist/shared/simtexth.cpp@ 239

Download in other formats: