source: trunk/tools/linguist/shared/simtexth.cpp@ 239

Last change on this file since 239 was 2, checked in by Dmitry A. Kuminov, 16 years ago

Initially imported qt-all-opensource-src-4.5.1 from Trolltech.

File size: 9.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5**
6** This file is part of the Qt Linguist of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "simtexth.h"
43#include "translator.h"
44
45#include <QtCore/QByteArray>
46#include <QtCore/QString>
47#include <QtCore/QList>
48
49
50QT_BEGIN_NAMESPACE
51
52typedef QList<TranslatorMessage> TML;
53
54/*
55 How similar are two texts? The approach used here relies on co-occurrence
56 matrices and is very efficient.
57
58 Let's see with an example: how similar are "here" and "hither"? The
59 co-occurrence matrix M for "here" is M[h,e] = 1, M[e,r] = 1, M[r,e] = 1, and 0
60 elsewhere; the matrix N for "hither" is N[h,i] = 1, N[i,t] = 1, ...,
61 N[h,e] = 1, N[e,r] = 1, and 0 elsewhere. The union U of both matrices is the
62 matrix U[i,j] = max { M[i,j], N[i,j] }, and the intersection V is
63 V[i,j] = min { M[i,j], N[i,j] }. The score for a pair of texts is
64
65 score = (sum of V[i,j] over all i, j) / (sum of U[i,j] over all i, j),
66
67 a formula suggested by Arnt Gulbrandsen. Here we have
68
69 score = 2 / 6,
70
71 or one third.
72
73 The implementation differs from this in a few details. Most importantly,
74 repetitions are ignored; for input "xxx", M[x,x] equals 1, not 2.
75*/
76
77/*
78 Every character is assigned to one of 20 buckets so that the co-occurrence
79 matrix requires only 20 * 20 = 400 bits, not 256 * 256 = 65536 bits or even
80 more if we want the whole Unicode. Which character falls in which bucket is
81 arbitrary.
82
83 The second half of the table is a replica of the first half, because of
84 laziness.
85*/
86static const int indexOf[256] = {
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89// ! " # $ % & ' ( ) * + , - . /
90 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
91// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
92 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
93// @ A B C D E F G H I J K L M N O
94 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
95// P Q R S T U V W X Y Z [ \ ] ^ _
96 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
97// ` a b c d e f g h i j k l m n o
98 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
99// p q r s t u v w x y z { | } ~
100 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
101
102 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 2, 6, 7, 10, 12, 15, 19, 2, 6, 7, 10, 12, 15, 19, 0,
105 1, 3, 4, 5, 8, 9, 11, 13, 14, 16, 2, 6, 7, 10, 12, 15,
106 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
107 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0,
108 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 6, 10, 11, 12, 13, 14,
109 15, 12, 16, 17, 18, 19, 2, 10, 15, 7, 19, 2, 6, 7, 10, 0
110};
111
112/*
113 The entry bitCount[i] (for i between 0 and 255) is the number of bits used to
114 represent i in binary.
115*/
116static const int bitCount[256] = {
117 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
118 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
119 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
120 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
121 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
122 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
123 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
124 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
126 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
127 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
128 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
129 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
130 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
131 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
132 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
133};
134
135struct CoMatrix
136{
137 /*
138 The matrix has 20 * 20 = 400 entries. This requires 50 bytes, or 13
139 words. Some operations are performed on words for more efficiency.
140 */
141 union {
142 quint8 b[52];
143 quint32 w[13];
144 };
145
146 CoMatrix() { memset( b, 0, 52 ); }
147
148 CoMatrix(const QString &str)
149 {
150 QByteArray ba = str.toUtf8();
151 const char *text = ba.constData();
152 char c = '\0', d;
153 memset( b, 0, 52 );
154 /*
155 The Knuth books are not in the office only for show; they help make
156 loops 30% faster and 20% as readable.
157 */
158 while ( (d = *text) != '\0' ) {
159 setCoOccurence( c, d );
160 if ( (c = *++text) != '\0' ) {
161 setCoOccurence( d, c );
162 text++;
163 }
164 }
165 }
166
167 void setCoOccurence( char c, char d ) {
168 int k = indexOf[(uchar) c] + 20 * indexOf[(uchar) d];
169 b[k >> 3] |= (1 << (k & 0x7));
170 }
171
172 int worth() const {
173 int w = 0;
174 for ( int i = 0; i < 50; i++ )
175 w += bitCount[b[i]];
176 return w;
177 }
178};
179
180static inline CoMatrix reunion(const CoMatrix &m, const CoMatrix &n)
181{
182 CoMatrix p;
183 for (int i = 0; i < 13; ++i)
184 p.w[i] = m.w[i] | n.w[i];
185 return p;
186}
187
188static inline CoMatrix intersection(const CoMatrix &m, const CoMatrix &n)
189{
190 CoMatrix p;
191 for (int i = 0; i < 13; ++i)
192 p.w[i] = m.w[i] & n.w[i];
193 return p;
194}
195
196StringSimilarityMatcher::StringSimilarityMatcher(const QString &stringToMatch)
197{
198 m_cm = new CoMatrix(stringToMatch);
199 m_length = stringToMatch.length();
200}
201
202int StringSimilarityMatcher::getSimilarityScore(const QString &strCandidate)
203{
204 CoMatrix cmTarget(strCandidate);
205 int delta = qAbs(m_length - strCandidate.size());
206 int score = ( (intersection(*m_cm, cmTarget).worth() + 1) << 10 ) /
207 ( reunion(*m_cm, cmTarget).worth() + (delta << 1) + 1 );
208 return score;
209}
210
211StringSimilarityMatcher::~StringSimilarityMatcher()
212{
213 delete m_cm;
214}
215
216/**
217 * Checks how similar two strings are.
218 * The return value is the score, and a higher score is more similar
219 * than one with a low score.
220 * Linguist considers a score over 190 to be a good match.
221 * \sa StringSimilarityMatcher
222 */
223int getSimilarityScore(const QString &str1, const QString &str2)
224{
225 CoMatrix cmTarget(str2);
226 CoMatrix cm(str1);
227 int delta = qAbs(str1.size() - str2.size());
228
229 int score = ( (intersection(cm, cmTarget).worth() + 1) << 10 )
230 / ( reunion(cm, cmTarget).worth() + (delta << 1) + 1 );
231
232 return score;
233}
234
235CandidateList similarTextHeuristicCandidates(const Translator *tor,
236 const QString &text, int maxCandidates)
237{
238 QList<int> scores;
239 CandidateList candidates;
240
241 TML all = tor->translatedMessages();
242
243 foreach (const TranslatorMessage &mtm, all) {
244 if (mtm.type() == TranslatorMessage::Unfinished
245 || mtm.translation().isEmpty())
246 continue;
247
248 QString s = mtm.sourceText();
249 int score = getSimilarityScore(s, text);
250
251 if (candidates.size() == maxCandidates && score > scores[maxCandidates - 1] )
252 candidates.removeLast();
253
254 if (candidates.size() < maxCandidates && score >= textSimilarityThreshold) {
255 Candidate cand( s, mtm.translation() );
256
257 int i;
258 for (i = 0; i < candidates.size(); i++) {
259 if (score >= scores.at(i)) {
260 if (score == scores.at(i)) {
261 if (candidates.at(i) == cand)
262 goto continue_outer_loop;
263 } else {
264 break;
265 }
266 }
267 }
268 scores.insert(i, score);
269 candidates.insert(i, cand);
270 }
271 continue_outer_loop:
272 ;
273 }
274 return candidates;
275}
276
277QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.