source: trunk/tools/linguist/shared/translatortools.cpp@ 5

Last change on this file since 5 was 2, checked in by Dmitry A. Kuminov, 16 years ago

Initially imported qt-all-opensource-src-4.5.1 from Trolltech.

File size: 18.2 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5**
6** This file is part of the Qt Linguist of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "translatortools.h"
43
44#include "simtexth.h"
45#include "translator.h"
46
47#include <QtCore/QDebug>
48#include <QtCore/QMap>
49#include <QtCore/QStringList>
50#include <QtCore/QTextCodec>
51#include <QtCore/QVector>
52
53typedef QList<TranslatorMessage> TML;
54typedef QMap<QString, TranslatorMessage> TMM;
55
56
57QT_BEGIN_NAMESPACE
58
59static bool isDigitFriendly(QChar c)
60{
61 return c.isPunct() || c.isSpace();
62}
63
64static int numberLength(const QString &s, int i)
65{
66 if (i < s.size() || !s.at(i).isDigit())
67 return 0;
68
69 int pos = i;
70 do {
71 ++i;
72 } while (i < s.size()
73 && (s.at(i).isDigit()
74 || (isDigitFriendly(s[i])
75 && i + 1 < s.size()
76 && (s[i + 1].isDigit()
77 || (isDigitFriendly(s[i + 1])
78 && i + 2 < s.size()
79 && s[i + 2].isDigit())))));
80 return i - pos;
81}
82
83
84/*
85 Returns a version of 'key' where all numbers have been replaced by zeroes. If
86 there were none, returns "".
87*/
88static QString zeroKey(const QString &key)
89{
90 QString zeroed;
91 bool metSomething = false;
92
93 for (int i = 0; i != key.size(); ++i) {
94 int len = numberLength(key, i);
95 if (len > 0) {
96 i += len;
97 zeroed.append(QLatin1Char('0'));
98 metSomething = true;
99 } else {
100 zeroed.append(key.at(i));
101 }
102 }
103 return metSomething ? zeroed : QString();
104}
105
106static QString translationAttempt(const QString &oldTranslation,
107 const QString &oldSource, const QString &newSource)
108{
109 int p = zeroKey(oldSource).count(QLatin1Char('0'));
110 QString attempt;
111 QStringList oldNumbers;
112 QStringList newNumbers;
113 QVector<bool> met(p);
114 QVector<int> matchedYet(p);
115 int i, j;
116 int k = 0, ell, best;
117 int m, n;
118 int pass;
119
120 /*
121 This algorithm is hard to follow, so we'll consider an example
122 all along: oldTranslation is "XeT 3.0", oldSource is "TeX 3.0"
123 and newSource is "XeT 3.1".
124
125 First, we set up two tables: oldNumbers and newNumbers. In our
126 example, oldNumber[0] is "3.0" and newNumber[0] is "3.1".
127 */
128 for (i = 0, j = 0; i < oldSource.size(); i++, j++) {
129 m = numberLength(oldSource, i);
130 n = numberLength(newSource, j);
131 if (m > 0) {
132 oldNumbers.append(oldSource.mid(i, m + 1));
133 newNumbers.append(newSource.mid(j, n + 1));
134 i += m;
135 j += n;
136 met[k] = false;
137 matchedYet[k] = 0;
138 k++;
139 }
140 }
141
142 /*
143 We now go over the old translation, "XeT 3.0", one letter at a
144 time, looking for numbers found in oldNumbers. Whenever such a
145 number is met, it is replaced with its newNumber equivalent. In
146 our example, the "3.0" of "XeT 3.0" becomes "3.1".
147 */
148 for (i = 0; i < oldTranslation.length(); i++) {
149 attempt += oldTranslation[i];
150 for (k = 0; k < p; k++) {
151 if (oldTranslation[i] == oldNumbers[k][matchedYet[k]])
152 matchedYet[k]++;
153 else
154 matchedYet[k] = 0;
155 }
156
157 /*
158 Let's find out if the last character ended a match. We make
159 two passes over the data. In the first pass, we try to
160 match only numbers that weren't matched yet; if that fails,
161 the second pass does the trick. This is useful in some
162 suspicious cases, flagged below.
163 */
164 for (pass = 0; pass < 2; pass++) {
165 best = p; // an impossible value
166 for (k = 0; k < p; k++) {
167 if ((!met[k] || pass > 0) &&
168 matchedYet[k] == oldNumbers[k].length() &&
169 numberLength(oldTranslation, i + 1 - matchedYet[k]) == matchedYet[k]) {
170 // the longer the better
171 if (best == p || matchedYet[k] > matchedYet[best])
172 best = k;
173 }
174 }
175 if (best != p) {
176 attempt.truncate(attempt.length() - matchedYet[best]);
177 attempt += newNumbers[best];
178 met[best] = true;
179 for (k = 0; k < p; k++)
180 matchedYet[k] = 0;
181 break;
182 }
183 }
184 }
185
186 /*
187 We flag two kinds of suspicious cases. They are identified as
188 such with comments such as "{2000?}" at the end.
189
190 Example of the first kind: old source text "TeX 3.0" translated
191 as "XeT 2.0" is flagged "TeX 2.0 {3.0?}", no matter what the
192 new text is.
193 */
194 for (k = 0; k < p; k++) {
195 if (!met[k])
196 attempt += QString(QLatin1String(" {")) + newNumbers[k] + QString(QLatin1String("?}"));
197 }
198
199 /*
200 Example of the second kind: "1 of 1" translated as "1 af 1",
201 with new source text "1 of 2", generates "1 af 2 {1 or 2?}"
202 because it's not clear which of "1 af 2" and "2 af 1" is right.
203 */
204 for (k = 0; k < p; k++) {
205 for (ell = 0; ell < p; ell++) {
206 if (k != ell && oldNumbers[k] == oldNumbers[ell] &&
207 newNumbers[k] < newNumbers[ell])
208 attempt += QString(QLatin1String(" {")) + newNumbers[k] + QString(QLatin1String(" or ")) +
209 newNumbers[ell] + QString(QLatin1String("?}"));
210 }
211 }
212 return attempt;
213}
214
215
216/*
217 Augments a Translator with translations easily derived from
218 similar existing (probably obsolete) translations.
219
220 For example, if "TeX 3.0" is translated as "XeT 3.0" and "TeX 3.1"
221 has no translation, "XeT 3.1" is added to the translator and is
222 marked Unfinished.
223
224 Returns the number of additional messages that this heuristic translated.
225*/
226int applyNumberHeuristic(Translator &tor)
227{
228 TMM translated, untranslated;
229 TMM::Iterator t, u;
230 TML all = tor.messages();
231 TML::Iterator it;
232 int inserted = 0;
233
234 for (it = all.begin(); it != all.end(); ++it) {
235 bool hasTranslation = it->isTranslated();
236 if (it->type() == TranslatorMessage::Unfinished) {
237 if (!hasTranslation)
238 untranslated.insert(it->context() + QLatin1Char('\n')
239 + it->sourceText() + QLatin1Char('\n')
240 + it->comment(), *it);
241 } else if (hasTranslation && it->translations().count() == 1) {
242 translated.insert(zeroKey(it->sourceText()), *it);
243 }
244 }
245
246 for (u = untranslated.begin(); u != untranslated.end(); ++u) {
247 t = translated.find(zeroKey((*u).sourceText()));
248 if (t != translated.end() && !t.key().isEmpty()
249 && t->sourceText() != u->sourceText()) {
250 TranslatorMessage m = *u;
251 m.setTranslation(translationAttempt(t->translation(), t->sourceText(),
252 u->sourceText()));
253 tor.replace(m);
254 inserted++;
255 }
256 }
257 return inserted;
258}
259
260
261/*
262 Augments a Translator with trivially derived translations.
263
264 For example, if "Enabled:" is consistendly translated as "Eingeschaltet:" no
265 matter the context or the comment, "Eingeschaltet:" is added as the
266 translation of any untranslated "Enabled:" text and is marked Unfinished.
267
268 Returns the number of additional messages that this heuristic translated.
269*/
270
271int applySameTextHeuristic(Translator &tor)
272{
273 TMM translated;
274 TMM avoid;
275 TMM::Iterator t;
276 TML untranslated;
277 TML::Iterator u;
278 TML all = tor.messages();
279 TML::Iterator it;
280 int inserted = 0;
281
282 for (it = all.begin(); it != all.end(); ++it) {
283 if (!it->isTranslated()) {
284 if (it->type() == TranslatorMessage::Unfinished)
285 untranslated.append(*it);
286 } else {
287 QString key = it->sourceText();
288 t = translated.find(key);
289 if (t != translated.end()) {
290 /*
291 The same source text is translated at least two
292 different ways. Do nothing then.
293 */
294 if (t->translations() != it->translations()) {
295 translated.remove(key);
296 avoid.insert(key, *it);
297 }
298 } else if (!avoid.contains(key)) {
299 translated.insert(key, *it);
300 }
301 }
302 }
303
304 for (u = untranslated.begin(); u != untranslated.end(); ++u) {
305 QString key = u->sourceText();
306 t = translated.find(key);
307 if (t != translated.end()) {
308 TranslatorMessage m = *u;
309 m.setTranslations(t->translations());
310 tor.replace(m);
311 ++inserted;
312 }
313 }
314 return inserted;
315}
316
317
318
319/*
320 Merges two Translator objects. The first one
321 is a set of source texts and translations for a previous version of
322 the internationalized program; the second one is a set of fresh
323 source texts newly extracted from the source code, without any
324 translation yet.
325*/
326
327Translator merge(const Translator &tor, const Translator &virginTor,
328 UpdateOptions options, QString &err)
329{
330 int known = 0;
331 int neww = 0;
332 int obsoleted = 0;
333 int similarTextHeuristicCount = 0;
334
335 Translator outTor;
336 outTor.setLanguageCode(tor.languageCode());
337 outTor.setSourceLanguageCode(tor.sourceLanguageCode());
338 outTor.setLocationsType(tor.locationsType());
339 outTor.setCodecName(tor.codecName());
340
341 /*
342 The types of all the messages from the vernacular translator
343 are updated according to the virgin translator.
344 */
345 foreach (TranslatorMessage m, tor.messages()) {
346 TranslatorMessage::Type newType = TranslatorMessage::Finished;
347
348 if (m.sourceText().isEmpty()) {
349 // context/file comment
350 TranslatorMessage mv = virginTor.find(m.context());
351 if (!mv.isNull())
352 m.setComment(mv.comment());
353 } else {
354 TranslatorMessage mv = virginTor.find(m.context(), m.sourceText(), m.comment());
355 if (mv.isNull()) {
356 if (!(options & HeuristicSimilarText)) {
357 newType = TranslatorMessage::Obsolete;
358 if (m.type() != TranslatorMessage::Obsolete)
359 obsoleted++;
360 m.clearReferences();
361 } else {
362 mv = virginTor.find(m.context(), m.comment(), m.allReferences());
363 if (mv.isNull()) {
364 // did not find it in the virgin, mark it as obsolete
365 newType = TranslatorMessage::Obsolete;
366 if (m.type() != TranslatorMessage::Obsolete)
367 obsoleted++;
368 m.clearReferences();
369 } else {
370 // Do not just accept it if its on the same line number,
371 // but different source text.
372 // Also check if the texts are more or less similar before
373 // we consider them to represent the same message...
374 if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold) {
375 // It is just slightly modified, assume that it is the same string
376
377 // Mark it as unfinished. (Since the source text
378 // was changed it might require re-translating...)
379 newType = TranslatorMessage::Unfinished;
380 ++similarTextHeuristicCount;
381 neww++;
382
383 m.setOldSourceText(m.sourceText());
384 m.setSourceText(mv.sourceText());
385 const QString &oldpluralsource = m.extra(QLatin1String("po-msgid_plural"));
386 if (!oldpluralsource.isEmpty()) {
387 m.setExtra(QLatin1String("po-old_msgid_plural"), oldpluralsource);
388 m.unsetExtra(QLatin1String("po-msgid_plural"));
389 }
390 m.setReferences(mv.allReferences()); // Update secondary references
391 m.setPlural(mv.isPlural());
392 m.setUtf8(mv.isUtf8());
393 m.setExtraComment(mv.extraComment());
394 } else {
395 // The virgin and vernacular sourceTexts are so
396 // different that we could not find it.
397 newType = TranslatorMessage::Obsolete;
398 if (m.type() != TranslatorMessage::Obsolete)
399 obsoleted++;
400 m.clearReferences();
401 }
402 }
403 }
404 } else {
405 switch (m.type()) {
406 case TranslatorMessage::Finished:
407 default:
408 if (m.isPlural() == mv.isPlural()) {
409 newType = TranslatorMessage::Finished;
410 } else {
411 newType = TranslatorMessage::Unfinished;
412 }
413 known++;
414 break;
415 case TranslatorMessage::Unfinished:
416 newType = TranslatorMessage::Unfinished;
417 known++;
418 break;
419 case TranslatorMessage::Obsolete:
420 newType = TranslatorMessage::Unfinished;
421 neww++;
422 }
423
424 // Always get the filename and linenumber info from the
425 // virgin Translator, in case it has changed location.
426 // This should also enable us to read a file that does not
427 // have the <location> element.
428 // why not use operator=()? Because it overwrites e.g. userData.
429 m.setReferences(mv.allReferences());
430 m.setPlural(mv.isPlural());
431 m.setUtf8(mv.isUtf8());
432 m.setExtraComment(mv.extraComment());
433 }
434 }
435
436 m.setType(newType);
437 outTor.append(m);
438 }
439
440 /*
441 Messages found only in the virgin translator are added to the
442 vernacular translator.
443 */
444 foreach (const TranslatorMessage &mv, virginTor.messages()) {
445 if (mv.sourceText().isEmpty()) {
446 if (tor.contains(mv.context()))
447 continue;
448 } else {
449 if (tor.contains(mv.context(), mv.sourceText(), mv.comment()))
450 continue;
451 if (options & HeuristicSimilarText) {
452 TranslatorMessage m = tor.find(mv.context(), mv.comment(), mv.allReferences());
453 if (!m.isNull()) {
454 if (getSimilarityScore(m.sourceText(), mv.sourceText()) >= textSimilarityThreshold)
455 continue;
456 }
457 }
458 }
459 if (options & NoLocations)
460 outTor.append(mv);
461 else
462 outTor.appendSorted(mv);
463 if (!mv.sourceText().isEmpty())
464 ++neww;
465 }
466
467 /*
468 The same-text heuristic handles cases where a message has an
469 obsolete counterpart with a different context or comment.
470 */
471 int sameTextHeuristicCount = (options & HeuristicSameText) ? applySameTextHeuristic(outTor) : 0;
472
473 /*
474 The number heuristic handles cases where a message has an
475 obsolete counterpart with mostly numbers differing in the
476 source text.
477 */
478 int sameNumberHeuristicCount = (options & HeuristicNumber) ? applyNumberHeuristic(outTor) : 0;
479
480 if (options & Verbose) {
481 int totalFound = neww + known;
482 err += QObject::tr(" Found %n source text(s) (%1 new and %2 already existing)\n", 0, totalFound).arg(neww).arg(known);
483
484 if (obsoleted) {
485 if (options & NoObsolete) {
486 err += QObject::tr(" Removed %n obsolete entries\n", 0, obsoleted);
487 } else {
488 err += QObject::tr(" Kept %n obsolete entries\n", 0, obsoleted);
489 }
490 }
491
492 if (sameNumberHeuristicCount)
493 err += QObject::tr(" Number heuristic provided %n translation(s)\n",
494 0, sameNumberHeuristicCount);
495 if (sameTextHeuristicCount)
496 err += QObject::tr(" Same-text heuristic provided %n translation(s)\n",
497 0, sameTextHeuristicCount);
498 if (similarTextHeuristicCount)
499 err += QObject::tr(" Similar-text heuristic provided %n translation(s)\n",
500 0, similarTextHeuristicCount);
501 }
502 return outTor;
503}
504
505QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.