source: trunk/tools/assistant/lib/qhelpsearchindexreader_default.cpp@ 846

Last change on this file since 846 was 846, checked in by Dmitry A. Kuminov, 14 years ago

trunk: Merged in qt 4.7.2 sources from branches/vendor/nokia/qt.

File size: 18.7 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4** All rights reserved.
5** Contact: Nokia Corporation ([email protected])
6**
7** This file is part of the Qt Assistant of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial Usage
11** Licensees holding valid Qt Commercial licenses may use this file in
12** accordance with the Qt Commercial License Agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and Nokia.
15**
16** GNU Lesser General Public License Usage
17** Alternatively, this file may be used under the terms of the GNU Lesser
18** General Public License version 2.1 as published by the Free Software
19** Foundation and appearing in the file LICENSE.LGPL included in the
20** packaging of this file. Please review the following information to
21** ensure the GNU Lesser General Public License version 2.1 requirements
22** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23**
24** In addition, as a special exception, Nokia gives you certain additional
25** rights. These rights are described in the Nokia Qt LGPL Exception
26** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you have questions regarding the use of this file, please contact
37** Nokia at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qhelpenginecore.h"
43#include "qhelpsearchindexreader_default_p.h"
44
45#include <QtCore/QDir>
46#include <QtCore/QUrl>
47#include <QtCore/QFile>
48#include <QtCore/QVariant>
49#include <QtCore/QFileInfo>
50#include <QtCore/QDataStream>
51#include <QtCore/QTextStream>
52
53QT_BEGIN_NAMESPACE
54
55namespace fulltextsearch {
56namespace std {
57
58namespace {
59 QStringList split( const QString &str )
60 {
61 QStringList lst;
62 int j = 0;
63 int i = str.indexOf(QLatin1Char('*'), j );
64
65 if (str.startsWith(QLatin1String("*")))
66 lst << QLatin1String("*");
67
68 while ( i != -1 ) {
69 if ( i > j && i <= (int)str.length() ) {
70 lst << str.mid( j, i - j );
71 lst << QLatin1String("*");
72 }
73 j = i + 1;
74 i = str.indexOf(QLatin1Char('*'), j );
75 }
76
77 int l = str.length() - 1;
78 if ( str.mid( j, l - j + 1 ).length() > 0 )
79 lst << str.mid( j, l - j + 1 );
80
81 return lst;
82 }
83}
84
85
86Reader::Reader()
87 : indexPath(QString())
88 , indexFile(QString())
89 , documentFile(QString())
90{
91 termList.clear();
92 indexTable.clear();
93 searchIndexTable.clear();
94}
95
96Reader::~Reader()
97{
98 reset();
99 searchIndexTable.clear();
100}
101
102bool Reader::readIndex()
103{
104 if (indexTable.contains(indexFile))
105 return true;
106
107 QFile idxFile(indexFile);
108 if (!idxFile.open(QFile::ReadOnly))
109 return false;
110
111 QString key;
112 int numOfDocs;
113 EntryTable entryTable;
114 QVector<Document> docs;
115 QDataStream dictStream(&idxFile);
116 while (!dictStream.atEnd()) {
117 dictStream >> key;
118 dictStream >> numOfDocs;
119 docs.resize(numOfDocs);
120 dictStream >> docs;
121 entryTable.insert(key, new Entry(docs));
122 }
123 idxFile.close();
124
125 if (entryTable.isEmpty())
126 return false;
127
128 QFile docFile(documentFile);
129 if (!docFile.open(QFile::ReadOnly))
130 return false;
131
132 QString title, url;
133 DocumentList documentList;
134 QDataStream docStream(&docFile);
135 while (!docStream.atEnd()) {
136 docStream >> title;
137 docStream >> url;
138 documentList.append(QStringList(title) << url);
139 }
140 docFile.close();
141
142 if (documentList.isEmpty()) {
143 cleanupIndex(entryTable);
144 return false;
145 }
146
147 indexTable.insert(indexFile, Index(entryTable, documentList));
148 return true;
149}
150
151bool Reader::initCheck() const
152{
153 return !searchIndexTable.isEmpty();
154}
155
156void Reader::setIndexPath(const QString &path)
157{
158 indexPath = path;
159}
160
161void Reader::filterFilesForAttributes(const QStringList &attributes)
162{
163 searchIndexTable.clear();
164 for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) {
165 const QString fileName = it.key();
166 bool containsAll = true;
167 QStringList split = fileName.split(QLatin1String("@"));
168 foreach (const QString &attribute, attributes) {
169 if (!split.contains(attribute, Qt::CaseInsensitive)) {
170 containsAll = false;
171 break;
172 }
173 }
174
175 if (containsAll)
176 searchIndexTable.insert(fileName, it.value());
177 }
178}
179
180void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
181{
182 QString extension = namespaceName + QLatin1String("@") + attributes;
183 indexFile = indexPath + QLatin1String("/indexdb40.") + extension;
184 documentFile = indexPath + QLatin1String("/indexdoc40.") + extension;
185}
186
187bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
188 QStringList *termSeq, QStringList *seqWords)
189{
190 QString term = searchTerm;
191
192 term = term.simplified();
193 term = term.replace(QLatin1String("\'"), QLatin1String("\""));
194 term = term.replace(QLatin1String("`"), QLatin1String("\""));
195 term = term.replace(QLatin1String("-"), QLatin1String(" "));
196 term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
197
198 *terms = term.split(QLatin1Char(' '));
199 QStringList::iterator it = terms->begin();
200 for (; it != terms->end(); ++it) {
201 (*it) = (*it).simplified();
202 (*it) = (*it).toLower();
203 (*it) = (*it).replace(QLatin1String("\""), QLatin1String(""));
204 }
205
206 if (term.contains(QLatin1Char('\"'))) {
207 if ((term.count(QLatin1Char('\"')))%2 == 0) {
208 int beg = 0;
209 int end = 0;
210 QString s;
211 beg = term.indexOf(QLatin1Char('\"'), beg);
212 while (beg != -1) {
213 beg++;
214 end = term.indexOf(QLatin1Char('\"'), beg);
215 s = term.mid(beg, end - beg);
216 s = s.toLower();
217 s = s.simplified();
218 if (s.contains(QLatin1Char('*'))) {
219 qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
220 return false;
221 }
222 *seqWords += s.split(QLatin1Char(' '));
223 *termSeq << s;
224 beg = term.indexOf(QLatin1Char('\"'), end + 1);
225 }
226 } else {
227 qWarning("Full Text Search, the closing quotation mark is missing.");
228 return false;
229 }
230 }
231
232 return true;
233}
234
235void Reader::searchInIndex(const QStringList &terms)
236{
237 foreach (const QString &term, terms) {
238 QVector<Document> documents;
239
240 for(IndexTable::ConstIterator it = searchIndexTable.begin();
241 it != searchIndexTable.end(); ++it) {
242 EntryTable entryTable = it.value().first;
243 DocumentList documentList = it.value().second;
244
245 if (term.contains(QLatin1Char('*')))
246 documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
247 else if (entryTable.value(term))
248 documents = entryTable.value(term)->documents;
249 else
250 continue;
251
252 if (!documents.isEmpty()) {
253 DocumentInfo info;
254 QString title, url;
255 QVector<DocumentInfo> documentsInfo;
256 foreach(const Document &doc, documents) {
257 info.docNumber = doc.docNumber;
258 info.frequency = doc.frequency;
259 info.documentUrl = documentList.at(doc.docNumber).at(1);
260 info.documentTitle = documentList.at(doc.docNumber).at(0);
261 documentsInfo.append(info);
262 }
263
264 bool found = false;
265 for(QList<TermInfo>::Iterator tit = termList.begin();
266 tit != termList.end(); ++tit) {
267 TermInfo *t = &(*tit);
268 if(t->term == term) {
269 t->documents += documentsInfo;
270 t->frequency += documentsInfo.count();
271 found = true; break;
272 }
273 }
274 if (!found)
275 termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
276 }
277 }
278 }
279 qSort(termList);
280}
281
282QVector<DocumentInfo> Reader::hits()
283{
284 QVector<DocumentInfo> documents;
285 if (!termList.count())
286 return documents;
287
288 documents = termList.takeFirst().documents;
289 for(QList<TermInfo>::Iterator it = termList.begin(); it != termList.end(); ++it) {
290 TermInfo *t = &(*it);
291 QVector<DocumentInfo> docs = t->documents;
292 for(QVector<DocumentInfo>::Iterator minDoc_it = documents.begin();
293 minDoc_it != documents.end(); ) {
294 bool found = false;
295 for (QVector<DocumentInfo>::ConstIterator doc_it = docs.constBegin();
296 doc_it != docs.constEnd(); ++doc_it ) {
297 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
298 (*minDoc_it).frequency += (*doc_it).frequency;
299 found = true;
300 break;
301 }
302 }
303 if (!found)
304 minDoc_it = documents.erase(minDoc_it);
305 else
306 ++minDoc_it;
307 }
308 }
309
310 qSort(documents);
311 return documents;
312}
313
314bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
315 const QByteArray &data)
316{
317 if (data.isEmpty())
318 return false;
319
320 for(QHash<QString, PosEntry*>::ConstIterator mit =
321 miniIndex.begin(); mit != miniIndex.end(); ++mit) {
322 delete mit.value();
323 }
324 miniIndex.clear();
325
326 wordNum = 3;
327 QStringList::ConstIterator cIt = words.begin();
328 for ( ; cIt != words.end(); ++cIt )
329 miniIndex.insert(*cIt, new PosEntry(0));
330
331 QTextStream s(data);
332 QString text = s.readAll();
333 bool valid = true;
334 const QChar *buf = text.unicode();
335 QChar str[64];
336 QChar c = buf[0];
337 int j = 0;
338 int i = 0;
339 while ( j < text.length() ) {
340 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
341 valid = false;
342 if ( i > 1 )
343 buildMiniIndex( QString(str,i) );
344 i = 0;
345 c = buf[++j];
346 continue;
347 }
348 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
349 valid = true;
350 c = buf[++j];
351 continue;
352 }
353 if ( !valid ) {
354 c = buf[++j];
355 continue;
356 }
357 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
358 str[i] = c.toLower();
359 ++i;
360 } else {
361 if ( i > 1 )
362 buildMiniIndex( QString(str,i) );
363 i = 0;
364 }
365 c = buf[++j];
366 }
367 if ( i > 1 )
368 buildMiniIndex( QString(str,i) );
369
370 QStringList::ConstIterator patIt = patterns.begin();
371 QStringList wordLst;
372 QList<uint> a, b;
373 QList<uint>::iterator aIt;
374 for ( ; patIt != patterns.end(); ++patIt ) {
375 wordLst = (*patIt).split(QLatin1Char(' '));
376 a = miniIndex[ wordLst[0] ]->positions;
377 for ( int j = 1; j < (int)wordLst.count(); ++j ) {
378 b = miniIndex[ wordLst[j] ]->positions;
379 aIt = a.begin();
380 while ( aIt != a.end() ) {
381 if ( b.contains( *aIt + 1 )) {
382 (*aIt)++;
383 ++aIt;
384 } else {
385 aIt = a.erase( aIt );
386 }
387 }
388 }
389 }
390 if ( a.count() )
391 return true;
392 return false;
393}
394
395QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
396 const EntryTable &entryTable)
397{
398 QList<Term> termList;
399 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) {
400 if (entryTable.value(*it)) {
401 Entry *e = entryTable.value(*it);
402 termList.append(Term(*it, e->documents.count(), e->documents ) );
403 }
404 }
405 QVector<Document> maxList(0);
406 if ( !termList.count() )
407 return maxList;
408 qSort(termList);
409
410 maxList = termList.takeLast().documents;
411 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
412 Term *t = &(*it);
413 QVector<Document> docs = t->documents;
414 for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
415 if ( maxList.indexOf( *docIt ) == -1 )
416 maxList.append( *docIt );
417 }
418 }
419 return maxList;
420}
421
422QStringList Reader::getWildcardTerms(const QString &term,
423 const EntryTable &entryTable)
424{
425 QStringList lst;
426 QStringList terms = split(term);
427 QStringList::Iterator iter;
428
429 for(EntryTable::ConstIterator it = entryTable.begin();
430 it != entryTable.end(); ++it) {
431 int index = 0;
432 bool found = false;
433 QString text( it.key() );
434 for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
435 if ( *iter == QLatin1String("*") ) {
436 found = true;
437 continue;
438 }
439 if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
440 found = false;
441 break;
442 }
443 index = text.indexOf( *iter, index );
444 if ( *iter == terms.last() && index != (int)text.length()-1 ) {
445 index = text.lastIndexOf( *iter );
446 if ( index != (int)text.length() - (int)(*iter).length() ) {
447 found = false;
448 break;
449 }
450 }
451 if ( index != -1 ) {
452 found = true;
453 index += (*iter).length();
454 continue;
455 } else {
456 found = false;
457 break;
458 }
459 }
460 if (found)
461 lst << text;
462 }
463
464 return lst;
465}
466
467void Reader::buildMiniIndex(const QString &string)
468{
469 if (miniIndex[string])
470 miniIndex[string]->positions.append(wordNum);
471 ++wordNum;
472}
473
474void Reader::reset()
475{
476 for(IndexTable::Iterator it = indexTable.begin();
477 it != indexTable.end(); ++it) {
478 cleanupIndex(it.value().first);
479 it.value().second.clear();
480 }
481}
482
483void Reader::cleanupIndex(EntryTable &entryTable)
484{
485 for(EntryTable::ConstIterator it =
486 entryTable.begin(); it != entryTable.end(); ++it) {
487 delete it.value();
488 }
489
490 entryTable.clear();
491}
492
493
494QHelpSearchIndexReaderDefault::QHelpSearchIndexReaderDefault()
495 : QHelpSearchIndexReader()
496{
497 // nothing todo
498}
499
500QHelpSearchIndexReaderDefault::~QHelpSearchIndexReaderDefault()
501{
502}
503
504void QHelpSearchIndexReaderDefault::run()
505{
506 mutex.lock();
507
508 if (m_cancel) {
509 mutex.unlock();
510 return;
511 }
512
513 const QList<QHelpSearchQuery> &queryList = this->m_query;
514 const QLatin1String key("DefaultSearchNamespaces");
515 const QString collectionFile(this->m_collectionFile);
516 const QString indexPath = m_indexFilesFolder;
517
518 mutex.unlock();
519
520 QString queryTerm;
521 foreach (const QHelpSearchQuery &query, queryList) {
522 if (query.fieldName == QHelpSearchQuery::DEFAULT) {
523 queryTerm = query.wordList.at(0);
524 break;
525 }
526 }
527
528 if (queryTerm.isEmpty())
529 return;
530
531 QHelpEngineCore engine(collectionFile, 0);
532 if (!engine.setupData())
533 return;
534
535 const QStringList registeredDocs = engine.registeredDocumentations();
536 const QStringList indexedNamespaces = engine.customValue(key).toString().
537 split(QLatin1String("|"), QString::SkipEmptyParts);
538
539 emit searchingStarted();
540
541 // setup the reader
542 m_reader.setIndexPath(indexPath);
543 foreach(const QString &namespaceName, registeredDocs) {
544 mutex.lock();
545 if (m_cancel) {
546 mutex.unlock();
547 searchingFinished(0); // TODO: check this ???
548 return;
549 }
550 mutex.unlock();
551
552 const QList<QStringList> attributeSets =
553 engine.filterAttributeSets(namespaceName);
554
555 foreach (const QStringList &attributes, attributeSets) {
556 // read all index files
557 m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
558 if (!m_reader.readIndex()) {
559 qWarning("Full Text Search, could not read file for namespace: %s.",
560 namespaceName.toUtf8().constData());
561 }
562 }
563 }
564
565 // get the current filter attributes and minimize the index files table
566 m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
567
568 hitList.clear();
569 QStringList terms, termSeq, seqWords;
570 if (m_reader.initCheck() && // check if we could read anything
571 m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
572
573 // search for term(s)
574 m_reader.searchInIndex(terms); // TODO: should this be interruptible as well ???
575
576 QVector<DocumentInfo> hits = m_reader.hits();
577 if (!hits.isEmpty()) {
578 if (termSeq.isEmpty()) {
579 foreach (const DocumentInfo &docInfo, hits) {
580 mutex.lock();
581 if (m_cancel) {
582 mutex.unlock();
583 searchingFinished(0); // TODO: check this, speed issue while locking???
584 return;
585 }
586 mutex.unlock();
587 hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
588 }
589 } else {
590 foreach (const DocumentInfo &docInfo, hits) {
591 mutex.lock();
592 if (m_cancel) {
593 mutex.unlock();
594 searchingFinished(0); // TODO: check this, speed issue while locking???
595 return;
596 }
597 mutex.unlock();
598
599 if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
600 hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
601 }
602 }
603 }
604 }
605
606 emit searchingFinished(hitList.count());
607}
608
609} // namespace std
610} // namespace fulltextsearch
611
612QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.