source: trunk/tools/assistant/lib/qhelpsearchindexreader_default.cpp@ 109

Last change on this file since 109 was 2, checked in by Dmitry A. Kuminov, 16 years ago

Initially imported qt-all-opensource-src-4.5.1 from Trolltech.

File size: 19.6 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5**
6** This file is part of the Qt Assistant of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qhelpenginecore.h"
43#include "qhelpsearchindexreader_default_p.h"
44
45#include <QtCore/QDir>
46#include <QtCore/QUrl>
47#include <QtCore/QFile>
48#include <QtCore/QVariant>
49#include <QtCore/QFileInfo>
50#include <QtCore/QDataStream>
51#include <QtCore/QTextStream>
52
53QT_BEGIN_NAMESPACE
54
55namespace qt {
56 namespace fulltextsearch {
57 namespace std {
58
59namespace {
60 QStringList split( const QString &str )
61 {
62 QStringList lst;
63 int j = 0;
64 int i = str.indexOf(QLatin1Char('*'), j );
65
66 if (str.startsWith(QLatin1String("*")))
67 lst << QLatin1String("*");
68
69 while ( i != -1 ) {
70 if ( i > j && i <= (int)str.length() ) {
71 lst << str.mid( j, i - j );
72 lst << QLatin1String("*");
73 }
74 j = i + 1;
75 i = str.indexOf(QLatin1Char('*'), j );
76 }
77
78 int l = str.length() - 1;
79 if ( str.mid( j, l - j + 1 ).length() > 0 )
80 lst << str.mid( j, l - j + 1 );
81
82 return lst;
83 }
84}
85
86
87Reader::Reader()
88 : indexPath(QString())
89 , indexFile(QString())
90 , documentFile(QString())
91{
92 termList.clear();
93 indexTable.clear();
94 searchIndexTable.clear();
95}
96
97Reader::~Reader()
98{
99 reset();
100 searchIndexTable.clear();
101}
102
103bool Reader::readIndex()
104{
105 if (indexTable.contains(indexFile))
106 return true;
107
108 QFile idxFile(indexFile);
109 if (!idxFile.open(QFile::ReadOnly))
110 return false;
111
112 QString key;
113 int numOfDocs;
114 EntryTable entryTable;
115 QVector<Document> docs;
116 QDataStream dictStream(&idxFile);
117 while (!dictStream.atEnd()) {
118 dictStream >> key;
119 dictStream >> numOfDocs;
120 docs.resize(numOfDocs);
121 dictStream >> docs;
122 entryTable.insert(key, new Entry(docs));
123 }
124 idxFile.close();
125
126 if (entryTable.isEmpty())
127 return false;
128
129 QFile docFile(documentFile);
130 if (!docFile.open(QFile::ReadOnly))
131 return false;
132
133 QString title, url;
134 DocumentList documentList;
135 QDataStream docStream(&docFile);
136 while (!docStream.atEnd()) {
137 docStream >> title;
138 docStream >> url;
139 documentList.append(QStringList(title) << url);
140 }
141 docFile.close();
142
143 if (documentList.isEmpty()) {
144 cleanupIndex(entryTable);
145 return false;
146 }
147
148 indexTable.insert(indexFile, Index(entryTable, documentList));
149 return true;
150}
151
152bool Reader::initCheck() const
153{
154 return !searchIndexTable.isEmpty();
155}
156
157void Reader::setIndexPath(const QString &path)
158{
159 indexPath = path;
160}
161
162void Reader::filterFilesForAttributes(const QStringList &attributes)
163{
164 searchIndexTable.clear();
165 for(IndexTable::ConstIterator it = indexTable.begin(); it != indexTable.end(); ++it) {
166 const QString fileName = it.key();
167 bool containsAll = true;
168 QStringList split = fileName.split(QLatin1String("@"));
169 foreach (const QString attribute, attributes) {
170 if (!split.contains(attribute, Qt::CaseInsensitive)) {
171 containsAll = false;
172 break;
173 }
174 }
175
176 if (containsAll)
177 searchIndexTable.insert(fileName, it.value());
178 }
179}
180
181void Reader::setIndexFile(const QString &namespaceName, const QString &attributes)
182{
183 QString extention = namespaceName + QLatin1String("@") + attributes;
184 indexFile = indexPath + QLatin1String("/indexdb40.") + extention;
185 documentFile = indexPath + QLatin1String("/indexdoc40.") + extention;
186}
187
188bool Reader::splitSearchTerm(const QString &searchTerm, QStringList *terms,
189 QStringList *termSeq, QStringList *seqWords)
190{
191 QString term = searchTerm;
192
193 term = term.simplified();
194 term = term.replace(QLatin1String("\'"), QLatin1String("\""));
195 term = term.replace(QLatin1String("`"), QLatin1String("\""));
196 term = term.replace(QLatin1String("-"), QLatin1String(" "));
197 term = term.replace(QRegExp(QLatin1String("\\s[\\S]?\\s")), QLatin1String(" "));
198
199 *terms = term.split(QLatin1Char(' '));
200 QStringList::iterator it = terms->begin();
201 for (; it != terms->end(); ++it) {
202 (*it) = (*it).simplified();
203 (*it) = (*it).toLower();
204 (*it) = (*it).replace(QLatin1String("\""), QLatin1String(""));
205 }
206
207 if (term.contains(QLatin1Char('\"'))) {
208 if ((term.count(QLatin1Char('\"')))%2 == 0) {
209 int beg = 0;
210 int end = 0;
211 QString s;
212 beg = term.indexOf(QLatin1Char('\"'), beg);
213 while (beg != -1) {
214 beg++;
215 end = term.indexOf(QLatin1Char('\"'), beg);
216 s = term.mid(beg, end - beg);
217 s = s.toLower();
218 s = s.simplified();
219 if (s.contains(QLatin1Char('*'))) {
220 qWarning("Full Text Search, using a wildcard within phrases is not allowed.");
221 return false;
222 }
223 *seqWords += s.split(QLatin1Char(' '));
224 *termSeq << s;
225 beg = term.indexOf(QLatin1Char('\"'), end + 1);
226 }
227 } else {
228 qWarning("Full Text Search, the closing quotation mark is missing.");
229 return false;
230 }
231 }
232
233 return true;
234}
235
236void Reader::searchInIndex(const QStringList &terms)
237{
238 foreach (const QString term, terms) {
239 QVector<Document> documents;
240
241 for(IndexTable::ConstIterator it = searchIndexTable.begin();
242 it != searchIndexTable.end(); ++it) {
243 EntryTable entryTable = it.value().first;
244 DocumentList documentList = it.value().second;
245
246 if (term.contains(QLatin1Char('*')))
247 documents = setupDummyTerm(getWildcardTerms(term, entryTable), entryTable);
248 else if (entryTable.value(term))
249 documents = entryTable.value(term)->documents;
250 else
251 continue;
252
253 if (!documents.isEmpty()) {
254 DocumentInfo info;
255 QString title, url;
256 QVector<DocumentInfo> documentsInfo;
257 foreach(const Document doc, documents) {
258 info.docNumber = doc.docNumber;
259 info.frequency = doc.frequency;
260 info.documentUrl = documentList.at(doc.docNumber).at(1);
261 info.documentTitle = documentList.at(doc.docNumber).at(0);
262 documentsInfo.append(info);
263 }
264
265 bool found = false;
266 for(QList<TermInfo>::Iterator tit = termList.begin();
267 tit != termList.end(); ++tit) {
268 TermInfo *t = &(*tit);
269 if(t->term == term) {
270 t->documents += documentsInfo;
271 t->frequency += documentsInfo.count();
272 found = true; break;
273 }
274 }
275 if (!found)
276 termList.append(TermInfo(term, documentsInfo.count(), documentsInfo));
277 }
278 }
279 }
280 qSort(termList);
281}
282
283QVector<DocumentInfo> Reader::hits()
284{
285 QVector<DocumentInfo> documents;
286 if (!termList.count())
287 return documents;
288
289 documents = termList.takeFirst().documents;
290 for(QList<TermInfo>::Iterator it = termList.begin(); it != termList.end(); ++it) {
291 TermInfo *t = &(*it);
292 QVector<DocumentInfo> docs = t->documents;
293 for(QVector<DocumentInfo>::Iterator minDoc_it = documents.begin();
294 minDoc_it != documents.end(); ) {
295 bool found = false;
296 for (QVector<DocumentInfo>::ConstIterator doc_it = docs.constBegin();
297 doc_it != docs.constEnd(); ++doc_it ) {
298 if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
299 (*minDoc_it).frequency += (*doc_it).frequency;
300 found = true;
301 break;
302 }
303 }
304 if (!found)
305 minDoc_it = documents.erase(minDoc_it);
306 else
307 ++minDoc_it;
308 }
309 }
310
311 qSort(documents);
312 return documents;
313}
314
315bool Reader::searchForPattern(const QStringList &patterns, const QStringList &words,
316 const QByteArray &data)
317{
318 if (data.isEmpty())
319 return false;
320
321 for(QHash<QString, PosEntry*>::ConstIterator mit =
322 miniIndex.begin(); mit != miniIndex.end(); ++mit) {
323 delete mit.value();
324 }
325 miniIndex.clear();
326
327 wordNum = 3;
328 QStringList::ConstIterator cIt = words.begin();
329 for ( ; cIt != words.end(); ++cIt )
330 miniIndex.insert(*cIt, new PosEntry(0));
331
332 QTextStream s(data);
333 QString text = s.readAll();
334 bool valid = true;
335 const QChar *buf = text.unicode();
336 QChar str[64];
337 QChar c = buf[0];
338 int j = 0;
339 int i = 0;
340 while ( j < text.length() ) {
341 if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
342 valid = false;
343 if ( i > 1 )
344 buildMiniIndex( QString(str,i) );
345 i = 0;
346 c = buf[++j];
347 continue;
348 }
349 if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
350 valid = true;
351 c = buf[++j];
352 continue;
353 }
354 if ( !valid ) {
355 c = buf[++j];
356 continue;
357 }
358 if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
359 str[i] = c.toLower();
360 ++i;
361 } else {
362 if ( i > 1 )
363 buildMiniIndex( QString(str,i) );
364 i = 0;
365 }
366 c = buf[++j];
367 }
368 if ( i > 1 )
369 buildMiniIndex( QString(str,i) );
370
371 QStringList::ConstIterator patIt = patterns.begin();
372 QStringList wordLst;
373 QList<uint> a, b;
374 QList<uint>::iterator aIt;
375 for ( ; patIt != patterns.end(); ++patIt ) {
376 wordLst = (*patIt).split(QLatin1Char(' '));
377 a = miniIndex[ wordLst[0] ]->positions;
378 for ( int j = 1; j < (int)wordLst.count(); ++j ) {
379 b = miniIndex[ wordLst[j] ]->positions;
380 aIt = a.begin();
381 while ( aIt != a.end() ) {
382 if ( b.contains( *aIt + 1 )) {
383 (*aIt)++;
384 ++aIt;
385 } else {
386 aIt = a.erase( aIt );
387 }
388 }
389 }
390 }
391 if ( a.count() )
392 return true;
393 return false;
394}
395
396QVector<Document> Reader::setupDummyTerm(const QStringList &terms,
397 const EntryTable &entryTable)
398{
399 QList<Term> termList;
400 for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it) {
401 if (entryTable.value(*it)) {
402 Entry *e = entryTable.value(*it);
403 termList.append(Term(*it, e->documents.count(), e->documents ) );
404 }
405 }
406 QVector<Document> maxList(0);
407 if ( !termList.count() )
408 return maxList;
409 qSort(termList);
410
411 maxList = termList.takeLast().documents;
412 for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
413 Term *t = &(*it);
414 QVector<Document> docs = t->documents;
415 for (QVector<Document>::iterator docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
416 if ( maxList.indexOf( *docIt ) == -1 )
417 maxList.append( *docIt );
418 }
419 }
420 return maxList;
421}
422
423QStringList Reader::getWildcardTerms(const QString &term,
424 const EntryTable &entryTable)
425{
426 QStringList lst;
427 QStringList terms = split(term);
428 QStringList::Iterator iter;
429
430 for(EntryTable::ConstIterator it = entryTable.begin();
431 it != entryTable.end(); ++it) {
432 int index = 0;
433 bool found = false;
434 QString text( it.key() );
435 for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
436 if ( *iter == QLatin1String("*") ) {
437 found = true;
438 continue;
439 }
440 if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
441 found = false;
442 break;
443 }
444 index = text.indexOf( *iter, index );
445 if ( *iter == terms.last() && index != (int)text.length()-1 ) {
446 index = text.lastIndexOf( *iter );
447 if ( index != (int)text.length() - (int)(*iter).length() ) {
448 found = false;
449 break;
450 }
451 }
452 if ( index != -1 ) {
453 found = true;
454 index += (*iter).length();
455 continue;
456 } else {
457 found = false;
458 break;
459 }
460 }
461 if (found)
462 lst << text;
463 }
464
465 return lst;
466}
467
468void Reader::buildMiniIndex(const QString &string)
469{
470 if (miniIndex[string])
471 miniIndex[string]->positions.append(wordNum);
472 ++wordNum;
473}
474
475void Reader::reset()
476{
477 for(IndexTable::Iterator it = indexTable.begin();
478 it != indexTable.end(); ++it) {
479 cleanupIndex(it.value().first);
480 it.value().second.clear();
481 }
482}
483
484void Reader::cleanupIndex(EntryTable &entryTable)
485{
486 for(EntryTable::ConstIterator it =
487 entryTable.begin(); it != entryTable.end(); ++it) {
488 delete it.value();
489 }
490
491 entryTable.clear();
492}
493
494
495QHelpSearchIndexReader::QHelpSearchIndexReader()
496 : QThread()
497 , m_cancel(false)
498{
499 // nothing todo
500}
501
502QHelpSearchIndexReader::~QHelpSearchIndexReader()
503{
504 mutex.lock();
505 this->m_cancel = true;
506 waitCondition.wakeOne();
507 mutex.unlock();
508
509 wait();
510}
511
512void QHelpSearchIndexReader::cancelSearching()
513{
514 mutex.lock();
515 this->m_cancel = true;
516 mutex.unlock();
517}
518
519void QHelpSearchIndexReader::search(const QString &collectionFile,
520 const QString &indexFilesFolder,
521 const QList<QHelpSearchQuery> &queryList)
522{
523 QMutexLocker lock(&mutex);
524
525 this->hitList.clear();
526 this->m_cancel = false;
527 this->m_query = queryList;
528 this->m_collectionFile = collectionFile;
529 this->m_indexFilesFolder = indexFilesFolder;
530
531 start(QThread::NormalPriority);
532}
533
534int QHelpSearchIndexReader::hitsCount() const
535{
536 return hitList.count();
537}
538
539QHelpSearchEngine::SearchHit QHelpSearchIndexReader::hit(int index) const
540{
541 return hitList.at(index);
542}
543
544void QHelpSearchIndexReader::run()
545{
546 mutex.lock();
547
548 if (m_cancel) {
549 mutex.unlock();
550 return;
551 }
552
553 const QList<QHelpSearchQuery> &queryList = this->m_query;
554 const QLatin1String key("DefaultSearchNamespaces");
555 const QString collectionFile(this->m_collectionFile);
556 const QString indexPath = m_indexFilesFolder;
557
558 mutex.unlock();
559
560 QString queryTerm;
561 foreach (const QHelpSearchQuery query, queryList) {
562 if (query.fieldName == QHelpSearchQuery::DEFAULT) {
563 queryTerm = query.wordList.at(0);
564 break;
565 }
566 }
567
568 if (queryTerm.isEmpty())
569 return;
570
571 QHelpEngineCore engine(collectionFile, 0);
572 if (!engine.setupData())
573 return;
574
575 const QStringList registeredDocs = engine.registeredDocumentations();
576 const QStringList indexedNamespaces = engine.customValue(key).toString().
577 split(QLatin1String("|"), QString::SkipEmptyParts);
578
579 emit searchingStarted();
580
581 // setup the reader
582 m_reader.setIndexPath(indexPath);
583 foreach(const QString namespaceName, registeredDocs) {
584 mutex.lock();
585 if (m_cancel) {
586 mutex.unlock();
587 searchingFinished(0); // TODO: check this ???
588 return;
589 }
590 mutex.unlock();
591
592 const QList<QStringList> attributeSets =
593 engine.filterAttributeSets(namespaceName);
594
595 foreach (QStringList attributes, attributeSets) {
596 // read all index files
597 m_reader.setIndexFile(namespaceName, attributes.join(QLatin1String("@")));
598 if (!m_reader.readIndex()) {
599 qWarning("Full Text Search, could not read file for namespace: %s.",
600 namespaceName.toUtf8().constData());
601 }
602 }
603 }
604
605 // get the current filter attributes and minimize the index files table
606 m_reader.filterFilesForAttributes(engine.filterAttributes(engine.currentFilter()));
607
608 hitList.clear();
609 QStringList terms, termSeq, seqWords;
610 if (m_reader.initCheck() && // check if we could read anything
611 m_reader.splitSearchTerm(queryTerm, &terms, &termSeq, &seqWords) ) {
612
613 // search for term(s)
614 m_reader.searchInIndex(terms); // TODO: should this be interruptible as well ???
615
616 QVector<DocumentInfo> hits = m_reader.hits();
617 if (!hits.isEmpty()) {
618 if (termSeq.isEmpty()) {
619 foreach (const DocumentInfo docInfo, hits) {
620 mutex.lock();
621 if (m_cancel) {
622 mutex.unlock();
623 searchingFinished(0); // TODO: check this, speed issue while locking???
624 return;
625 }
626 mutex.unlock();
627 hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
628 }
629 } else {
630 foreach (const DocumentInfo docInfo, hits) {
631 mutex.lock();
632 if (m_cancel) {
633 mutex.unlock();
634 searchingFinished(0); // TODO: check this, speed issue while locking???
635 return;
636 }
637 mutex.unlock();
638
639 if (m_reader.searchForPattern(termSeq, seqWords, engine.fileData(docInfo.documentUrl))) // TODO: should this be interruptible as well ???
640 hitList.append(qMakePair(docInfo.documentTitle, docInfo.documentUrl));
641 }
642 }
643 }
644 }
645
646 emit searchingFinished(hitList.count());
647}
648
649 } // namespace std
650 } // namespace fulltextsearch
651} // namespace qt
652
653QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.