1 | /****************************************************************************
|
---|
2 | **
|
---|
3 | ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
|
---|
4 | ** Contact: Qt Software Information (qt-info@nokia.com)
|
---|
5 | **
|
---|
6 | ** This file is part of the Qt Assistant of the Qt Toolkit.
|
---|
7 | **
|
---|
8 | ** $QT_BEGIN_LICENSE:LGPL$
|
---|
9 | ** Commercial Usage
|
---|
10 | ** Licensees holding valid Qt Commercial licenses may use this file in
|
---|
11 | ** accordance with the Qt Commercial License Agreement provided with the
|
---|
12 | ** Software or, alternatively, in accordance with the terms contained in
|
---|
13 | ** a written agreement between you and Nokia.
|
---|
14 | **
|
---|
15 | ** GNU Lesser General Public License Usage
|
---|
16 | ** Alternatively, this file may be used under the terms of the GNU Lesser
|
---|
17 | ** General Public License version 2.1 as published by the Free Software
|
---|
18 | ** Foundation and appearing in the file LICENSE.LGPL included in the
|
---|
19 | ** packaging of this file. Please review the following information to
|
---|
20 | ** ensure the GNU Lesser General Public License version 2.1 requirements
|
---|
21 | ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
|
---|
22 | **
|
---|
23 | ** In addition, as a special exception, Nokia gives you certain
|
---|
24 | ** additional rights. These rights are described in the Nokia Qt LGPL
|
---|
25 | ** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
|
---|
26 | ** package.
|
---|
27 | **
|
---|
28 | ** GNU General Public License Usage
|
---|
29 | ** Alternatively, this file may be used under the terms of the GNU
|
---|
30 | ** General Public License version 3.0 as published by the Free Software
|
---|
31 | ** Foundation and appearing in the file LICENSE.GPL included in the
|
---|
32 | ** packaging of this file. Please review the following information to
|
---|
33 | ** ensure the GNU General Public License version 3.0 requirements will be
|
---|
34 | ** met: http://www.gnu.org/copyleft/gpl.html.
|
---|
35 | **
|
---|
36 | ** If you are unsure which license is appropriate for your use, please
|
---|
37 | ** contact the sales department at qt-sales@nokia.com.
|
---|
38 | ** $QT_END_LICENSE$
|
---|
39 | **
|
---|
40 | ****************************************************************************/
|
---|
41 |
|
---|
42 | #include "index.h"
|
---|
43 |
|
---|
44 | #include <QFile>
|
---|
45 | #include <QDir>
|
---|
46 | #include <QStringList>
|
---|
47 | #include <QApplication>
|
---|
48 | #include <QByteArray>
|
---|
49 | #include <QTextStream>
|
---|
50 | #include <QtAlgorithms>
|
---|
51 | #include <QUrl>
|
---|
52 | #include <QTextCodec>
|
---|
53 | #include <ctype.h>
|
---|
54 | #include <QTextDocument>
|
---|
55 |
|
---|
56 | QT_BEGIN_NAMESPACE
|
---|
57 |
|
---|
58 | struct Term {
|
---|
59 | Term() : frequency(-1) {}
|
---|
60 | Term( const QString &t, int f, QVector<Document> l ) : term( t ), frequency( f ), documents( l ) {}
|
---|
61 | QString term;
|
---|
62 | int frequency;
|
---|
63 | QVector<Document>documents;
|
---|
64 | bool operator<( const Term &i2 ) const { return frequency < i2.frequency; }
|
---|
65 | };
|
---|
66 |
|
---|
67 | QDataStream &operator>>( QDataStream &s, Document &l )
|
---|
68 | {
|
---|
69 | s >> l.docNumber;
|
---|
70 | s >> l.frequency;
|
---|
71 | return s;
|
---|
72 | }
|
---|
73 |
|
---|
74 | QDataStream &operator<<( QDataStream &s, const Document &l )
|
---|
75 | {
|
---|
76 | s << (qint16)l.docNumber;
|
---|
77 | s << (qint16)l.frequency;
|
---|
78 | return s;
|
---|
79 | }
|
---|
80 |
|
---|
81 | Index::Index( const QString &dp, const QString &hp )
|
---|
82 | : QObject( 0 ), docPath( dp )
|
---|
83 | {
|
---|
84 | Q_UNUSED(hp);
|
---|
85 |
|
---|
86 | alreadyHaveDocList = false;
|
---|
87 | lastWindowClosed = false;
|
---|
88 | connect( qApp, SIGNAL(lastWindowClosed()),
|
---|
89 | this, SLOT(setLastWinClosed()) );
|
---|
90 | }
|
---|
91 |
|
---|
92 | Index::Index( const QStringList &dl, const QString &hp )
|
---|
93 | : QObject( 0 )
|
---|
94 | {
|
---|
95 | Q_UNUSED(hp);
|
---|
96 | docList = dl;
|
---|
97 | alreadyHaveDocList = true;
|
---|
98 | lastWindowClosed = false;
|
---|
99 | connect( qApp, SIGNAL(lastWindowClosed()),
|
---|
100 | this, SLOT(setLastWinClosed()) );
|
---|
101 | }
|
---|
102 |
|
---|
103 | void Index::setLastWinClosed()
|
---|
104 | {
|
---|
105 | lastWindowClosed = true;
|
---|
106 | }
|
---|
107 |
|
---|
108 | void Index::setDictionaryFile( const QString &f )
|
---|
109 | {
|
---|
110 | dictFile = f;
|
---|
111 | }
|
---|
112 |
|
---|
113 | void Index::setDocListFile( const QString &f )
|
---|
114 | {
|
---|
115 | docListFile = f;
|
---|
116 | }
|
---|
117 |
|
---|
118 | void Index::setDocList( const QStringList &lst )
|
---|
119 | {
|
---|
120 | docList = lst;
|
---|
121 | }
|
---|
122 |
|
---|
123 | int Index::makeIndex()
|
---|
124 | {
|
---|
125 | if ( !alreadyHaveDocList )
|
---|
126 | setupDocumentList();
|
---|
127 | if ( docList.isEmpty() )
|
---|
128 | return 1;
|
---|
129 | QStringList::Iterator it = docList.begin();
|
---|
130 | int steps = docList.count() / 100;
|
---|
131 | if ( !steps )
|
---|
132 | steps++;
|
---|
133 | int prog = 0;
|
---|
134 | for ( int i = 0; it != docList.end(); ++it, ++i ) {
|
---|
135 | if ( lastWindowClosed ) {
|
---|
136 | return -1;
|
---|
137 | }
|
---|
138 | QUrl url(*it);
|
---|
139 | parseDocument( url.toLocalFile(), i );
|
---|
140 | if ( i%steps == 0 ) {
|
---|
141 | prog++;
|
---|
142 | emit indexingProgress( prog );
|
---|
143 | }
|
---|
144 | }
|
---|
145 | return 0;
|
---|
146 | }
|
---|
147 |
|
---|
148 | void Index::setupDocumentList()
|
---|
149 | {
|
---|
150 | QDir d( docPath );
|
---|
151 | QStringList filters;
|
---|
152 | filters.append(QLatin1String("*.html"));
|
---|
153 | QStringList lst = d.entryList(filters);
|
---|
154 | QStringList::ConstIterator it = lst.constBegin();
|
---|
155 | for ( ; it != lst.constEnd(); ++it )
|
---|
156 | docList.append( QLatin1String("file:") + docPath + QLatin1String("/") + *it );
|
---|
157 | }
|
---|
158 |
|
---|
159 | void Index::insertInDict( const QString &str, int docNum )
|
---|
160 | {
|
---|
161 | if ( str == QLatin1String("amp") || str == QLatin1String("nbsp"))
|
---|
162 | return;
|
---|
163 | Entry *e = 0;
|
---|
164 | if ( dict.count() )
|
---|
165 | e = dict[ str ];
|
---|
166 |
|
---|
167 | if ( e ) {
|
---|
168 | if ( e->documents.last().docNumber != docNum )
|
---|
169 | e->documents.append( Document(docNum, 1 ) );
|
---|
170 | else
|
---|
171 | e->documents.last().frequency++;
|
---|
172 | } else {
|
---|
173 | dict.insert( str, new Entry( docNum ) );
|
---|
174 | }
|
---|
175 | }
|
---|
176 |
|
---|
177 | QString Index::getCharsetForDocument(QFile *file)
|
---|
178 | {
|
---|
179 | QTextStream s(file);
|
---|
180 | QString contents = s.readAll();
|
---|
181 |
|
---|
182 | QString encoding;
|
---|
183 | int start = contents.indexOf(QLatin1String("<meta"), 0, Qt::CaseInsensitive);
|
---|
184 | if (start > 0) {
|
---|
185 | int end = contents.indexOf(QLatin1String(">"), start);
|
---|
186 | QString meta = contents.mid(start+5, end-start);
|
---|
187 | meta = meta.toLower();
|
---|
188 | QRegExp r(QLatin1String("charset=([^\"\\s]+)"));
|
---|
189 | if (r.indexIn(meta) != -1) {
|
---|
190 | encoding = r.cap(1);
|
---|
191 | }
|
---|
192 | }
|
---|
193 |
|
---|
194 | file->seek(0);
|
---|
195 | if (encoding.isEmpty())
|
---|
196 | return QLatin1String("utf-8");
|
---|
197 | return encoding;
|
---|
198 | }
|
---|
199 |
|
---|
200 | void Index::parseDocument( const QString &filename, int docNum )
|
---|
201 | {
|
---|
202 | QFile file( filename );
|
---|
203 | if ( !file.open(QFile::ReadOnly) ) {
|
---|
204 | qWarning( "can not open file %s", qPrintable(filename) );
|
---|
205 | return;
|
---|
206 | }
|
---|
207 |
|
---|
208 | QTextStream s(&file);
|
---|
209 | QString en = getCharsetForDocument(&file);
|
---|
210 | s.setCodec(QTextCodec::codecForName(en.toLatin1().constData()));
|
---|
211 |
|
---|
212 | QString text = s.readAll();
|
---|
213 | if (text.isNull())
|
---|
214 | return;
|
---|
215 |
|
---|
216 | bool valid = true;
|
---|
217 | const QChar *buf = text.unicode();
|
---|
218 | QChar str[64];
|
---|
219 | QChar c = buf[0];
|
---|
220 | int j = 0;
|
---|
221 | int i = 0;
|
---|
222 | while ( j < text.length() ) {
|
---|
223 | if ( c == QLatin1Char('<') || c == QLatin1Char('&') ) {
|
---|
224 | valid = false;
|
---|
225 | if ( i > 1 )
|
---|
226 | insertInDict( QString(str,i), docNum );
|
---|
227 | i = 0;
|
---|
228 | c = buf[++j];
|
---|
229 | continue;
|
---|
230 | }
|
---|
231 | if ( ( c == QLatin1Char('>') || c == QLatin1Char(';') ) && !valid ) {
|
---|
232 | valid = true;
|
---|
233 | c = buf[++j];
|
---|
234 | continue;
|
---|
235 | }
|
---|
236 | if ( !valid ) {
|
---|
237 | c = buf[++j];
|
---|
238 | continue;
|
---|
239 | }
|
---|
240 | if ( ( c.isLetterOrNumber() || c == QLatin1Char('_') ) && i < 63 ) {
|
---|
241 | str[i] = c.toLower();
|
---|
242 | ++i;
|
---|
243 | } else {
|
---|
244 | if ( i > 1 )
|
---|
245 | insertInDict( QString(str,i), docNum );
|
---|
246 | i = 0;
|
---|
247 | }
|
---|
248 | c = buf[++j];
|
---|
249 | }
|
---|
250 | if ( i > 1 )
|
---|
251 | insertInDict( QString(str,i), docNum );
|
---|
252 | file.close();
|
---|
253 | }
|
---|
254 |
|
---|
255 | void Index::writeDict()
|
---|
256 | {
|
---|
257 | QFile f( dictFile );
|
---|
258 | if ( !f.open(QFile::WriteOnly ) )
|
---|
259 | return;
|
---|
260 | QDataStream s( &f );
|
---|
261 | for(QHash<QString, Entry *>::Iterator it = dict.begin(); it != dict.end(); ++it) {
|
---|
262 | s << it.key();
|
---|
263 | s << it.value()->documents.count();
|
---|
264 | s << it.value()->documents;
|
---|
265 | }
|
---|
266 | f.close();
|
---|
267 | writeDocumentList();
|
---|
268 | }
|
---|
269 |
|
---|
270 | void Index::writeDocumentList()
|
---|
271 | {
|
---|
272 | QFile f( docListFile );
|
---|
273 | if ( !f.open(QFile::WriteOnly ) )
|
---|
274 | return;
|
---|
275 | QDataStream s( &f );
|
---|
276 | s << docList;
|
---|
277 | }
|
---|
278 |
|
---|
279 | void Index::readDict()
|
---|
280 | {
|
---|
281 | QFile f( dictFile );
|
---|
282 | if ( !f.open(QFile::ReadOnly ) )
|
---|
283 | return;
|
---|
284 |
|
---|
285 | dict.clear();
|
---|
286 | QDataStream s( &f );
|
---|
287 | QString key;
|
---|
288 | int numOfDocs;
|
---|
289 | QVector<Document> docs;
|
---|
290 | while ( !s.atEnd() ) {
|
---|
291 | s >> key;
|
---|
292 | s >> numOfDocs;
|
---|
293 | docs.resize(numOfDocs);
|
---|
294 | s >> docs;
|
---|
295 | dict.insert( key, new Entry( docs ) );
|
---|
296 | }
|
---|
297 | f.close();
|
---|
298 | readDocumentList();
|
---|
299 | }
|
---|
300 |
|
---|
301 | void Index::readDocumentList()
|
---|
302 | {
|
---|
303 | QFile f( docListFile );
|
---|
304 | if ( !f.open(QFile::ReadOnly ) )
|
---|
305 | return;
|
---|
306 | QDataStream s( &f );
|
---|
307 | s >> docList;
|
---|
308 | }
|
---|
309 |
|
---|
310 | QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
|
---|
311 | {
|
---|
312 | QList<Term> termList;
|
---|
313 | for (QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it ) {
|
---|
314 | Entry *e = 0;
|
---|
315 | if ( (*it).contains(QLatin1Char('*')) ) {
|
---|
316 | QVector<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
|
---|
317 | termList.append( Term(QLatin1String("dummy"), wcts.count(), wcts ) );
|
---|
318 | } else if ( dict[ *it ] ) {
|
---|
319 | e = dict[ *it ];
|
---|
320 | termList.append( Term( *it, e->documents.count(), e->documents ) );
|
---|
321 | } else {
|
---|
322 | return QStringList();
|
---|
323 | }
|
---|
324 | }
|
---|
325 | if ( !termList.count() )
|
---|
326 | return QStringList();
|
---|
327 | qSort(termList);
|
---|
328 |
|
---|
329 | QVector<Document> minDocs = termList.takeFirst().documents;
|
---|
330 | for(QList<Term>::Iterator it = termList.begin(); it != termList.end(); ++it) {
|
---|
331 | Term *t = &(*it);
|
---|
332 | QVector<Document> docs = t->documents;
|
---|
333 | for(QVector<Document>::Iterator minDoc_it = minDocs.begin(); minDoc_it != minDocs.end(); ) {
|
---|
334 | bool found = false;
|
---|
335 | for (QVector<Document>::ConstIterator doc_it = docs.constBegin(); doc_it != docs.constEnd(); ++doc_it ) {
|
---|
336 | if ( (*minDoc_it).docNumber == (*doc_it).docNumber ) {
|
---|
337 | (*minDoc_it).frequency += (*doc_it).frequency;
|
---|
338 | found = true;
|
---|
339 | break;
|
---|
340 | }
|
---|
341 | }
|
---|
342 | if ( !found )
|
---|
343 | minDoc_it = minDocs.erase( minDoc_it );
|
---|
|
---|