1
2
3
4
5
6
7
8
9
10
11
12
13
14 from HTMLParser import HTMLParser
15 import re
16
17 from genshi import Markup, HTML, escape, unescape
18 from genshi.core import END, QName, START, stripentities, striptags
19 from genshi.builder import Element, ElementFactory, Fragment, tag
20 from genshi.filters.html import HTMLSanitizer
21 from genshi.input import ParseError
22 try:
23 from babel.support import LazyProxy
24 except ImportError:
25 LazyProxy = None
26
27 from trac.core import TracError
28 from trac.util.text import to_unicode
29
30 __all__ = ['Deuglifier', 'FormTokenInjector', 'TracHTMLSanitizer', 'escape',
31 'find_element', 'html', 'is_safe_origin', 'plaintext', 'tag',
32 'to_fragment', 'unescape']
33
34
"""Sanitize HTML constructs that are potential vectors for
phishing or XSS attacks in user-supplied HTML.
38
39 See also `genshi.HTMLSanitizer`_.
40
41 .. _genshi.HTMLSanitizer:
42 http://genshi.edgewall.org/wiki/Documentation/filters.html#html-sanitizer
43 """
44
# Immutable set of CSS property names.
# NOTE(review): presumably the default whitelist behind `self.safe_css`,
# which `is_safe_css` consults -- the wiring is not visible here; confirm.
SAFE_CSS = frozenset([

    'background', 'background-attachment', 'background-color',
    'background-image', 'background-position', 'background-repeat',
    'border', 'border-bottom', 'border-bottom-color',
    'border-bottom-style', 'border-bottom-left-radius',
    'border-bottom-right-radius', 'border-bottom-width',
    'border-collapse', 'border-color', 'border-left', 'border-left-color',
    'border-left-style', 'border-left-width', 'border-radius',
    'border-right', 'border-right-color', 'border-right-style',
    'border-right-width', 'border-spacing', 'border-style', 'border-top',
    'border-top-color', 'border-top-left-radius', 'border-top-right-radius',
    'border-top-style', 'border-top-width', 'border-width', 'bottom',
    'caption-side', 'clear', 'clip', 'color', 'content',
    'counter-increment', 'counter-reset', 'cursor', 'direction',
    'display', 'empty-cells', 'float', 'font', 'font-family', 'font-size',
    'font-style', 'font-variant', 'font-weight', 'height', 'left',
    'letter-spacing', 'line-height', 'list-style', 'list-style-image',
    'list-style-position', 'list-style-type', 'margin', 'margin-bottom',
    'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width',
    'min-height', 'min-width', 'opacity', 'orphans', 'outline',
    'outline-color', 'outline-style', 'outline-width', 'overflow',
    'padding', 'padding-bottom', 'padding-left', 'padding-right',
    'padding-top', 'page-break-after', 'page-break-before',
    'page-break-inside', 'position', 'quotes', 'right', 'table-layout',
    'text-align', 'text-decoration', 'text-indent', 'text-transform',
    'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space',
    'widows', 'width', 'word-spacing', 'z-index',
])

# Immutable set of origin prefixes; the only entry is the `data:` scheme.
# NOTE(review): presumably the default list of origins treated as safe for
# cross-origin resources -- confirm against the constructor.
SAFE_CROSS_ORIGINS = frozenset(['data:'])
76
85
86
# Bound `search` of a pattern that finds the word "expression" even when it
# is spelled with look-alike letters.  Every character class accepts the
# ASCII upper/lower-case letter plus its FULLWIDTH LATIN CAPITAL/SMALL
# forms; R, I and N additionally accept the LATIN LETTER SMALL CAPITAL form.
_EXPRESSION_SEARCH = re.compile(
    u'[eE\uFF25\uFF45]'         # U+FF25 / U+FF45: fullwidth E / e

    u'[xX\uFF38\uFF58]'         # U+FF38 / U+FF58: fullwidth X / x

    u'[pP\uFF30\uFF50]'         # U+FF30 / U+FF50: fullwidth P / p

    u'[rR\u0280\uFF32\uFF52]'   # U+0280: small capital R
                                # U+FF32 / U+FF52: fullwidth R / r

    u'[eE\uFF25\uFF45]'         # U+FF25 / U+FF45: fullwidth E / e

    u'[sS\uFF33\uFF53]{2}'      # U+FF33 / U+FF53: fullwidth S / s, twice

    u'[iI\u026A\uFF29\uFF49]'   # U+026A: small capital I
                                # U+FF29 / U+FF49: fullwidth I / i

    u'[oO\uFF2F\uFF4F]'         # U+FF2F / U+FF4F: fullwidth O / o

    u'[nN\u0274\uFF2E\uFF4E]'   # U+0274: small capital N
                                # U+FF2E / U+FF4E: fullwidth N / n

).search



# Bound `finditer` of a pattern matching "url", optional whitespace and "(".
# "r" and "l" may also be the small capitals U+0280 / U+029F.  Group 1
# captures everything after the parenthesis up to, not including, the next
# ")".
_URL_FINDITER = re.compile(
    u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer
115
# Collects the declarations of the CSS text `text` that pass every check
# below and returns them as a list of stripped strings.
decls = []
# Decode backslash escapes first, then drop comments, so that neither can be
# used to hide a keyword from the checks below.
text = self._strip_css_comments(self._replace_unicode_escapes(text))
for decl in filter(None, text.split(';')):
    decl = decl.strip()
    if not decl:
        # Whitespace-only segment between two semicolons.
        continue
    try:
        prop, value = decl.split(':', 1)
    except ValueError:
        # No colon at all: not a "property: value" declaration.
        continue
    if not self.is_safe_css(prop.strip().lower(), value.strip()):
        continue
    # Keep the declaration only if it has no "expression" look-alike and
    # every url(...) target is accepted by `_is_safe_origin`.
    if not self._EXPRESSION_SEARCH(decl) and \
       all(self._is_safe_origin(match.group(1))
           for match in self._URL_FINDITER(decl)):
        decls.append(decl.strip())
return decls
134
"""Remove input type="password" elements from the stream

Events come from the parent class' `__call__` applied to `stream`.
The START and END events of a password input are dropped.  An `img`
START event whose `src` is rejected by `_is_safe_origin` gets a
`crossorigin="anonymous"` attribute added.  All other events are
yielded unchanged.
"""
# True between the START of a password input and the next END event.
suppress = False
for kind, data, pos in super(TracHTMLSanitizer, self).__call__(stream):
    if kind is START:
        tag, attrs = data
        if (tag == 'input' and
            attrs.get('type', '').lower() == 'password'):
            # Swallow the START event; the matching END is dropped below.
            suppress = True
        else:
            if tag == 'img' and \
               not self._is_safe_origin(attrs.get('src', '')):
                # Add the attribute and rebuild the event payload.
                attrs |= [(QName('crossorigin'), 'anonymous')]
                data = (tag, attrs)
            yield kind, data, pos
    elif kind is END:
        if not suppress:
            yield kind, data, pos
        # Any END event clears the flag, whether or not it was emitted.
        suppress = False
    else:
        yield kind, data, pos
157
"""Determine whether the given css property declaration is to be
considered safe for inclusion in the output.

`prop` is compared as given against `self.safe_css` (the visible call
site passes it stripped and lower-cased); `value` is the declaration
value.  Returns a boolean.
"""
if prop not in self.safe_css:
    return False

# Only `position: static` is accepted.
# NOTE(review): presumably because other positioning can overlay page
# content (the phishing vector named in the class docstring) -- confirm.
if prop == 'position':
    return value.lower() == 'static'

# Any margin-like property whose value contains '-' is rejected, which
# rules out negative lengths.
if prop.startswith('margin'):
    return '-' not in value
return True
171
# Bound `sub` of a pattern matching a CR LF pair.
_NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
# Bound `sub` of a pattern matching one CSS backslash escape:
#   group 1 -- one to six hex digits after the backslash; a single trailing
#              whitespace character is consumed as part of the escape;
#   group 2 -- one character after the backslash that is not a newline, form
#              feed, hex digit or one of  ' " { } ; : ( ) # *
_UNICODE_ESCAPE = re.compile(
    r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""",
    re.UNICODE).sub
176
180
def _repl(match):
    """Return the replacement text for one matched CSS backslash escape.

    :param match: a match object whose group 1 holds the hex digits of a
                  numeric escape, or whose group 2 holds the single
                  character of a character escape
    :return: the decoded character; a space for control characters
             (code <= 0x1f); the two-character string ``\\\\`` for a
             backslash, so that it stays escaped; U+FFFD for a number
             that is not a representable code point
    """
    hex_digits = match.group(1)
    if hex_digits:
        code = int(hex_digits, 16)
        if code <= 0x1f:
            # Control characters are replaced by a plain space.
            return ' '
        try:
            char = unichr(code)
        except (ValueError, OverflowError):
            # Six hex digits can encode values beyond the highest code
            # point the interpreter supports.  Substitute U+FFFD instead of
            # letting the exception abort the whole sanitizing pass.
            return u'\ufffd'
        if char == '\\':
            return r'\\'
        return char
    escaped = match.group(2)
    if escaped == '\\':
        return r'\\'
    return escaped
# Convert CR LF pairs to LF first, then rewrite every backslash escape
# through `_repl`.
return self._UNICODE_ESCAPE(_repl,
                            self._NORMALIZE_NEWLINES('\n', text))
201
# Bound `sub` of a pattern matching one `/* ... */` CSS comment
# (non-greedy, so adjacent comments are matched separately).  CSS comments
# may span several lines, hence DOTALL: without it a comment containing a
# newline is left in place and can be used to split a keyword such as
# "expression" so that later checks miss it.
_CSS_COMMENTS = re.compile(r'/\*.*?\*/', re.DOTALL).sub
203
209
210
"""Helper base class used for cleaning up HTML riddled with ``<FONT
COLOR=...>`` tags and replacing them with appropriate ``<span
class="...">``.
215
216 The subclass must define a `rules()` static method returning a
217 list of regular expression fragments, each defining a capture
218 group in which the name will be reused for the span's class. Two
219 special group names, ``font`` and ``endfont`` are used to emit
220 ``<span>`` and ``</span>``, respectively.
221 """
# Create the instance without going through any overridden constructor.
self = object.__new__(cls)
# Compile the combined pattern once and cache it on the class.  `hasattr`
# also sees an attribute inherited from a base class.
if not hasattr(cls, '_compiled_rules'):
    # Every fragment returned by `cls.rules()` becomes one alternative of a
    # single non-capturing group.
    cls._compiled_rules = re.compile('(?:%s)' % '|'.join(cls.rules()))
self._compiled_rules = cls._compiled_rules
return self
228
231
# Map a match of the combined pattern to its replacement markup.
# `fullmatch` is a regex match object; the first named group with a
# non-empty value decides the result.  Returns None implicitly when no
# named group matched.
for mtype, match in fullmatch.groupdict().items():
    if match:
        if mtype == 'font':
            return '<span>'
        elif mtype == 'endfont':
            return '</span>'
        # Any other group name becomes the CSS class suffix.
        return '<span class="code-%s">' % mtype
240
241
284
285
287 """A `genshi.builder.ElementFactory` which applies `func` to the
288 named attributes before creating a `genshi.builder.Element`.
289 """
290
def __init__(self, func, namespace=None):
    """Remember `func` and initialise the underlying element factory.

    :param func: callable kept on the instance as `self.func`
    :param namespace: forwarded unchanged to `ElementFactory.__init__`
    """
    self.func = func
    ElementFactory.__init__(self, namespace=namespace)
294
297
# Module-level factory instance built with `str.lower` as its `func`.
# NOTE(review): how `func` is applied to names is defined by methods of
# TransposingElementFactory that are not visible here.
html = TransposingElementFactory(str.lower)
299
300
try:
    # Probe the imported `escape` with an empty string and a positional
    # `quotes` argument.
    escape('', False)
except TypeError:
    # The probe was rejected: keep the imported function under a private
    # name and shadow it with a wrapper that copes with empty input.
    _escape = escape

    def escape(text, quotes=True):
        """Escape `text`, returning empty `Markup` for any falsy input.

        :param text: the value to escape
        :param quotes: forwarded by keyword to the wrapped `escape`
        """
        if not text:
            return Markup(u'')
        return _escape(text, quotes=quotes)
311
312
def plaintext(text, keeplinebreaks=True):
    """Return the textual content of (X)HTML input.

    :param text: `unicode` markup or a `genshi.builder.Fragment`
    :param keeplinebreaks: when false, every newline in the result is
                           replaced by a single space
    """
    if isinstance(text, Fragment):
        # Fragments are rendered through their own 'text' serialization.
        extracted = text.generate().render('text', encoding=None)
    else:
        # Plain markup: strip the tags first, then resolve the entities.
        extracted = stripentities(striptags(text))
    if keeplinebreaks:
        return extracted
    return extracted.replace(u'\n', u' ')
326
327
"""Return the first element in the fragment having the given attribute,
class or tag, using a preorder depth-first search.

A criterion that is `None` is ignored.  An element matches as soon as
any one of the remaining criteria holds.  Returns `None` implicitly
when nothing matches.
"""
if isinstance(frag, Element):
    if attr is not None and attr in frag.attrib:
        return frag
    # `class` is treated as a whitespace-separated list of class names.
    if cls is not None and cls in frag.attrib.get('class', '').split():
        return frag
    if tag is not None and tag == frag.tag:
        return frag
# Preorder: the node itself was tested above, now descend into its children.
if isinstance(frag, Fragment):
    for child in frag.children:
        elt = find_element(child, attr, cls, tag)
        if elt is not None:
            return elt
344
345
"""Whether the given uri is a safe cross-origin."""
# An empty URI, or one with neither a ':' nor a leading '//', is accepted
# outright (`and` binds tighter than `or`).
if not uri or ':' not in uri and not uri.startswith('//'):
    return True
# A literal '*' entry accepts every URI.
if any(safe == '*' for safe in safe_origins):
    return True
# Give a protocol-relative URI the scheme of the request, when `req` is
# available.
if uri.startswith('//') and req:
    uri = '%s:%s' % (req.scheme, uri)

# Matches "[scheme:]//authority" with no path component at all.
normalize_re = re.compile(r'(?:[a-zA-Z][-a-zA-Z0-9+._]*:)?//[^/]+$')

def normalize_uri(uri):
    # Append the missing trailing slash to a path-less origin.
    if normalize_re.match(uri):
        uri += '/'
    return uri

uri = normalize_uri(uri)
for safe in safe_origins:
    safe = normalize_uri(safe)
    if safe == uri:
        return True
    # An entry ending in ':' (e.g. 'data:') is matched as a plain prefix.
    if safe.endswith(':') and uri.startswith(safe):
        return True
    # Otherwise the entry must be a prefix ending at a '/' boundary.
    if uri.startswith(safe if safe.endswith('/') else safe + '/'):
        return True
return False
372
373
"""A Genshi stream filter for expanding `genshi.Markup` events.

Note: Expansion may not be possible if the fragment is badly
formed, or partial.
"""
for event in stream:
    # event[1] is the data item of the (kind, data, pos) event tuple.
    if isinstance(event[1], Markup):
        try:
            # Re-parse the markup and emit the resulting events instead.
            for subevent in HTML(event[1]):
                yield subevent
        except ParseError:
            # Fall back to the original event.  Sub-events yielded before
            # the error have already been emitted.
            yield event
    else:
        yield event
389
390
"""Convert input to a `Fragment` object."""

# Unwrap exceptions down to their payload: a TracError always, any other
# exception only when it carries exactly one argument.
while isinstance(input, TracError) or \
        isinstance(input, Exception) and len(input.args) == 1:
    input = input.args[0]
# LazyProxy is None when Babel could not be imported.
if LazyProxy and isinstance(input, LazyProxy):
    input = input.value
if isinstance(input, Fragment):
    return input
# Anything else is converted to unicode and wrapped by the `tag` builder.
return tag(to_unicode(input))
402