Package trac :: Package util :: Module html

Source Code for Module trac.util.html

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Copyright (C) 2003-2020 Edgewall Software 
  4  # All rights reserved. 
  5  # 
  6  # This software is licensed as described in the file COPYING, which 
  7  # you should have received as part of this distribution. The terms 
  8  # are also available at https://trac.edgewall.org/wiki/TracLicense. 
  9  # 
 10  # This software consists of voluntary contributions made by many 
 11  # individuals. For the exact contribution history, see the revision 
 12  # history and logs, available at https://trac.edgewall.org/log/. 
 13   
 14  from HTMLParser import HTMLParser 
 15  import re 
 16   
 17  from genshi import Markup, HTML, escape, unescape 
 18  from genshi.core import END, QName, START, stripentities, striptags 
 19  from genshi.builder import Element, ElementFactory, Fragment, tag 
 20  from genshi.filters.html import HTMLSanitizer 
 21  from genshi.input import ParseError 
 22  try: 
 23      from babel.support import LazyProxy 
 24  except ImportError: 
 25      LazyProxy = None 
 26   
 27  from trac.core import TracError 
 28  from trac.util.text import to_unicode 
 29   
 30  __all__ = ['Deuglifier', 'FormTokenInjector', 'TracHTMLSanitizer', 'escape', 
 31             'find_element', 'html', 'is_safe_origin', 'plaintext', 'tag', 
 32             'to_fragment', 'unescape'] 
 33   
 34   
class TracHTMLSanitizer(HTMLSanitizer):
    """Sanitize HTML constructions which are potentially vector of
    phishing or XSS attacks, in user-supplied HTML.

    See also `genshi.HTMLSanitizer`_.

    .. _genshi.HTMLSanitizer:
       http://genshi.edgewall.org/wiki/Documentation/filters.html#html-sanitizer
    """

    # Whitelist of CSS properties kept by `sanitize_css`; any declaration
    # using a property not listed here is dropped from `style` attributes.
    SAFE_CSS = frozenset([
        # CSS 3 properties <http://www.w3.org/TR/CSS/#properties>
        'background', 'background-attachment', 'background-color',
        'background-image', 'background-position', 'background-repeat',
        'border', 'border-bottom', 'border-bottom-color',
        'border-bottom-style', 'border-bottom-left-radius',
        'border-bottom-right-radius', 'border-bottom-width',
        'border-collapse', 'border-color', 'border-left', 'border-left-color',
        'border-left-style', 'border-left-width', 'border-radius',
        'border-right', 'border-right-color', 'border-right-style',
        'border-right-width', 'border-spacing', 'border-style', 'border-top',
        'border-top-color', 'border-top-left-radius', 'border-top-right-radius',
        'border-top-style', 'border-top-width', 'border-width', 'bottom',
        'caption-side', 'clear', 'clip', 'color', 'content',
        'counter-increment', 'counter-reset', 'cursor', 'direction',
        'display', 'empty-cells', 'float', 'font', 'font-family', 'font-size',
        'font-style', 'font-variant', 'font-weight', 'height', 'left',
        'letter-spacing', 'line-height', 'list-style', 'list-style-image',
        'list-style-position', 'list-style-type', 'margin', 'margin-bottom',
        'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width',
        'min-height', 'min-width', 'opacity', 'orphans', 'outline',
        'outline-color', 'outline-style', 'outline-width', 'overflow',
        'padding', 'padding-bottom', 'padding-left', 'padding-right',
        'padding-top', 'page-break-after', 'page-break-before',
        'page-break-inside', 'position', 'quotes', 'right', 'table-layout',
        'text-align', 'text-decoration', 'text-indent', 'text-transform',
        'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space',
        'widows', 'width', 'word-spacing', 'z-index',
    ])

    # Cross-origin URI prefixes considered safe by default; checked by
    # `_is_safe_origin` via the module-level `is_safe_origin`.
    SAFE_CROSS_ORIGINS = frozenset(['data:'])

    def __init__(self, safe_schemes=HTMLSanitizer.SAFE_SCHEMES,
                 safe_css=SAFE_CSS, safe_origins=SAFE_CROSS_ORIGINS):
        """
        :param safe_schemes: iterable of URI schemes considered safe
        :param safe_css: iterable of CSS property names considered safe
        :param safe_origins: iterable of URI prefixes considered safe
                             cross-origins
        """
        # 'style' is allowed on top of Genshi's default SAFE_ATTRS only
        # because its content is itself filtered by `sanitize_css`.
        safe_attrs = HTMLSanitizer.SAFE_ATTRS | frozenset(['style'])
        safe_schemes = frozenset(safe_schemes)
        super(TracHTMLSanitizer, self).__init__(safe_attrs=safe_attrs,
                                                safe_schemes=safe_schemes)
        self.safe_css = frozenset(safe_css)
        self.safe_origins = frozenset(safe_origins)

    # Matches "expression" even when written with full-width or
    # small-capital Unicode look-alikes, which IE6 would still interpret.
    # IE6 <http://heideri.ch/jso/#80>
    _EXPRESSION_SEARCH = re.compile(
        u'[eE\uFF25\uFF45]'        # FULLWIDTH LATIN CAPITAL LETTER E
                                   # FULLWIDTH LATIN SMALL LETTER E
        u'[xX\uFF38\uFF58]'        # FULLWIDTH LATIN CAPITAL LETTER X
                                   # FULLWIDTH LATIN SMALL LETTER X
        u'[pP\uFF30\uFF50]'        # FULLWIDTH LATIN CAPITAL LETTER P
                                   # FULLWIDTH LATIN SMALL LETTER P
        u'[rR\u0280\uFF32\uFF52]'  # LATIN LETTER SMALL CAPITAL R
                                   # FULLWIDTH LATIN CAPITAL LETTER R
                                   # FULLWIDTH LATIN SMALL LETTER R
        u'[eE\uFF25\uFF45]'        # FULLWIDTH LATIN CAPITAL LETTER E
                                   # FULLWIDTH LATIN SMALL LETTER E
        u'[sS\uFF33\uFF53]{2}'     # FULLWIDTH LATIN CAPITAL LETTER S
                                   # FULLWIDTH LATIN SMALL LETTER S
        u'[iI\u026A\uFF29\uFF49]'  # LATIN LETTER SMALL CAPITAL I
                                   # FULLWIDTH LATIN CAPITAL LETTER I
                                   # FULLWIDTH LATIN SMALL LETTER I
        u'[oO\uFF2F\uFF4F]'        # FULLWIDTH LATIN CAPITAL LETTER O
                                   # FULLWIDTH LATIN SMALL LETTER O
        u'[nN\u0274\uFF2E\uFF4E]'  # LATIN LETTER SMALL CAPITAL N
                                   # FULLWIDTH LATIN CAPITAL LETTER N
                                   # FULLWIDTH LATIN SMALL LETTER N
        ).search

    # Finds "url(...)" (also with Unicode look-alike letters) and captures
    # the URI so its origin can be checked.
    # IE6 <http://openmya.hacker.jp/hasegawa/security/expression.txt>
    # 7) Particular bit of Unicode characters
    _URL_FINDITER = re.compile(
        u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer

    def sanitize_css(self, text):
        """Return the list of safe declarations parsed from the CSS `text`.

        Declarations are dropped when their property is not in `safe_css`
        (see `is_safe_css`), when they contain an ``expression(...)``
        (including Unicode-obfuscated spellings), or when they reference a
        ``url(...)`` whose origin is not safe.
        """
        decls = []
        # Unicode escapes and comments could otherwise be used to smuggle
        # "expression" or "url(" past the checks below.
        text = self._strip_css_comments(self._replace_unicode_escapes(text))
        for decl in filter(None, text.split(';')):
            decl = decl.strip()
            if not decl:
                continue
            try:
                prop, value = decl.split(':', 1)
            except ValueError:
                # Not a "property: value" declaration; drop it.
                continue
            if not self.is_safe_css(prop.strip().lower(), value.strip()):
                continue
            if not self._EXPRESSION_SEARCH(decl) and \
                    all(self._is_safe_origin(match.group(1))
                        for match in self._URL_FINDITER(decl)):
                decls.append(decl.strip())
        return decls

    def __call__(self, stream):
        """Remove input type="password" elements from the stream

        Also adds ``crossorigin="anonymous"`` to ``<img>`` elements whose
        ``src`` is not a safe origin.
        """
        suppress = False
        for kind, data, pos in super(TracHTMLSanitizer, self).__call__(stream):
            if kind is START:
                tag, attrs = data
                if (tag == 'input' and
                        attrs.get('type', '').lower() == 'password'):
                    # Drop the element; remember to swallow its END event.
                    suppress = True
                else:
                    if tag == 'img' and \
                            not self._is_safe_origin(attrs.get('src', '')):
                        attrs |= [(QName('crossorigin'), 'anonymous')]
                        data = (tag, attrs)
                    yield kind, data, pos
            elif kind is END:
                if not suppress:
                    yield kind, data, pos
                suppress = False
            else:
                yield kind, data, pos

    def is_safe_css(self, prop, value):
        """Determine whether the given css property declaration is to be
        considered safe for inclusion in the output.
        """
        if prop not in self.safe_css:
            return False
        # Position can be used for phishing, 'static' excepted
        if prop == 'position':
            return value.lower() == 'static'
        # Negative margins can be used for phishing
        if prop.startswith('margin'):
            return '-' not in value
        return True

    # Bound `sub` methods of pre-compiled patterns, used by
    # `_replace_unicode_escapes` below.
    _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
    _UNICODE_ESCAPE = re.compile(
        r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""",
        re.UNICODE).sub

    def _is_safe_origin(self, uri):
        # Safe means both an allowed URI scheme and an allowed cross-origin.
        return (self.is_safe_uri(uri) and
                is_safe_origin(self.safe_origins, uri))

    def _replace_unicode_escapes(self, text):
        """Decode CSS backslash escapes (``\\41`` etc.) so that escaped
        keywords cannot slip past the safety checks in `sanitize_css`.
        """
        def _repl(match):
            t = match.group(1)
            if t:
                code = int(t, 16)
                chr = unichr(code)  # NOTE: `unichr` is Python 2 only
                if code <= 0x1f:
                    # use a space for control characters, because IE
                    # ignores them when parsing CSS
                    chr = ' '
                elif chr == '\\':
                    chr = r'\\'
                return chr
            t = match.group(2)
            if t == '\\':
                return r'\\'
            else:
                return t
        return self._UNICODE_ESCAPE(_repl,
                                    self._NORMALIZE_NEWLINES('\n', text))

    _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub

    def _strip_css_comments(self, text):
        """Replace comments with space character instead of superclass which
        removes comments to avoid problems when nested comments.
        """
        return self._CSS_COMMENTS(' ', text)
209 210
class Deuglifier(object):
    """Help base class used for cleaning up HTML riddled with ``<FONT
    COLOR=...>`` tags and replace them with appropriate ``<span
    class="...">``.

    The subclass must define a `rules()` static method returning a
    list of regular expression fragments, each defining a capture
    group in which the name will be reused for the span's class. Two
    special group names, ``font`` and ``endfont`` are used to emit
    ``<span>`` and ``</span>``, respectively.
    """
    def __new__(cls):
        self = object.__new__(cls)
        # Compile the rules once per class.  Check `cls.__dict__` rather
        # than using `hasattr`, which would find the pattern compiled for a
        # base class and silently ignore this subclass' own `rules()`.
        if '_compiled_rules' not in cls.__dict__:
            cls._compiled_rules = re.compile('(?:%s)' % '|'.join(cls.rules()))
        self._compiled_rules = cls._compiled_rules
        return self

    def format(self, indata):
        """Return `indata` with each rule match rewritten by `replace`."""
        return self._compiled_rules.sub(self.replace, indata)

    def replace(self, fullmatch):
        """Substitution callback mapping the matched rule group to markup.

        The matched text itself is replaced, so rules are expected to
        match only the markup to be rewritten (e.g. the ``<font>`` tags).
        """
        for mtype, match in fullmatch.groupdict().items():
            if match:
                if mtype == 'font':
                    return '<span>'
                elif mtype == 'endfont':
                    return '</span>'
                return '<span class="code-%s">' % mtype
240 241
class FormTokenInjector(HTMLParser):
    """Identify and protect forms from CSRF attacks.

    Rewrites an HTML document on the fly, inserting a hidden
    ``__FORM_TOKEN`` input into every ``<form>`` submitted via POST; all
    other markup is echoed to `out` unchanged.
    """
    def __init__(self, form_token, out):
        HTMLParser.__init__(self)
        self.token = form_token
        self.out = out

    def handle_starttag(self, tag, attrs):
        # Echo the tag exactly as it appeared in the input, then inject
        # the token right after the opening tag of POST forms.
        self.out.write(self.get_starttag_text())
        if tag.lower() != 'form':
            return
        if any(name.lower() == 'method' and value.lower() == 'post'
               for name, value in attrs):
            self.out.write('<input type="hidden" name="__FORM_TOKEN"'
                           ' value="%s"/>' % self.token)

    def handle_startendtag(self, tag, attrs):
        # A self-closing tag cannot be a POST form; just echo it.
        self.out.write(self.get_starttag_text())

    def handle_charref(self, name):
        self.out.write('&#' + name + ';')

    def handle_entityref(self, name):
        self.out.write('&' + name + ';')

    def handle_comment(self, data):
        self.out.write('<!--' + data + '-->')

    def handle_decl(self, data):
        self.out.write('<!' + data + '>')

    def handle_pi(self, data):
        self.out.write('<?' + data + '?>')

    def handle_data(self, data):
        self.out.write(data)

    def handle_endtag(self, tag):
        self.out.write('</%s>' % tag)
284 285
class TransposingElementFactory(ElementFactory):
    """An `ElementFactory` that transposes element names through `func`
    before delegating to `genshi.builder.ElementFactory`.
    """

    def __init__(self, func, namespace=None):
        self.func = func
        ElementFactory.__init__(self, namespace=namespace)

    def __getattr__(self, name):
        transposed = self.func(name)
        return ElementFactory.__getattr__(self, transposed)
# Element factory with lowercased names, so that e.g. `html.A(...)` is
# equivalent to `tag.a(...)`.
html = TransposingElementFactory(str.lower)


try:
    escape('', False)  # detect genshi:#439 on Genshi 0.6 with speedups
except TypeError:
    # The C speedups in Genshi 0.6 raise TypeError on the call above
    # (genshi:#439), so replace `escape` with a wrapper that
    # short-circuits falsy input and only delegates for non-empty text.
    _escape = escape

    def escape(text, quotes=True):
        """Workaround for genshi:#439: escape `text`, returning an empty
        `Markup` for falsy input instead of delegating to Genshi.
        """
        if text:
            return _escape(text, quotes=quotes)
        else:
            return Markup(u'')
311 312
def plaintext(text, keeplinebreaks=True):
    """Extract the text elements from (X)HTML content

    :param text: `unicode` or `genshi.builder.Fragment`
    :param keeplinebreaks: optionally keep linebreaks
    """
    if not isinstance(text, Fragment):
        # Plain markup string: drop tags, then decode entities.
        result = stripentities(striptags(text))
    else:
        result = text.generate().render('text', encoding=None)
    return result if keeplinebreaks else result.replace(u'\n', u' ')
326 327
def find_element(frag, attr=None, cls=None, tag=None):
    """Return the first element in the fragment having the given attribute,
    class or tag, using a preorder depth-first search.
    """
    if isinstance(frag, Element):
        attrib = frag.attrib
        if (attr is not None and attr in attrib) or \
                (cls is not None and
                 cls in attrib.get('class', '').split()) or \
                (tag is not None and tag == frag.tag):
            return frag
    if isinstance(frag, Fragment):
        # Recurse into children; first match wins.
        for child in frag.children:
            found = find_element(child, attr, cls, tag)
            if found is not None:
                return found
344 345
def is_safe_origin(safe_origins, uri, req=None):
    """Whether the given uri is a safe cross-origin.

    :param safe_origins: iterable of trusted origin prefixes; ``'*'``
                         trusts everything, a bare ``'scheme:'`` trusts
                         the whole scheme
    :param uri: the URI to check; relative references are always safe
    :param req: optional request whose scheme resolves protocol-relative
                (``//host/...``) URIs
    """
    # Relative references (no scheme and not protocol-relative) are safe.
    if not uri or ':' not in uri and not uri.startswith('//'):
        return True
    if any(origin == '*' for origin in safe_origins):
        return True
    if uri.startswith('//') and req:
        uri = '%s:%s' % (req.scheme, uri)

    authority_re = re.compile(r'(?:[a-zA-Z][-a-zA-Z0-9+._]*:)?//[^/]+$')

    def ensure_slash(value):
        # Give a bare "scheme://host" an explicit path so that prefix
        # comparisons cannot match across host-name boundaries.
        return value + '/' if authority_re.match(value) else value

    uri = ensure_slash(uri)
    for origin in safe_origins:
        origin = ensure_slash(origin)
        if origin == uri:
            return True
        if origin.endswith(':') and uri.startswith(origin):
            return True  # whole scheme is trusted, e.g. 'data:'
        prefix = origin if origin.endswith('/') else origin + '/'
        if uri.startswith(prefix):
            return True
    return False
def expand_markup(stream, ctxt=None):
    """A Genshi stream filter for expanding `genshi.Markup` events.

    Note: Expansion may not be possible if the fragment is badly
    formed, or partial.
    """
    for event in stream:
        payload = event[1]
        if not isinstance(payload, Markup):
            yield event
            continue
        try:
            # Re-parse the raw markup into proper stream events.
            for parsed in HTML(payload):
                yield parsed
        except ParseError:
            # Badly formed or partial markup: pass the event through.
            yield event
389 390
def to_fragment(input):
    """Convert input to a `Fragment` object."""

    # Unwrap nested exceptions: any TracError, or any single-argument
    # exception, is replaced by its first argument until neither applies.
    while True:
        unwrap = isinstance(input, TracError) or \
                 (isinstance(input, Exception) and len(input.args) == 1)
        if not unwrap:
            break
        input = input.args[0]
    if LazyProxy and isinstance(input, LazyProxy):
        input = input.value
    return input if isinstance(input, Fragment) else tag(to_unicode(input))
402