
Source Code for Module trac.util.html

   1  # -*- coding: utf-8 -*- 
   2  # 
   3  # Copyright (C) 2003-2023 Edgewall Software 
   4  # All rights reserved. 
   5  # 
   6  # This software is licensed as described in the file COPYING, which 
   7  # you should have received as part of this distribution. The terms 
   8  # are also available at https://trac.edgewall.org/wiki/TracLicense. 
   9  # 
  10  # This software consists of voluntary contributions made by many 
  11  # individuals. For the exact contribution history, see the revision 
  12  # history and logs, available at https://trac.edgewall.org/log/. 
  13   
  14  # Note that a significant part of the code in this module was taken 
  15  # from the Genshi project (http://genshi.edgewall.org): 
  16  #  - escape utilities from genshi.core, 
  17  #  - strip utilities from genshi.util, 
  18  #  - the tag builder API from genshi.builder, 
  19  #  - the HTMLSanitizer from genshi.filters.html. 
  20   
  21  import io 
  22  import re 
  23  import sys 
  24  from HTMLParser import HTMLParser 
  25  import htmlentitydefs as entities 
  26   
  27  from markupsafe import Markup, escape as escape_quotes 
  28   
  29  """Utilities for producing HTML content. 
  30   
  31  Imports related to the legacy Genshi template engine should all go 
  32  through this module: 
  33   
  34      from trac.util.html import genshi, Stream 
  35   
  36  If Genshi is not installed, `genshi` and all related symbols will be 
  37  `None`. 
  38   
  39  """ 
  40   
  41  try: 
  42      import genshi 
  43      import genshi.input 
  44      from genshi.core import Attrs, QName, Stream, COMMENT, START, END, TEXT 
  45      from genshi.input import ParseError 
46 - def stream_to_unicode(stream):
   47          return Markup(stream.render('xhtml', encoding=None,
   48                                      strip_whitespace=False))
   49  except ImportError:
   50      genshi = stream_to_unicode = None
   51      HTML = COMMENT = START = END = TEXT = Attrs = QName = Stream = None
   52      ParseError = None
   53 
   54  try:
   55      from babel.support import LazyProxy
   56  except ImportError:
   57      LazyProxy = None
   58 
   59  from trac.core import TracError
   60  from trac.util.text import to_unicode
   61 
   62  __all__ = ['Deuglifier', 'FormTokenInjector', 'TracHTMLSanitizer', 'escape',
   63             'find_element', 'html', 'is_safe_origin', 'plaintext', 'tag',
   64             'to_fragment', 'stripentities', 'striptags', 'valid_html_bytes',
   65             'unescape']
   66 
   67 
   68  _name2codepoint = entities.name2codepoint.copy()
   69  _name2codepoint['apos'] = 39  # single quote
   70 
   71 
72 -def escape(str, quotes=True):
73 """Create a Markup instance from a string and escape special characters 74 it may contain (<, >, & and \"). 75 76 :param text: the string to escape; if not a string, it is assumed that 77 the input can be converted to a string 78 :param quotes: if ``True``, double quote characters are escaped in 79 addition to the other special characters 80 81 >>> escape('"1 < 2"') 82 Markup(u'&#34;1 &lt; 2&#34;') 83 84 >>> escape(['"1 < 2"']) 85 Markup(u"['&#34;1 &lt; 2&#34;']") 86 87 If the `quotes` parameter is set to `False`, the \" character is left 88 as is. Escaping quotes is generally only required for strings that are 89 to be used in attribute values. 90 91 >>> escape('"1 < 2"', quotes=False) 92 Markup(u'"1 &lt; 2"') 93 94 >>> escape(['"1 < 2"'], quotes=False) 95 Markup(u'[\\'"1 &lt; 2"\\']') 96 97 However, `escape` behaves slightly differently with `Markup` and 98 `Fragment` behave instances, as they are passed through 99 unmodified. 100 101 >>> escape(Markup('"1 < 2 &#39;"')) 102 Markup(u'"1 < 2 &#39;"') 103 104 >>> escape(Markup('"1 < 2 &#39;"'), quotes=False) 105 Markup(u'"1 < 2 &#39;"') 106 107 >>> escape(tag.b('"1 < 2"')) 108 Markup(u'<b>"1 &lt; 2"</b>') 109 110 >>> escape(tag.b('"1 < 2"'), quotes=False) 111 Markup(u'<b>"1 &lt; 2"</b>') 112 113 :return: the escaped `Markup` string 114 :rtype: `Markup` 115 116 """ 117 if LazyProxy and isinstance(str, LazyProxy): 118 str = str.value 119 if isinstance(str, Markup): 120 return str 121 if isinstance(str, Fragment): 122 return Markup(str) 123 e = escape_quotes(str) 124 if quotes: 125 if '&#39;' not in e: 126 return e 127 return Markup(unicode(e).replace('&#39;', "'")) 128 elif '&#3' not in e: 129 return e 130 return Markup(unicode(e).replace('&#34;', '"').replace('&#39;', "'"))
131 132
133 -def unescape(text):
134 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. 135 136 >>> unescape(Markup('1 &lt; 2')) 137 u'1 < 2' 138 139 If the provided `text` object is not a `Markup` instance, it is returned 140 unchanged. 141 142 >>> unescape('1 &lt; 2') 143 '1 &lt; 2' 144 145 :param text: the text to unescape 146 :return: the unescsaped string 147 :rtype: `unicode` 148 """ 149 if not text: 150 return '' 151 if not isinstance(text, Markup): 152 return text 153 return text.unescape()
154 155 156 _STRIPENTITIES_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')
157 -def stripentities(text, keepxmlentities=False):
158 """Return a copy of the given text with any character or numeric entities 159 replaced by the equivalent UTF-8 characters. 160 161 >>> stripentities('1 &lt; 2') 162 u'1 < 2' 163 >>> stripentities('more &hellip;') 164 u'more \u2026' 165 >>> stripentities('&#8230;') 166 u'\u2026' 167 >>> stripentities('&#x2026;') 168 u'\u2026' 169 >>> stripentities(Markup(u'\u2026')) 170 u'\u2026' 171 172 If the `keepxmlentities` parameter is provided and is a truth value, the 173 core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact. 174 175 >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True) 176 u'1 &lt; 2 \u2026' 177 178 :return: a `unicode` instance with entities removed 179 :rtype: `unicode` 180 """ 181 def _replace_entity(match): 182 if match.group(1): # numeric entity 183 ref = match.group(1) 184 if ref.startswith(('x', 'X')): 185 ref = int(ref[1:], 16) 186 else: 187 ref = int(ref, 10) 188 return _unichr(ref) 189 else: # character entity 190 ref = match.group(2) 191 if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'): 192 return '&%s;' % ref 193 try: 194 return _unichr(_name2codepoint[ref]) 195 except KeyError: 196 if keepxmlentities: 197 return '&amp;%s;' % ref 198 else: 199 return ref
200 if isinstance(text, Markup): 201 text = unicode(text) 202 return _STRIPENTITIES_RE.sub(_replace_entity, text) 203 204
205 -def striptags(text):
206 """Return a copy of the text with any XML/HTML tags removed. 207 208 >>> striptags('<span>Foo</span> bar') 209 u'Foo bar' 210 >>> striptags('<span class="bar">Foo</span>') 211 u'Foo' 212 >>> striptags('Foo<br />') 213 u'Foo' 214 215 HTML/XML comments are stripped, too: 216 217 >>> striptags('<!-- <blub>hehe</blah> -->test') 218 u'test' 219 220 :param text: the string to remove tags from 221 :return: a `unicode` instance with all tags removed 222 :rtype: `unicode` 223 """ 224 return Markup(text).striptags()
225 226 227 # -- Simplified genshi.builder API 228 229 230 NO_YES = ('no', 'yes') 231 OFF_ON = ('off', 'on') 232 FALSE_TRUE = ('false', 'true') 233 234 SPECIAL_HTML_ATTRS = dict( 235 autofocus=None, autoplay=None, checked=None, controls=None, 236 default=None, defer=None, disabled=None, formnovalidate=None, hidden=None, 237 ismap=None, loop=None, multiple=None, muted=None, novalidate=None, 238 open=None, readonly=None, required=None, reversed=None, scoped=None, 239 seamless=None, selected=None, 240 contenteditable=FALSE_TRUE, draggable=FALSE_TRUE, spellcheck=FALSE_TRUE, 241 translate=NO_YES, 242 autocomplete=OFF_ON, 243 ) 244 SPECIAL_HTML_ATTRS['async'] = None 245
246 -def html_attribute(key, val):
247 """Returns the actual value for the attribute ``key``, for the given 248 ``value``. 249 250 This follows the rules described in the HTML5_ spec (Double-quoted 251 attribute value syntax). 252 253 .. _HTML5: https://www.w3.org/TR/html-markup/global-attributes.html#global-attributes 254 255 In addition, it treats the ``'class'`` and the ``'style'`` 256 attributes in a special way, as it processes them through 257 `classes` and `styles`. 258 259 :rtype: a `Markup` object containing the escaped attribute value, 260 but it can also be `None` to indicate that the attribute 261 should be omitted from the output 262 263 """ 264 if key == 'class': 265 if isinstance(val, dict): 266 val = classes(**val) or None 267 elif isinstance(val, list): 268 val = classes(*val) or None 269 elif key == 'style': 270 if isinstance(val, list): 271 val = styles(*val) or None 272 else: 273 val = styles(val) or None 274 else: 275 if key in SPECIAL_HTML_ATTRS: 276 values = SPECIAL_HTML_ATTRS[key] 277 if values is None: 278 val = key if val else None 279 else: 280 val = values[bool(val)] 281 return None if val is None else escape(val)
282
283 -def classes(*args, **kwargs):
284 """Helper function for dynamically assembling a list of CSS class 285 names in templates. 286 287 Any positional arguments are added to the list of class names. All 288 positional arguments must be strings: 289 290 >>> classes('foo', 'bar') 291 u'foo bar' 292 293 In addition, the names of any supplied keyword arguments are added 294 if they have a truth value: 295 296 >>> classes('foo', bar=True) 297 u'foo bar' 298 >>> classes('foo', bar=False) 299 u'foo' 300 301 If none of the arguments are added to the list, this function 302 returns `''`: 303 304 >>> classes(bar=False) 305 u'' 306 307 """ 308 classes = list(filter(None, args)) + [k for k, v in kwargs.items() if v] 309 return u' '.join(classes)
310
311 -def styles(*args, **kwargs):
312 """Helper function for dynamically assembling a list of CSS style name 313 and values in templates. 314 315 Any positional arguments are added to the list of styles. All 316 positional arguments must be strings or dicts: 317 318 >>> styles('foo: bar', 'fu: baz', {'bottom-right': '1em'}) 319 u'foo: bar; fu: baz; bottom-right: 1em' 320 321 In addition, the names of any supplied keyword arguments are added 322 if they have a string value: 323 324 >>> styles(foo='bar', fu='baz') 325 u'foo: bar; fu: baz' 326 >>> styles(foo='bar', bar=False) 327 u'foo: bar' 328 329 If none of the arguments are added to the list, this function 330 returns `''`: 331 332 >>> styles(bar=False) 333 u'' 334 335 """ 336 args = list(filter(None, args)) 337 d = {} 338 styles = [] 339 for arg in args: 340 if isinstance(arg, dict): 341 d.update(arg) 342 else: 343 styles.append(arg) 344 d.update(kwargs) 345 styles.extend('%s: %s' % (k, v) for k, v in d.iteritems() if v) 346 return u'; '.join(styles)
347 348
349 -class Fragment(object):
350 """A fragment represents a sequence of strings or elements.""" 351 352 __slots__ = ('children',) 353
354 - def __init__(self, *args):
355 self.children = [] 356 for arg in args: 357 self.append(arg)
358
359 - def __html__(self):
360 return Markup(unicode(self))
361
362 - def __unicode__(self):
363 return u''.join(escape(c, False) for c in self.children)
364
365 - def __str__(self):
366 return self.__unicode__().encode('utf-8')
367
368 - def __add__(self, other):
369 return Fragment(self, other)
370
371 - def __call__(self, *args):
372 for arg in args: 373 self.append(arg) 374 return self
375
376 - def append(self, arg):
377 global genshi 378 if arg: # ignore most false values (None, False, [], (), ''), except 0! 379 if isinstance(arg, (Fragment, basestring, int, float, long)): 380 self.children.append(arg) 381 elif genshi and isinstance(arg, Stream): 382 # legacy support for Genshi streams 383 self.children.append(stream_to_unicode(arg)) 384 else: 385 # support iterators and generators 386 try: 387 for elt in arg: 388 self.append(elt) 389 except TypeError: 390 self.children.append(arg) 391 elif arg == 0: 392 self.children.append(u'0')
393
394 - def as_text(self):
395 return u''.join(c.as_text() if isinstance(c, Fragment) else unicode(c) 396 for c in self.children)
397 398 if genshi:
399 - def __iter__(self):
400 """Genshi compatibility layer. 401 402 :deprecated: this will be removed in Trac 1.5.1. 403 """ 404 yield TEXT, Markup(self), (None, -1, -1)
405 406
407 -class XMLElement(Fragment):
408 """An element represents an XML element, with a tag name, attributes 409 and content. 410 411 """ 412 413 __slots__ = ('tag', 'attrib') 414 415 EMPTY_ATTRIB = {} 416 417 VOID_ELEMENTS = () 418 419 CLOSE_TAG = u'/>' 420
421 - def __init__(self, tag, *args, **kwargs):
422 Fragment.__init__(self, *args) 423 self.tag = unicode(tag) 424 self.attrib = self._dict_from_kwargs(kwargs) \ 425 if kwargs else self.EMPTY_ATTRIB
426
427 - def _attr_value(self, k, v):
428 return v
429
430 - def _dict_from_kwargs(self, kwargs):
431 attrs = [] 432 for k, v in kwargs.iteritems(): 433 if v is not None: 434 if k[-1:] == '_': 435 k = k[:-1] 436 v = self._attr_value(k, v) 437 if v is not None: 438 attrs.append((k, escape(v))) 439 return dict(attrs)
440
441 - def __call__(self, *args, **kwargs):
442 if kwargs: 443 d = self._dict_from_kwargs(kwargs) 444 if d: 445 if self.attrib: 446 self.attrib.update(d) 447 else: 448 self.attrib = d 449 for arg in args: 450 self.append(arg) 451 return self
452
453 - def __unicode__(self):
454 elt = u'<' + self.tag 455 if self.attrib: 456 # Sorting the attributes makes the unit-tests more robust 457 attrs = [] 458 for k in sorted(self.attrib): 459 v = self.attrib[k] 460 if v: 461 attrs.append(' %s="%s"' % (k, v)) 462 if attrs: 463 elt += u''.join(attrs) 464 if self.children or (self.VOID_ELEMENTS and 465 self.tag not in self.VOID_ELEMENTS): 466 elt += u'>' + Fragment.__unicode__(self) + u'</' + self.tag + u'>' 467 else: 468 elt += self.CLOSE_TAG 469 return elt
470 471
472 -class Element(XMLElement):
473 """An element represents an HTML element, with a tag name, attributes 474 and content. 475 476 Some elements and attributes are rendered specially, according to 477 the HTML5 specification (or going there...) 478 479 """ 480 481 VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 482 'img', 'input', 'keygen', 'link', 'meta', 'param', 483 'source', 'track', 'wbr'} 484 CLOSE_TAG = u' />' 485 486 __slots__ = () 487
488 - def _attr_value(self, k, v):
489 return html_attribute(k, v)
490 491
492 -class XMLElementFactory(object):
493 """An XML element factory can be used to build Fragments and 494 XMLElements for arbitrary tag names. 495 496 """ 497
498 - def __call__(self, *args):
499 return Fragment(*args)
500
501 - def __getattr__(self, tag):
502 return XMLElement(tag)
503 504 xml = XMLElementFactory() 505
506 -class ElementFactory(XMLElementFactory):
507 """An element factory can be used to build Fragments and Elements for 508 arbitrary tag names. 509 510 """ 511
512 - def __getattr__(self, tag):
513 return Element(tag)
514 515 tag = html = ElementFactory() 516 517
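A brief usage sketch for the `xml` and `tag` factories defined above (informal, not part of the module source); note the HTML-specific handling of void elements and of the trailing-underscore convention for reserved words like ``class_``:

    >>> from trac.util.html import tag, xml
    >>> unicode(tag.br())
    u'<br />'
    >>> unicode(tag.p('1 < 2', class_='math'))
    u'<p class="math">1 &lt; 2</p>'
    >>> unicode(xml.node(id='a'))
    u'<node id="a"/>'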
518 -class TracHTMLSanitizer(object):
519 520 """Sanitize HTML constructions which are potentially vector of 521 phishing or XSS attacks, in user-supplied HTML. 522 523 The usual way to use the sanitizer is to call the `sanitize` 524 method on some potentially unsafe HTML content. 525 526 Note that for backward compatibility, the TracHTMLSanitizer still 527 behaves as a Genshi filter. 528 529 See also `genshi.HTMLSanitizer`_ from which the TracHTMLSanitizer 530 has evolved. 531 532 .. _genshi.HTMLSanitizer: 533 http://genshi.edgewall.org/wiki/Documentation/filters.html#html-sanitizer 534 535 """ 536 537 # TODO: check from time to time if there are any upstream changes 538 # we could integrate. 539 540 SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b', 541 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 542 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 543 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 544 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map', 545 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp', 546 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table', 547 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u', 548 'ul', 'var']) 549 550 SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey', 551 'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding', 552 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 553 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime', 554 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height', 555 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang', 556 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 557 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 558 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 559 'span', 'src', 'start', 'style', 560 'summary', 'tabindex', 'target', 'title', 561 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) 562 563 SAFE_CSS = frozenset([ 564 # CSS 3 properties <http://www.w3.org/TR/CSS/#properties> 565 'background', 'background-attachment', 'background-color', 566 'background-image', 'background-position', 'background-repeat', 567 'border', 'border-bottom', 'border-bottom-color', 568 'border-bottom-style', 'border-bottom-left-radius', 569 'border-bottom-right-radius', 'border-bottom-width', 570 'border-collapse', 'border-color', 'border-left', 'border-left-color', 571 'border-left-style', 'border-left-width', 'border-radius', 572 'border-right', 'border-right-color', 'border-right-style', 573 'border-right-width', 'border-spacing', 'border-style', 'border-top', 574 'border-top-color', 'border-top-left-radius', 'border-top-right-radius', 575 'border-top-style', 'border-top-width', 'border-width', 'bottom', 576 'caption-side', 'clear', 'clip', 'color', 'content', 577 'counter-increment', 'counter-reset', 'cursor', 'direction', 578 'display', 'empty-cells', 'float', 'font', 'font-family', 'font-size', 579 'font-style', 'font-variant', 'font-weight', 'height', 'left', 580 'letter-spacing', 'line-height', 'list-style', 'list-style-image', 581 'list-style-position', 'list-style-type', 'margin', 'margin-bottom', 582 'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width', 583 'min-height', 'min-width', 'opacity', 'orphans', 'outline', 584 'outline-color', 'outline-style', 'outline-width', 'overflow', 585 'padding', 'padding-bottom', 'padding-left', 
'padding-right', 586 'padding-top', 'page-break-after', 'page-break-before', 587 'page-break-inside', 'position', 'quotes', 'right', 'table-layout', 588 'text-align', 'text-decoration', 'text-indent', 'text-transform', 589 'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space', 590 'widows', 'width', 'word-spacing', 'z-index', 591 ]) 592 593 SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None]) 594 595 URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc', 596 'src']) 597 598 SAFE_CROSS_ORIGINS = frozenset(['data:']) 599
600 - def __init__(self, safe_schemes=SAFE_SCHEMES, safe_css=SAFE_CSS, 601 safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS, 602 uri_attrs=URI_ATTRS, safe_origins=SAFE_CROSS_ORIGINS):
603 """Note: safe_schemes and safe_css have to remain the first 604 parameters, for backward-compatibility purpose. 605 """ 606 self.safe_tags = safe_tags 607 # The set of tag names that are considered safe. 608 self.safe_attrs = safe_attrs 609 # The set of attribute names that are considered safe. 610 self.safe_css = safe_css 611 # The set of CSS properties that are considered safe. 612 self.uri_attrs = uri_attrs 613 # The set of names of attributes that may contain URIs. 614 self.safe_schemes = safe_schemes 615 # The set of URI schemes that are considered safe. 616 self.safe_origins = safe_origins
617 # The set of URI cross origins that are considered safe. 618 619 # IE6 <http://heideri.ch/jso/#80> 620 _EXPRESSION_SEARCH = re.compile( 621 u'[eE\uFF25\uFF45]' # FULLWIDTH LATIN CAPITAL LETTER E 622 # FULLWIDTH LATIN SMALL LETTER E 623 u'[xX\uFF38\uFF58]' # FULLWIDTH LATIN CAPITAL LETTER X 624 # FULLWIDTH LATIN SMALL LETTER X 625 u'[pP\uFF30\uFF50]' # FULLWIDTH LATIN CAPITAL LETTER P 626 # FULLWIDTH LATIN SMALL LETTER P 627 u'[rR\u0280\uFF32\uFF52]' # LATIN LETTER SMALL CAPITAL R 628 # FULLWIDTH LATIN CAPITAL LETTER R 629 # FULLWIDTH LATIN SMALL LETTER R 630 u'[eE\uFF25\uFF45]' # FULLWIDTH LATIN CAPITAL LETTER E 631 # FULLWIDTH LATIN SMALL LETTER E 632 u'[sS\uFF33\uFF53]{2}' # FULLWIDTH LATIN CAPITAL LETTER S 633 # FULLWIDTH LATIN SMALL LETTER S 634 u'[iI\u026A\uFF29\uFF49]' # LATIN LETTER SMALL CAPITAL I 635 # FULLWIDTH LATIN CAPITAL LETTER I 636 # FULLWIDTH LATIN SMALL LETTER I 637 u'[oO\uFF2F\uFF4F]' # FULLWIDTH LATIN CAPITAL LETTER O 638 # FULLWIDTH LATIN SMALL LETTER O 639 u'[nN\u0274\uFF2E\uFF4E]' # LATIN LETTER SMALL CAPITAL N 640 # FULLWIDTH LATIN CAPITAL LETTER N 641 # FULLWIDTH LATIN SMALL LETTER N 642 ).search 643 644 # IE6 <http://openmya.hacker.jp/hasegawa/security/expression.txt> 645 # 7) Particular bit of Unicode characters 646 _URL_FINDITER = re.compile( 647 u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer 648
649 - def sanitize(self, html):
650 """Transforms the incoming HTML by removing anything's that deemed 651 unsafe. 652 653 :param html: the input HTML 654 :type: basestring 655 :return: the sanitized content 656 :rtype: Markup 657 658 """ 659 transform = HTMLSanitization(self, io.StringIO()) 660 transform.feed(html) 661 transform.close() 662 return Markup(transform.out.getvalue())
663 664 if genshi:
665 - def __call__(self, stream):
666 """Apply the filter to the given stream. 667 668 :deprecated: the ability to behave as a Genshi filter will be 669 removed in Trac 1.5.1. 670 671 :param stream: the markup event stream to filter 672 """ 673 waiting_for = None 674 675 for kind, data, pos in stream: 676 if kind is START: 677 if waiting_for: 678 continue 679 tag, attrs = data 680 if not self.is_safe_elem(tag, attrs): 681 waiting_for = tag 682 continue 683 new_attrs = self.sanitize_attrs(tag, dict(attrs)) 684 yield kind, (tag, Attrs(new_attrs.iteritems())), pos 685 686 elif kind is END: 687 tag = data 688 if waiting_for: 689 if waiting_for == tag: 690 waiting_for = None 691 else: 692 yield kind, data, pos 693 694 elif kind is not COMMENT: 695 if not waiting_for: 696 yield kind, data, pos
697
698 - def is_safe_css(self, prop, value):
699 """Determine whether the given css property declaration is to be 700 considered safe for inclusion in the output. 701 702 """ 703 if prop not in self.safe_css: 704 return False 705 # Position can be used for phishing, 'static' excepted 706 if prop == 'position': 707 return value.lower() == 'static' 708 # Negative margins can be used for phishing 709 if prop.startswith('margin'): 710 return '-' not in value 711 return True
712
713 - def is_safe_elem(self, tag, attrs):
714 """Determine whether the given element should be considered safe for 715 inclusion in the output. 716 717 :param tag: the tag name of the element 718 :type tag: QName or basestring 719 :param attrs: the element attributes 720 :type attrs: Attrs or list 721 :return: whether the element should be considered safe 722 :rtype: bool 723 724 """ 725 if tag not in self.safe_tags: 726 return False 727 if hasattr(tag, 'localname'): # in Genshi QName 728 tag = tag.localname 729 if tag == 'input': 730 # TODO (1.5.1) no more Attrs 731 if Attrs and isinstance(attrs, Attrs): 732 input_type = attrs.get('type', '').lower() 733 if input_type == 'password': 734 return False 735 else: 736 if ('type', 'password') in attrs: 737 return False 738 return True
739
740 - def is_safe_uri(self, uri):
741 """Determine whether the given URI is to be considered safe for 742 inclusion in the output. 743 744 The default implementation checks whether the scheme of the URI is in 745 the set of allowed URIs (`safe_schemes`). 746 747 >>> sanitizer = TracHTMLSanitizer() 748 >>> sanitizer.is_safe_uri('http://example.org/') 749 True 750 >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)') 751 False 752 753 :param uri: the URI to check 754 :return: `True` if the URI can be considered safe, `False` otherwise 755 :rtype: `bool` 756 757 """ 758 if '#' in uri: 759 uri = uri.split('#', 1)[0] # Strip out the fragment identifier 760 if ':' not in uri: 761 return True # This is a relative URI 762 chars = [char for char in uri.split(':', 1)[0] if char.isalnum()] 763 return ''.join(chars).lower() in self.safe_schemes
764
765 - def sanitize_attrs(self, tag, attrs):
766 """Remove potentially dangerous attributes and sanitize the style 767 attribute . 768 769 :param tag: the tag name of the element 770 :type attrs: dict corresponding to tag attributes 771 :return: a dict containing only safe or sanitized attributes 772 :rtype: dict 773 774 """ 775 new_attrs = {} 776 for attr, value in attrs.iteritems(): 777 if value is None: 778 value = attr 779 if attr not in self.safe_attrs: 780 continue 781 elif attr in self.uri_attrs: 782 # Don't allow URI schemes such as "javascript:" 783 if not self.is_safe_uri(value): 784 continue 785 elif attr == 'style': 786 # Remove dangerous CSS declarations from inline styles 787 decls = self.sanitize_css(value) 788 if not decls: 789 continue 790 value = '; '.join(decls) 791 new_attrs[attr] = value 792 if tag == 'img' and 'src' in new_attrs and \ 793 not self._is_safe_origin(new_attrs['src']): 794 attr = 'crossorigin' 795 if QName and isinstance(tag, QName): 796 attr = QName(attr) 797 new_attrs[attr] = 'anonymous' 798 return new_attrs
799
800 - def sanitize_css(self, text):
801 """Remove potentially dangerous property declarations from CSS code. 802 803 In particular, properties using the CSS ``url()`` function 804 with a scheme that is not considered safe are removed: 805 806 >>> sanitizer = TracHTMLSanitizer() 807 >>> sanitizer.sanitize_css(u''' 808 ... background: url(javascript:alert("foo")); 809 ... color: #000; 810 ... ''') 811 [u'color: #000'] 812 813 Also, the proprietary Internet Explorer function 814 ``expression()`` is always stripped: 815 816 >>> sanitizer.sanitize_css(u''' 817 ... background: #fff; 818 ... color: #000; 819 ... width: e/**/xpression(alert("F")); 820 ... ''') 821 [u'background: #fff', u'color: #000', u'width: e xpression(alert("F"))'] 822 823 :param text: the CSS text; this is expected to be `unicode` and to not 824 contain any character or numeric references 825 :return: a list of declarations that are considered safe 826 :rtype: `list` 827 828 """ 829 decls = [] 830 text = self._strip_css_comments(self._replace_unicode_escapes(text)) 831 for decl in filter(None, text.split(';')): 832 decl = decl.strip() 833 if not decl: 834 continue 835 try: 836 prop, value = decl.split(':', 1) 837 except ValueError: 838 continue 839 if not self.is_safe_css(prop.strip().lower(), value.strip()): 840 continue 841 if not self._EXPRESSION_SEARCH(decl) and \ 842 all(self._is_safe_origin(match.group(1)) 843 for match in self._URL_FINDITER(decl)): 844 decls.append(decl.strip()) 845 return decls
846 847 _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub 848 _UNICODE_ESCAPE = re.compile( 849 r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""", 850 re.UNICODE).sub 851
852 - def _is_safe_origin(self, uri):
853 return (self.is_safe_uri(uri) and 854 is_safe_origin(self.safe_origins, uri))
855
856 - def _replace_unicode_escapes(self, text):
857 def _repl(match): 858 t = match.group(1) 859 if t: 860 code = int(t, 16) 861 chr = _unichr(code) 862 if code <= 0x1f: 863 # replace space character because IE ignores control 864 # characters 865 chr = ' ' 866 elif chr == '\\': 867 chr = r'\\' 868 return chr 869 t = match.group(2) 870 if t == '\\': 871 return r'\\' 872 else: 873 return t
874 return self._UNICODE_ESCAPE(_repl, 875 self._NORMALIZE_NEWLINES('\n', text))
876 877 _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub 878
879 - def _strip_css_comments(self, text):
880 """Replace comments with space character instead of superclass which 881 removes comments to avoid problems when nested comments. 882 """ 883 return self._CSS_COMMENTS(' ', text)
884 885
886 -class Deuglifier(object):
887 """Help base class used for cleaning up HTML riddled with ``<FONT 888 COLOR=...>`` tags and replace them with appropriate ``<span 889 class="...">``. 890 891 The subclass must define a `rules()` static method returning a 892 list of regular expression fragments, each defining a capture 893 group in which the name will be reused for the span's class. Two 894 special group names, ``font`` and ``endfont`` are used to emit 895 ``<span>`` and ``</span>``, respectively. 896 897 """
898 - def __new__(cls):
899 self = object.__new__(cls) 900 if not hasattr(cls, '_compiled_rules'): 901 cls._compiled_rules = re.compile('(?:%s)' % '|'.join(cls.rules())) 902 self._compiled_rules = cls._compiled_rules 903 return self
904
905 - def format(self, indata):
906 return re.sub(self._compiled_rules, self.replace, indata)
907
908 - def replace(self, fullmatch):
909 for mtype, match in fullmatch.groupdict().items(): 910 if match: 911 if mtype == 'font': 912 return '<span>' 913 elif mtype == 'endfont': 914 return '</span>' 915 return '<span class="code-%s">' % mtype
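A hypothetical subclass, shown only to illustrate the `rules()` contract described in the class docstring (the rule names and colors are made up and not taken from Trac's own deuglifiers):

    class ColorDeuglifier(Deuglifier):
        @staticmethod
        def rules():
            # each named group becomes a CSS class "code-<name>";
            # 'font' and 'endfont' map to plain <span> / </span>
            return [r'(?P<comment><font color="#?808080">)',
                    r'(?P<keyword><font color="#?0000ff">)',
                    r'(?P<font><font.*?>)',
                    r'(?P<endfont></font>)']

    >>> ColorDeuglifier().format('<font color="#0000ff">def</font> f():')
    '<span class="code-keyword">def</span> f():'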
916 917
918 -class HTMLTransform(HTMLParser):
919 """Convenience base class for writing HTMLParsers. 920 921 The default implementation of the HTMLParser ``handle_*`` methods 922 do nothing, while in our case we try to rewrite the incoming 923 document unmodified. 924 925 """ 926
927 - def __init__(self, out):
928 HTMLParser.__init__(self) 929 self.out = out 930 if isinstance(out, io.TextIOBase): 931 self._convert = lambda v: v.decode('utf-8') \ 932 if isinstance(v, bytes) else v 933 elif isinstance(out, io.IOBase): 934 self._convert = lambda v: v.encode('utf-8') \ 935 if isinstance(v, unicode) else v 936 else: 937 self._convert = lambda v: v
938
939 - def handle_starttag(self, tag, attrs):
940 self._write(self.get_starttag_text())
941
942 - def handle_startendtag(self, tag, attrs):
943 self._write(self.get_starttag_text())
944
945 - def handle_charref(self, name):
946 self._handle_charref(name)
947
948 - def handle_entityref(self, name):
949 self._handle_entityref(name)
950
951 - def handle_comment(self, data):
952 self._write('<!--%s-->' % data)
953
954 - def handle_decl(self, data):
955 self._write('<!%s>' % data)
956
957 - def handle_pi(self, data):
958 self._write('<?%s?>' % data)
959
960 - def handle_data(self, data):
961 self._write(data)
962
963 - def handle_endtag(self, tag):
964 self._write('</' + tag + '>')
965
966 - def unescape(self, s):
967 return _html_parser_unescape(s)
968 969 _codepoint2ref = {38: '&amp;', 60: '&lt;', 62: '&gt;', 34: '&#34;'} 970
971 - def _handle_charref(self, name):
972 if name.startswith(('x', 'X')): 973 codepoint = int(name[1:], 16) 974 else: 975 codepoint = int(name) 976 if 0 <= codepoint <= 0x10ffff: 977 text = self._codepoint2ref.get(codepoint) or _unichr(codepoint) 978 else: 979 text = '&amp;#%s;' % name 980 self._write(text)
981
982 - def _handle_entityref(self, name):
983 try: 984 codepoint = _name2codepoint[name] 985 except KeyError: 986 text = '&amp;%s;' % name 987 else: 988 text = self._codepoint2ref.get(codepoint) or _unichr(codepoint) 989 self._write(text)
990
991 - def _write(self, data):
992 self.out.write(self._convert(data))
993 994
995 -class FormTokenInjector(HTMLTransform):
996 """Identify and protect forms from CSRF attacks. 997 998 This filter works by adding a input type=hidden field to POST 999 forms. 1000 1001 """
1002 - def __init__(self, form_token, out):
1003 HTMLTransform.__init__(self, out) 1004 self.token = form_token
1005
1006 - def handle_starttag(self, tag, attrs):
1007 HTMLTransform.handle_starttag(self, tag, attrs) 1008 if tag.lower() == 'form': 1009 for name, value in attrs: 1010 if name == 'method' and value.lower() == 'post': 1011 self._write('<input type="hidden" name="__FORM_TOKEN"' 1012 ' value="%s"/>' % self.token) 1013 break
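A short usage sketch (not part of the module source; the token value is arbitrary):

    >>> import io
    >>> from trac.util.html import FormTokenInjector
    >>> out = io.StringIO()
    >>> injector = FormTokenInjector(u'123abc', out)
    >>> injector.feed(u'<form method="post"></form>')
    >>> injector.close()
    >>> out.getvalue()
    u'<form method="post"><input type="hidden" name="__FORM_TOKEN" value="123abc"/></form>'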
1014
1015 -class HTMLSanitization(HTMLTransform):
1016 """Sanitize parsed HTML using TracHTMLSanitizer.""" 1017
1018 - def __init__(self, sanitizer, out):
1019 HTMLTransform.__init__(self, out) 1020 self.sanitizer = sanitizer 1021 self.waiting_for = None
1022
1023 - def _handle_start(self, tag, attrs, startend):
1024 if self.waiting_for: 1025 return 1026 if not self.sanitizer.is_safe_elem(tag, attrs): 1027 self.waiting_for = tag 1028 return 1029 1030 new_attrs = self.sanitizer.sanitize_attrs(tag, dict(attrs)) 1031 html_attrs = ''.join(' %s="%s"' % (name, escape(value)) 1032 for name, value in new_attrs.iteritems()) 1033 self._write('<%s%s%s>' % (tag, html_attrs, startend))
1034
1035 - def handle_starttag(self, tag, attrs):
1036 if not self.waiting_for: 1037 self._handle_start(tag, attrs, '')
1038
1039 - def handle_startendtag(self, tag, attrs):
1040 if not self.waiting_for: 1041 self._handle_start(tag, attrs, '/')
1042
1043 - def handle_charref(self, name):
1044 if not self.waiting_for: 1045 self._handle_charref(name)
1046
1047 - def handle_entityref(self, name):
1048 if not self.waiting_for: 1049 self._handle_entityref(name)
1050
1051 - def handle_comment(self, data):
1052 pass
1053
1054 - def handle_decl(self, data):
1055 if not self.waiting_for: 1056 self._write('<!%s>' % data)
1057
1058 - def handle_pi(self, data):
1059 if not self.waiting_for: 1060 self._write('<?%s?>' % data.replace('?>', ''))
1061
1062 - def handle_data(self, data):
1063 if not self.waiting_for: 1064 self._write(escape(data))
1065
1066 - def handle_endtag(self, tag):
1067 if self.waiting_for: 1068 if self.waiting_for == tag: 1069 self.waiting_for = None 1070 else: 1071 self._write('</' + tag + '>')
1072 1073
1074 -def plaintext(text, keeplinebreaks=True):
1075 """Extract the text elements from (X)HTML content 1076 1077 >>> plaintext('<b>1 &lt; 2</b>') 1078 u'1 < 2' 1079 1080 >>> plaintext(tag('1 ', tag.b('<'), ' 2')) 1081 u'1 < 2' 1082 1083 >>> plaintext('''<b>1 1084 ... &lt; 1085 ... 2</b>''', keeplinebreaks=False) 1086 u'1 < 2' 1087 1088 :param text: `unicode` or `Fragment` 1089 :param keeplinebreaks: optionally keep linebreaks 1090 1091 """ 1092 if LazyProxy and isinstance(text, LazyProxy): 1093 text = text.value 1094 if isinstance(text, Fragment): 1095 text = text.as_text() 1096 else: 1097 text = stripentities(striptags(text)) 1098 if not keeplinebreaks: 1099 text = text.replace(u'\n', u' ') 1100 return text
1101 1102
1103 -def find_element(frag, attr=None, cls=None, tag=None):
1104 """Return the first element in the fragment having the given 1105 attribute, class or tag, using a preorder depth-first search. 1106 1107 """ 1108 if LazyProxy and isinstance(frag, LazyProxy): 1109 frag = frag.value 1110 if isinstance(frag, Element): 1111 if attr is not None and attr in frag.attrib: 1112 return frag 1113 if cls is not None and cls in frag.attrib.get('class', '').split(): 1114 return frag 1115 if tag is not None and tag == frag.tag: 1116 return frag 1117 if isinstance(frag, Fragment): 1118 for child in frag.children: 1119 elt = find_element(child, attr, cls, tag) 1120 if elt is not None: 1121 return elt
1122 1123
1124 -def is_safe_origin(safe_origins, uri, req=None):
1125 """Whether the given uri is a safe cross-origin.""" 1126 if not uri or ':' not in uri and not uri.startswith('//'): 1127 return True 1128 if any(safe == '*' for safe in safe_origins): 1129 return True 1130 if uri.startswith('//') and req: 1131 uri = '%s:%s' % (req.scheme, uri) 1132 1133 normalize_re = re.compile(r'(?:[a-zA-Z][-a-zA-Z0-9+._]*:)?//[^/]+$') 1134 1135 def normalize_uri(uri): 1136 if normalize_re.match(uri): 1137 uri += '/' 1138 return uri
1139 1140 uri = normalize_uri(uri) 1141 for safe in safe_origins: 1142 safe = normalize_uri(safe) 1143 if safe == uri: 1144 return True 1145 if safe.endswith(':') and uri.startswith(safe): 1146 return True 1147 if uri.startswith(safe if safe.endswith('/') else safe + '/'): 1148 return True 1149 return False 1150 1151
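A few representative checks (informal sketch, not part of the module source; the origins are made-up examples):

    >>> is_safe_origin(['https://example.org/', 'data:'],
    ...                'https://example.org/chrome/common.css')
    True
    >>> is_safe_origin(['https://example.org/'],
    ...                'https://attacker.example.net/x.js')
    False
    >>> is_safe_origin(['*'], 'ftp://anything.example.com/')
    True
    >>> is_safe_origin(['https://example.org/'], '/relative/path')
    True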
1152 -def to_fragment(input):
1153 """Convert input to a `Fragment` object.""" 1154 1155 while isinstance(input, TracError) or \ 1156 isinstance(input, Exception) and len(input.args) == 1: 1157 input = input.args[0] 1158 if LazyProxy and isinstance(input, LazyProxy): 1159 input = input.value 1160 if isinstance(input, Fragment): 1161 return input 1162 return tag(to_unicode(input))
1163 1164 1165 # Mappings for removal of control characters 1166 _translate_nop = ''.join(chr(i) for i in xrange(256)) 1167 _invalid_control_chars = ''.join(chr(i) for i in xrange(32) 1168 if i not in [0x09, 0x0a, 0x0d]) 1169
1170 -def valid_html_bytes(bytes):
1171 return bytes.translate(_translate_nop, _invalid_control_chars)
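Control characters other than tab, newline and carriage return are dropped, e.g. (informal sketch, not part of the module source):

    >>> valid_html_bytes(b'foo\x00bar\tbaz\r\n')
    'foobar\tbaz\r\n'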
1172 1173 1174 if sys.maxunicode > 0xffff: 1175 _unichr = unichr 1176 else:
1177 - def _unichr(codepoint): # narrow Python build
1178 try: 1179 return unichr(codepoint) 1180 except ValueError: 1181 if not (0 <= codepoint <= 0x10ffff): 1182 raise 1183 s = r'\U%08x' % codepoint 1184 try: 1185 return s.decode('unicode-escape') 1186 except Exception as e: 1187 raise ValueError(e) 1188 1189 1190 _reference_re = re.compile(r'&(?:#[xX][0-9a-fA-F]+|#[0-9]+|\w{1,8});') 1191
1192 -def _html_parser_unescape(s):
1193 """This is to avoid an issue which HTMLParser.unescape() raises 1194 ValueError or OverflowError from unichr() when character reference 1195 with a large integer in the attribute. 1196 """ 1197 1198 def repl(match): 1199 match = match.group(0) 1200 name = match[1:-1] 1201 if name.startswith(('#x', '#X')): 1202 codepoint = int(name[2:], 16) 1203 elif name.startswith('#'): 1204 codepoint = int(name[1:]) 1205 else: 1206 try: 1207 codepoint = _name2codepoint[name] 1208 except KeyError: 1209 return match 1210 if 0 <= codepoint <= 0x10ffff: 1211 return _unichr(codepoint) 1212 else: 1213 return match
1214 1215 return _reference_re.sub(repl, s) 1216 1217 1218 if genshi:
1219 - class GenshiHTMLParserFixup(genshi.input.HTMLParser):
1220
1221 - def handle_starttag(self, tag, attrib):
1222 fixed_attrib = [(QName(name), name if value is None else value) 1223 for name, value in attrib] 1224 self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) 1225 if tag in self._EMPTY_ELEMS: 1226 self._enqueue(END, QName(tag)) 1227 else: 1228 self._open_tags.append(tag)
1229
1230 - def handle_charref(self, name):
1231 if name.startswith(('x', 'X')): 1232 codepoint = int(name[1:], 16) 1233 else: 1234 codepoint = int(name) 1235 if 0 <= codepoint <= 0x10ffff: 1236 text = _unichr(codepoint) 1237 else: 1238 text = '&#%s;' % name 1239 self._enqueue(TEXT, text)
1240
1241 - def handle_entityref(self, name):
1242 text = None 1243 try: 1244 codepoint = _name2codepoint[name] 1245 except KeyError: 1246 pass 1247 else: 1248 if 0 <= codepoint <= 0x10ffff: 1249 text = _unichr(codepoint) 1250 self._enqueue(TEXT, text or '&%s;' % name)
1251
1252 - def unescape(self, s):
1253 return _html_parser_unescape(s)
1254 1255
1256 - def HTML(text, encoding=None):
1257 if isinstance(text, unicode): 1258 f = io.StringIO(text) 1259 encoding = None 1260 else: 1261 f = io.BytesIO(text) 1262 parser = GenshiHTMLParserFixup(f, encoding=encoding) 1263 return Stream(list(parser))
1264 1265
1266 - def expand_markup(stream, ctxt=None):
1267 """A Genshi stream filter for expanding `genshi.Markup` events. 1268 1269 :deprecated: will be removed in Trac 1.5.1. 1270 1271 Note: Expansion may not be possible if the fragment is badly 1272 formed, or partial. 1273 1274 """ 1275 for event in stream: 1276 if isinstance(event[1], Markup): 1277 try: 1278 for subevent in HTML(event[1]): 1279 yield subevent 1280 except ParseError: 1281 yield event 1282 else: 1283 yield event
1284 else: 1285 expand_markup = None 1286