1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 import io
22 import re
23 import sys
24 from HTMLParser import HTMLParser
25 import htmlentitydefs as entities
26
27 from markupsafe import Markup, escape as escape_quotes
28
29 """Utilities for producing HTML content.
30
31 Imports related to the legacy Genshi template engine should all go
32 through this module:
33
34 from trac.util.html import genshi, Stream
35
36 If Genshi is not installed, `genshi` and all related symbols will be
37 `None`.
38
39 """
40
41 try:
42 import genshi
43 import genshi.input
44 from genshi.core import Attrs, QName, Stream, COMMENT, START, END, TEXT
45 from genshi.input import ParseError
47 return Markup(stream.render('xhtml', encoding=None,
48 strip_whitespace=False))
49 except ImportError:
50 genshi = stream_to_unicode = None
51 HTML = COMMENT = START = END = TEXT = Attrs = QName = Stream = None
52 ParseError = None
53
54 try:
55 from babel.support import LazyProxy
56 except ImportError:
57 LazyProxy = None
58
59 from trac.core import TracError
60 from trac.util.text import to_unicode
61
62 __all__ = ['Deuglifier', 'FormTokenInjector', 'TracHTMLSanitizer', 'escape',
63 'find_element', 'html', 'is_safe_origin', 'plaintext', 'tag',
64 'to_fragment', 'stripentities', 'striptags', 'valid_html_bytes',
65 'unescape']
66
67
# Private entity-name -> code point table: the standard table plus
# ``apos`` (code point 39, the apostrophe).  Built as a new dict so the
# table of the `entities` module itself is never modified.
_name2codepoint = dict(entities.name2codepoint, apos=39)
70
71
73 """Create a Markup instance from a string and escape special characters
74 it may contain (<, >, & and \").
75
76 :param text: the string to escape; if not a string, it is assumed that
77 the input can be converted to a string
78 :param quotes: if ``True``, double quote characters are escaped in
79 addition to the other special characters
80
81 >>> escape('"1 < 2"')
82 Markup(u'&#34;1 &lt; 2&#34;')
83
84 >>> escape(['"1 < 2"'])
85 Markup(u"['&#34;1 &lt; 2&#34;']")
86
87 If the `quotes` parameter is set to `False`, the \" character is left
88 as is. Escaping quotes is generally only required for strings that are
89 to be used in attribute values.
90
91 >>> escape('"1 < 2"', quotes=False)
92 Markup(u'"1 &lt; 2"')
93
94 >>> escape(['"1 < 2"'], quotes=False)
95 Markup(u'[\\'"1 &lt; 2"\\']')
96
97 However, `escape` behaves slightly differently with `Markup` and
98 `Fragment` instances, as they are passed through
99 unmodified.
100
101 >>> escape(Markup('"1 < 2 &#39;&#34;'))
102 Markup(u'"1 < 2 &#39;&#34;')
103
104 >>> escape(Markup('"1 < 2 &#39;&#34;'), quotes=False)
105 Markup(u'"1 < 2 &#39;&#34;')
106
107 >>> escape(tag.b('"1 < 2"'))
108 Markup(u'<b>"1 &lt; 2"</b>')
109
110 >>> escape(tag.b('"1 < 2"'), quotes=False)
111 Markup(u'<b>"1 &lt; 2"</b>')
112
113 :return: the escaped `Markup` string
114 :rtype: `Markup`
115
116 """
117 if LazyProxy and isinstance(str, LazyProxy):
118 str = str.value
119 if isinstance(str, Markup):
120 return str
121 if isinstance(str, Fragment):
122 return Markup(str)
123 e = escape_quotes(str)
124 if quotes:
125 if ''' not in e:
126 return e
127 return Markup(unicode(e).replace(''', "'"))
128 elif '' not in e:
129 return e
130 return Markup(unicode(e).replace('"', '"').replace(''', "'"))
131
132
134 """Reverse-escapes &, <, >, and \" and returns a `unicode` object.
135
136 >>> unescape(Markup('1 &lt; 2'))
137 u'1 < 2'
138
139 If the provided `text` object is not a `Markup` instance, it is returned
140 unchanged.
141
142 >>> unescape('1 < 2')
143 '1 < 2'
144
145 :param text: the text to unescape
146 :return: the unescaped string
147 :rtype: `unicode`
148 """
149 if not text:
150 return ''
151 if not isinstance(text, Markup):
152 return text
153 return text.unescape()
154
155
# Matches one character/entity reference:
#   group 1 - a numeric reference, decimal or ``x``-prefixed hexadecimal,
#             whose terminating ``;`` is optional;
#   group 2 - a named reference, whose terminating ``;`` is required.
_STRIPENTITIES_RE = re.compile(
    r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)'
)
158 """Return a copy of the given text with any character or numeric entities
159 replaced by the equivalent UTF-8 characters.
160
161 >>> stripentities('1 &lt; 2')
162 u'1 < 2'
163 >>> stripentities('more &hellip;')
164 u'more \u2026'
165 >>> stripentities('&#8230;')
166 u'\u2026'
167 >>> stripentities('&#x2026;')
168 u'\u2026'
169 >>> stripentities(Markup(u'\u2026'))
170 u'\u2026'
171
172 If the `keepxmlentities` parameter is provided and is a truth value, the
173 core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact.
174
175 >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True)
176 u'1 &lt; 2 \u2026'
177
178 :return: a `unicode` instance with entities removed
179 :rtype: `unicode`
180 """
181 def _replace_entity(match):
182 if match.group(1):
183 ref = match.group(1)
184 if ref.startswith(('x', 'X')):
185 ref = int(ref[1:], 16)
186 else:
187 ref = int(ref, 10)
188 return _unichr(ref)
189 else:
190 ref = match.group(2)
191 if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
192 return '&%s;' % ref
193 try:
194 return _unichr(_name2codepoint[ref])
195 except KeyError:
196 if keepxmlentities:
197 return '&%s;' % ref
198 else:
199 return ref
200 if isinstance(text, Markup):
201 text = unicode(text)
202 return _STRIPENTITIES_RE.sub(_replace_entity, text)
203
204
225
226
227
228
229
# Value pairs for enumerated attributes.  They are indexed with
# ``bool(value)``: position 0 is rendered for a false value, position 1
# for a true one.
NO_YES = ('no', 'yes')
OFF_ON = ('off', 'on')
FALSE_TRUE = ('false', 'true')

# Attributes whose value is not rendered verbatim.
#
# - ``None`` marks a boolean attribute: a true value is rendered as the
#   attribute's own name, a false value drops the attribute.
# - a 2-tuple marks an enumerated attribute (see the pairs above).
SPECIAL_HTML_ATTRS = dict.fromkeys((
    'autofocus', 'autoplay', 'checked', 'controls',
    'default', 'defer', 'disabled', 'formnovalidate', 'hidden',
    'ismap', 'loop', 'multiple', 'muted', 'novalidate',
    'open', 'readonly', 'required', 'reversed', 'scoped',
    'seamless', 'selected',
))
SPECIAL_HTML_ATTRS.update(
    dict.fromkeys(('contenteditable', 'draggable', 'spellcheck'),
                  FALSE_TRUE),
    translate=NO_YES,
    autocomplete=OFF_ON,
)
# Added by key: ``async`` is a reserved word in recent Python versions and
# cannot be spelled as a keyword argument there.
SPECIAL_HTML_ATTRS['async'] = None
245
247 """Returns the actual value for the attribute ``key``, for the given
248 ``value``.
249
250 This follows the rules described in the HTML5_ spec (Double-quoted
251 attribute value syntax).
252
253 .. _HTML5: https://www.w3.org/TR/html-markup/global-attributes.html#global-attributes
254
255 In addition, it treats the ``'class'`` and the ``'style'``
256 attributes in a special way, as it processes them through
257 `classes` and `styles`.
258
259 :rtype: a `Markup` object containing the escaped attribute value,
260 but it can also be `None` to indicate that the attribute
261 should be omitted from the output
262
263 """
264 if key == 'class':
265 if isinstance(val, dict):
266 val = classes(**val) or None
267 elif isinstance(val, list):
268 val = classes(*val) or None
269 elif key == 'style':
270 if isinstance(val, list):
271 val = styles(*val) or None
272 else:
273 val = styles(val) or None
274 else:
275 if key in SPECIAL_HTML_ATTRS:
276 values = SPECIAL_HTML_ATTRS[key]
277 if values is None:
278 val = key if val else None
279 else:
280 val = values[bool(val)]
281 return None if val is None else escape(val)
282
284 """Helper function for dynamically assembling a list of CSS class
285 names in templates.
286
287 Any positional arguments are added to the list of class names. All
288 positional arguments must be strings:
289
290 >>> classes('foo', 'bar')
291 u'foo bar'
292
293 In addition, the names of any supplied keyword arguments are added
294 if they have a truth value:
295
296 >>> classes('foo', bar=True)
297 u'foo bar'
298 >>> classes('foo', bar=False)
299 u'foo'
300
301 If none of the arguments are added to the list, this function
302 returns `''`:
303
304 >>> classes(bar=False)
305 u''
306
307 """
308 classes = list(filter(None, args)) + [k for k, v in kwargs.items() if v]
309 return u' '.join(classes)
310
312 """Helper function for dynamically assembling a list of CSS style name
313 and values in templates.
314
315 Any positional arguments are added to the list of styles. All
316 positional arguments must be strings or dicts:
317
318 >>> styles('foo: bar', 'fu: baz', {'bottom-right': '1em'})
319 u'foo: bar; fu: baz; bottom-right: 1em'
320
321 In addition, the names of any supplied keyword arguments are added
322 if they have a string value:
323
324 >>> styles(foo='bar', fu='baz')
325 u'foo: bar; fu: baz'
326 >>> styles(foo='bar', bar=False)
327 u'foo: bar'
328
329 If none of the arguments are added to the list, this function
330 returns `''`:
331
332 >>> styles(bar=False)
333 u''
334
335 """
336 args = list(filter(None, args))
337 d = {}
338 styles = []
339 for arg in args:
340 if isinstance(arg, dict):
341 d.update(arg)
342 else:
343 styles.append(arg)
344 d.update(kwargs)
345 styles.extend('%s: %s' % (k, v) for k, v in d.iteritems() if v)
346 return u'; '.join(styles)
347
348
350 """A fragment represents a sequence of strings or elements."""
351
352 __slots__ = ('children',)
353
358
360 return Markup(unicode(self))
361
364
367
369 return Fragment(self, other)
370
372 for arg in args:
373 self.append(arg)
374 return self
375
377 global genshi
378 if arg:
379 if isinstance(arg, (Fragment, basestring, int, float, long)):
380 self.children.append(arg)
381 elif genshi and isinstance(arg, Stream):
382
383 self.children.append(stream_to_unicode(arg))
384 else:
385
386 try:
387 for elt in arg:
388 self.append(elt)
389 except TypeError:
390 self.children.append(arg)
391 elif arg == 0:
392 self.children.append(u'0')
393
395 return u''.join(c.as_text() if isinstance(c, Fragment) else unicode(c)
396 for c in self.children)
397
398 if genshi:
400 """Genshi compatibility layer.
401
402 :deprecated: this will be removed in Trac 1.5.1.
403 """
404 yield TEXT, Markup(self), (None, -1, -1)
405
406
408 """An element represents an XML element, with a tag name, attributes
409 and content.
410
411 """
412
413 __slots__ = ('tag', 'attrib')
414
415 EMPTY_ATTRIB = {}
416
417 VOID_ELEMENTS = ()
418
419 CLOSE_TAG = u'/>'
420
def __init__(self, tag, *args, **kwargs):
    """Create an element named `tag`.

    :param tag: the element name; stored as ``unicode(tag)``
    :param args: child content, forwarded to `Fragment.__init__`
    :param kwargs: the element attributes, converted with
                   `_dict_from_kwargs`
    """
    Fragment.__init__(self, *args)
    self.tag = unicode(tag)
    if kwargs:
        self.attrib = self._dict_from_kwargs(kwargs)
    else:
        # No attributes given: reference the class-level empty dict
        # rather than allocating a new one per element.
        self.attrib = self.EMPTY_ATTRIB
426
429
431 attrs = []
432 for k, v in kwargs.iteritems():
433 if v is not None:
434 if k[-1:] == '_':
435 k = k[:-1]
436 v = self._attr_value(k, v)
437 if v is not None:
438 attrs.append((k, escape(v)))
439 return dict(attrs)
440
442 if kwargs:
443 d = self._dict_from_kwargs(kwargs)
444 if d:
445 if self.attrib:
446 self.attrib.update(d)
447 else:
448 self.attrib = d
449 for arg in args:
450 self.append(arg)
451 return self
452
454 elt = u'<' + self.tag
455 if self.attrib:
456
457 attrs = []
458 for k in sorted(self.attrib):
459 v = self.attrib[k]
460 if v:
461 attrs.append(' %s="%s"' % (k, v))
462 if attrs:
463 elt += u''.join(attrs)
464 if self.children or (self.VOID_ELEMENTS and
465 self.tag not in self.VOID_ELEMENTS):
466 elt += u'>' + Fragment.__unicode__(self) + u'</' + self.tag + u'>'
467 else:
468 elt += self.CLOSE_TAG
469 return elt
470
471
473 """An element represents an HTML element, with a tag name, attributes
474 and content.
475
476 Some elements and attributes are rendered specially, according to
477 the HTML5 specification (or going there...)
478
479 """
480
481 VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'command', 'embed', 'hr',
482 'img', 'input', 'keygen', 'link', 'meta', 'param',
483 'source', 'track', 'wbr'}
484 CLOSE_TAG = u' />'
485
486 __slots__ = ()
487
489 return html_attribute(k, v)
490
491
493 """An XML element factory can be used to build Fragments and
494 XMLElements for arbitrary tag names.
495
496 """
497
499 return Fragment(*args)
500
502 return XMLElement(tag)
503
504 xml = XMLElementFactory()
505
507 """An element factory can be used to build Fragments and Elements for
508 arbitrary tag names.
509
510 """
511
514
515 tag = html = ElementFactory()
516
517
519
520 """Sanitize HTML constructions which are potential vectors for
521 phishing or XSS attacks, in user-supplied HTML.
522
523 The usual way to use the sanitizer is to call the `sanitize`
524 method on some potentially unsafe HTML content.
525
526 Note that for backward compatibility, the TracHTMLSanitizer still
527 behaves as a Genshi filter.
528
529 See also `genshi.HTMLSanitizer`_ from which the TracHTMLSanitizer
530 has evolved.
531
532 .. _genshi.HTMLSanitizer:
533 http://genshi.edgewall.org/wiki/Documentation/filters.html#html-sanitizer
534
535 """
536
537
538
539
540 SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
541 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
542 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
543 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
544 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
545 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
546 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
547 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
548 'ul', 'var'])
549
550 SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
551 'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
552 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
553 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
554 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
555 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
556 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
557 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
558 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
559 'span', 'src', 'start', 'style',
560 'summary', 'tabindex', 'target', 'title',
561 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
562
563 SAFE_CSS = frozenset([
564
565 'background', 'background-attachment', 'background-color',
566 'background-image', 'background-position', 'background-repeat',
567 'border', 'border-bottom', 'border-bottom-color',
568 'border-bottom-style', 'border-bottom-left-radius',
569 'border-bottom-right-radius', 'border-bottom-width',
570 'border-collapse', 'border-color', 'border-left', 'border-left-color',
571 'border-left-style', 'border-left-width', 'border-radius',
572 'border-right', 'border-right-color', 'border-right-style',
573 'border-right-width', 'border-spacing', 'border-style', 'border-top',
574 'border-top-color', 'border-top-left-radius', 'border-top-right-radius',
575 'border-top-style', 'border-top-width', 'border-width', 'bottom',
576 'caption-side', 'clear', 'clip', 'color', 'content',
577 'counter-increment', 'counter-reset', 'cursor', 'direction',
578 'display', 'empty-cells', 'float', 'font', 'font-family', 'font-size',
579 'font-style', 'font-variant', 'font-weight', 'height', 'left',
580 'letter-spacing', 'line-height', 'list-style', 'list-style-image',
581 'list-style-position', 'list-style-type', 'margin', 'margin-bottom',
582 'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width',
583 'min-height', 'min-width', 'opacity', 'orphans', 'outline',
584 'outline-color', 'outline-style', 'outline-width', 'overflow',
585 'padding', 'padding-bottom', 'padding-left', 'padding-right',
586 'padding-top', 'page-break-after', 'page-break-before',
587 'page-break-inside', 'position', 'quotes', 'right', 'table-layout',
588 'text-align', 'text-decoration', 'text-indent', 'text-transform',
589 'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space',
590 'widows', 'width', 'word-spacing', 'z-index',
591 ])
592
593 SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])
594
595 URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
596 'src'])
597
598 SAFE_CROSS_ORIGINS = frozenset(['data:'])
599
603 """Note: safe_schemes and safe_css have to remain the first
604 parameters, for backward-compatibility purpose.
605 """
606 self.safe_tags = safe_tags
607
608 self.safe_attrs = safe_attrs
609
610 self.safe_css = safe_css
611
612 self.uri_attrs = uri_attrs
613
614 self.safe_schemes = safe_schemes
615
616 self.safe_origins = safe_origins
617
618
619
620 _EXPRESSION_SEARCH = re.compile(
621 u'[eE\uFF25\uFF45]'
622
623 u'[xX\uFF38\uFF58]'
624
625 u'[pP\uFF30\uFF50]'
626
627 u'[rR\u0280\uFF32\uFF52]'
628
629
630 u'[eE\uFF25\uFF45]'
631
632 u'[sS\uFF33\uFF53]{2}'
633
634 u'[iI\u026A\uFF29\uFF49]'
635
636
637 u'[oO\uFF2F\uFF4F]'
638
639 u'[nN\u0274\uFF2E\uFF4E]'
640
641
642 ).search
643
644
645
646 _URL_FINDITER = re.compile(
647 u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer
648
650 """Transforms the incoming HTML by removing anything that's deemed
651 unsafe.
652
653 :param html: the input HTML
654 :type: basestring
655 :return: the sanitized content
656 :rtype: Markup
657
658 """
659 transform = HTMLSanitization(self, io.StringIO())
660 transform.feed(html)
661 transform.close()
662 return Markup(transform.out.getvalue())
663
664 if genshi:
666 """Apply the filter to the given stream.
667
668 :deprecated: the ability to behave as a Genshi filter will be
669 removed in Trac 1.5.1.
670
671 :param stream: the markup event stream to filter
672 """
673 waiting_for = None
674
675 for kind, data, pos in stream:
676 if kind is START:
677 if waiting_for:
678 continue
679 tag, attrs = data
680 if not self.is_safe_elem(tag, attrs):
681 waiting_for = tag
682 continue
683 new_attrs = self.sanitize_attrs(tag, dict(attrs))
684 yield kind, (tag, Attrs(new_attrs.iteritems())), pos
685
686 elif kind is END:
687 tag = data
688 if waiting_for:
689 if waiting_for == tag:
690 waiting_for = None
691 else:
692 yield kind, data, pos
693
694 elif kind is not COMMENT:
695 if not waiting_for:
696 yield kind, data, pos
697
699 """Determine whether the given css property declaration is to be
700 considered safe for inclusion in the output.
701
702 """
703 if prop not in self.safe_css:
704 return False
705
706 if prop == 'position':
707 return value.lower() == 'static'
708
709 if prop.startswith('margin'):
710 return '-' not in value
711 return True
712
714 """Determine whether the given element should be considered safe for
715 inclusion in the output.
716
717 :param tag: the tag name of the element
718 :type tag: QName or basestring
719 :param attrs: the element attributes
720 :type attrs: Attrs or list
721 :return: whether the element should be considered safe
722 :rtype: bool
723
724 """
725 if tag not in self.safe_tags:
726 return False
727 if hasattr(tag, 'localname'):
728 tag = tag.localname
729 if tag == 'input':
730
731 if Attrs and isinstance(attrs, Attrs):
732 input_type = attrs.get('type', '').lower()
733 if input_type == 'password':
734 return False
735 else:
736 if ('type', 'password') in attrs:
737 return False
738 return True
739
741 """Determine whether the given URI is to be considered safe for
742 inclusion in the output.
743
744 The default implementation checks whether the scheme of the URI is in
745 the set of allowed URIs (`safe_schemes`).
746
747 >>> sanitizer = TracHTMLSanitizer()
748 >>> sanitizer.is_safe_uri('http://example.org/')
749 True
750 >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)')
751 False
752
753 :param uri: the URI to check
754 :return: `True` if the URI can be considered safe, `False` otherwise
755 :rtype: `bool`
756
757 """
758 if '#' in uri:
759 uri = uri.split('#', 1)[0]
760 if ':' not in uri:
761 return True
762 chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
763 return ''.join(chars).lower() in self.safe_schemes
764
766 """Remove potentially dangerous attributes and sanitize the style
767 attribute.
768
769 :param tag: the tag name of the element
770 :type attrs: dict corresponding to tag attributes
771 :return: a dict containing only safe or sanitized attributes
772 :rtype: dict
773
774 """
775 new_attrs = {}
776 for attr, value in attrs.iteritems():
777 if value is None:
778 value = attr
779 if attr not in self.safe_attrs:
780 continue
781 elif attr in self.uri_attrs:
782
783 if not self.is_safe_uri(value):
784 continue
785 elif attr == 'style':
786
787 decls = self.sanitize_css(value)
788 if not decls:
789 continue
790 value = '; '.join(decls)
791 new_attrs[attr] = value
792 if tag == 'img' and 'src' in new_attrs and \
793 not self._is_safe_origin(new_attrs['src']):
794 attr = 'crossorigin'
795 if QName and isinstance(tag, QName):
796 attr = QName(attr)
797 new_attrs[attr] = 'anonymous'
798 return new_attrs
799
801 """Remove potentially dangerous property declarations from CSS code.
802
803 In particular, properties using the CSS ``url()`` function
804 with a scheme that is not considered safe are removed:
805
806 >>> sanitizer = TracHTMLSanitizer()
807 >>> sanitizer.sanitize_css(u'''
808 ... background: url(javascript:alert("foo"));
809 ... color: #000;
810 ... ''')
811 [u'color: #000']
812
813 Also, the proprietary Internet Explorer function
814 ``expression()`` is always stripped:
815
816 >>> sanitizer.sanitize_css(u'''
817 ... background: #fff;
818 ... color: #000;
819 ... width: e/**/xpression(alert("F"));
820 ... ''')
821 [u'background: #fff', u'color: #000', u'width: e xpression(alert("F"))']
822
823 :param text: the CSS text; this is expected to be `unicode` and to not
824 contain any character or numeric references
825 :return: a list of declarations that are considered safe
826 :rtype: `list`
827
828 """
829 decls = []
830 text = self._strip_css_comments(self._replace_unicode_escapes(text))
831 for decl in filter(None, text.split(';')):
832 decl = decl.strip()
833 if not decl:
834 continue
835 try:
836 prop, value = decl.split(':', 1)
837 except ValueError:
838 continue
839 if not self.is_safe_css(prop.strip().lower(), value.strip()):
840 continue
841 if not self._EXPRESSION_SEARCH(decl) and \
842 all(self._is_safe_origin(match.group(1))
843 for match in self._URL_FINDITER(decl)):
844 decls.append(decl.strip())
845 return decls
846
847 _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
848 _UNICODE_ESCAPE = re.compile(
849 r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""",
850 re.UNICODE).sub
851
855
857 def _repl(match):
858 t = match.group(1)
859 if t:
860 code = int(t, 16)
861 chr = _unichr(code)
862 if code <= 0x1f:
863
864
865 chr = ' '
866 elif chr == '\\':
867 chr = r'\\'
868 return chr
869 t = match.group(2)
870 if t == '\\':
871 return r'\\'
872 else:
873 return t
874 return self._UNICODE_ESCAPE(_repl,
875 self._NORMALIZE_NEWLINES('\n', text))
876
877 _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub
878
884
885
887 """Helper base class used for cleaning up HTML riddled with ``<FONT
888 COLOR=...>`` tags, replacing them with appropriate ``<span
889 class="...">``.
890
891 The subclass must define a `rules()` static method returning a
892 list of regular expression fragments, each defining a capture
893 group in which the name will be reused for the span's class. Two
894 special group names, ``font`` and ``endfont`` are used to emit
895 ``<span>`` and ``</span>``, respectively.
896
897 """
899 self = object.__new__(cls)
900 if not hasattr(cls, '_compiled_rules'):
901 cls._compiled_rules = re.compile('(?:%s)' % '|'.join(cls.rules()))
902 self._compiled_rules = cls._compiled_rules
903 return self
904
907
909 for mtype, match in fullmatch.groupdict().items():
910 if match:
911 if mtype == 'font':
912 return '<span>'
913 elif mtype == 'endfont':
914 return '</span>'
915 return '<span class="code-%s">' % mtype
916
917
993
994
1014
1016 """Sanitize parsed HTML using TracHTMLSanitizer."""
1017
1019 HTMLTransform.__init__(self, out)
1020 self.sanitizer = sanitizer
1021 self.waiting_for = None
1022
1024 if self.waiting_for:
1025 return
1026 if not self.sanitizer.is_safe_elem(tag, attrs):
1027 self.waiting_for = tag
1028 return
1029
1030 new_attrs = self.sanitizer.sanitize_attrs(tag, dict(attrs))
1031 html_attrs = ''.join(' %s="%s"' % (name, escape(value))
1032 for name, value in new_attrs.iteritems())
1033 self._write('<%s%s%s>' % (tag, html_attrs, startend))
1034
1036 if not self.waiting_for:
1037 self._handle_start(tag, attrs, '')
1038
1040 if not self.waiting_for:
1041 self._handle_start(tag, attrs, '/')
1042
1044 if not self.waiting_for:
1045 self._handle_charref(name)
1046
1048 if not self.waiting_for:
1049 self._handle_entityref(name)
1050
1053
1055 if not self.waiting_for:
1056 self._write('<!%s>' % data)
1057
1059 if not self.waiting_for:
1060 self._write('<?%s?>' % data.replace('?>', ''))
1061
1063 if not self.waiting_for:
1064 self._write(escape(data))
1065
1067 if self.waiting_for:
1068 if self.waiting_for == tag:
1069 self.waiting_for = None
1070 else:
1071 self._write('</' + tag + '>')
1072
1073
def plaintext(text, keeplinebreaks=True):
    """Return the textual content of some (X)HTML, without the markup.

    A `Fragment` is converted with its own `as_text()` method; any
    other input has its tags stripped and its entities replaced.

    >>> plaintext('<b>1 &lt; 2</b>')
    u'1 < 2'

    >>> plaintext(tag('1 ', tag.b('<'), ' 2'))
    u'1 < 2'

    >>> plaintext('''<b>1
    ... &lt;
    ... 2</b>''', keeplinebreaks=False)
    u'1 < 2'

    :param text: `unicode` or `Fragment`
    :param keeplinebreaks: when false, each newline in the result is
                           replaced by a single space

    """
    # A lazily evaluated value is resolved before its type is examined.
    if LazyProxy and isinstance(text, LazyProxy):
        text = text.value
    if isinstance(text, Fragment):
        extracted = text.as_text()
    else:
        extracted = stripentities(striptags(text))
    return extracted if keeplinebreaks else extracted.replace(u'\n', u' ')
1101
1102
1104 """Return the first element in the fragment having the given
1105 attribute, class or tag, using a preorder depth-first search.
1106
1107 """
1108 if LazyProxy and isinstance(frag, LazyProxy):
1109 frag = frag.value
1110 if isinstance(frag, Element):
1111 if attr is not None and attr in frag.attrib:
1112 return frag
1113 if cls is not None and cls in frag.attrib.get('class', '').split():
1114 return frag
1115 if tag is not None and tag == frag.tag:
1116 return frag
1117 if isinstance(frag, Fragment):
1118 for child in frag.children:
1119 elt = find_element(child, attr, cls, tag)
1120 if elt is not None:
1121 return elt
1122
1123
1125 """Whether the given uri is a safe cross-origin."""
1126 if not uri or ':' not in uri and not uri.startswith('//'):
1127 return True
1128 if any(safe == '*' for safe in safe_origins):
1129 return True
1130 if uri.startswith('//') and req:
1131 uri = '%s:%s' % (req.scheme, uri)
1132
1133 normalize_re = re.compile(r'(?:[a-zA-Z][-a-zA-Z0-9+._]*:)?//[^/]+$')
1134
1135 def normalize_uri(uri):
1136 if normalize_re.match(uri):
1137 uri += '/'
1138 return uri
1139
1140 uri = normalize_uri(uri)
1141 for safe in safe_origins:
1142 safe = normalize_uri(safe)
1143 if safe == uri:
1144 return True
1145 if safe.endswith(':') and uri.startswith(safe):
1146 return True
1147 if uri.startswith(safe if safe.endswith('/') else safe + '/'):
1148 return True
1149 return False
1150
1151
1153 """Convert input to a `Fragment` object."""
1154
1155 while isinstance(input, TracError) or \
1156 isinstance(input, Exception) and len(input.args) == 1:
1157 input = input.args[0]
1158 if LazyProxy and isinstance(input, LazyProxy):
1159 input = input.value
1160 if isinstance(input, Fragment):
1161 return input
1162 return tag(to_unicode(input))
1163
1164
1165
# The 256 single-character strings chr(0)..chr(255), in order, i.e. an
# identity table.
# NOTE(review): presumably the ``table`` argument of a byte-string
# ``translate()`` call -- confirm against the caller.
_translate_nop = ''.join(map(chr, xrange(256)))

# Every control character below 0x20 except TAB (0x09), LF (0x0a) and
# CR (0x0d).
_invalid_control_chars = ''.join(
    chr(code) for code in xrange(0x20) if code not in (0x09, 0x0a, 0x0d))
1169
1172
1173
1174 if sys.maxunicode > 0xffff:
1175 _unichr = unichr
1176 else:
1178 try:
1179 return unichr(codepoint)
1180 except ValueError:
1181 if not (0 <= codepoint <= 0x10ffff):
1182 raise
1183 s = r'\U%08x' % codepoint
1184 try:
1185 return s.decode('unicode-escape')
1186 except Exception as e:
1187 raise ValueError(e)
1188
1189
# Matches a complete reference, always terminated by ``;``: hexadecimal
# (``&#x...;``), decimal (``&#...;``), or named with 1 to 8 word
# characters (``&name;``).
_reference_re = re.compile(
    r'&(?:#[xX][0-9a-fA-F]+|#[0-9]+|\w{1,8});'
)
1191
1193 """This is to avoid an issue which HTMLParser.unescape() raises
1194 ValueError or OverflowError from unichr() when character reference
1195 with a large integer in the attribute.
1196 """
1197
1198 def repl(match):
1199 match = match.group(0)
1200 name = match[1:-1]
1201 if name.startswith(('#x', '#X')):
1202 codepoint = int(name[2:], 16)
1203 elif name.startswith('#'):
1204 codepoint = int(name[1:])
1205 else:
1206 try:
1207 codepoint = _name2codepoint[name]
1208 except KeyError:
1209 return match
1210 if 0 <= codepoint <= 0x10ffff:
1211 return _unichr(codepoint)
1212 else:
1213 return match
1214
1215 return _reference_re.sub(repl, s)
1216
1217
1218 if genshi:
1220
1222 fixed_attrib = [(QName(name), name if value is None else value)
1223 for name, value in attrib]
1224 self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
1225 if tag in self._EMPTY_ELEMS:
1226 self._enqueue(END, QName(tag))
1227 else:
1228 self._open_tags.append(tag)
1229
1231 if name.startswith(('x', 'X')):
1232 codepoint = int(name[1:], 16)
1233 else:
1234 codepoint = int(name)
1235 if 0 <= codepoint <= 0x10ffff:
1236 text = _unichr(codepoint)
1237 else:
1238 text = '&#%s;' % name
1239 self._enqueue(TEXT, text)
1240
1242 text = None
1243 try:
1244 codepoint = _name2codepoint[name]
1245 except KeyError:
1246 pass
1247 else:
1248 if 0 <= codepoint <= 0x10ffff:
1249 text = _unichr(codepoint)
1250 self._enqueue(TEXT, text or '&%s;' % name)
1251
1253 return _html_parser_unescape(s)
1254
1255
def HTML(text, encoding=None):
    """Parse `text` with `GenshiHTMLParserFixup` and return a `Stream`.

    :param text: the markup to parse; a `unicode` object is read
                 through a text buffer, anything else through a
                 bytes buffer
    :param encoding: passed on to the parser for non-`unicode`
                     input; forced to `None` for `unicode` input
    :return: a `Stream` built from the fully collected parse events
    """
    if isinstance(text, unicode):
        source, encoding = io.StringIO(text), None
    else:
        source = io.BytesIO(text)
    # The parser is exhausted here, so any parse error surfaces in this
    # call rather than later, while the stream is being consumed.
    events = list(GenshiHTMLParserFixup(source, encoding=encoding))
    return Stream(events)
1264
1265
1267 """A Genshi stream filter for expanding `genshi.Markup` events.
1268
1269 :deprecated: will be removed in Trac 1.5.1.
1270
1271 Note: Expansion may not be possible if the fragment is badly
1272 formed, or partial.
1273
1274 """
1275 for event in stream:
1276 if isinstance(event[1], Markup):
1277 try:
1278 for subevent in HTML(event[1]):
1279 yield subevent
1280 except ParseError:
1281 yield event
1282 else:
1283 yield event
1284 else:
1285 expand_markup = None
1286