
Source Code for Module trac.util.text

# -*- coding: utf-8 -*-
#
# Copyright (C) 2003-2023 Edgewall Software
# Copyright (C) 2003-2004 Jonas Borgström <[email protected]>
# Copyright (C) 2006 Matthew Good <[email protected]>
# Copyright (C) 2005-2006 Christian Boos <[email protected]>
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at https://trac.edgewall.org/wiki/TracLicense.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at https://trac.edgewall.org/log/.
#
# Author: Jonas Borgström <[email protected]>
#         Matthew Good <[email protected]>
#         Christian Boos <[email protected]>

import __builtin__
import locale
import os
import re
import sys
import textwrap
from urllib import quote, quote_plus, unquote
from unicodedata import east_asian_width

import jinja2

CRLF = '\r\n'

class Empty(unicode):
    """A special tag object evaluating to the empty string"""
    __slots__ = []

empty = Empty()

del Empty  # shouldn't be used outside of Trac core


# -- Jinja2

def jinja2env(**kwargs):
    """Creates a Jinja2 ``Environment`` configured with Trac conventions.

    All default parameters can optionally be overridden. The ``loader``
    parameter is not set by default, so unless it is set by the
    caller, only inline templates can be created from the environment.

    :rtype: `jinja2.Environment`

    """
    exts = ('.html', '.rss', '.xml')
    def filterout_none(v):
        return '' if v is None else v
    def autoescape_extensions(template):
        return template and template.endswith(exts)
    defaults = dict(
        variable_start_string='${',
        variable_end_string='}',
        line_statement_prefix='#',
        line_comment_prefix='##',
        trim_blocks=True,
        lstrip_blocks=True,
        extensions=['jinja2.ext.do', 'jinja2.ext.i18n', 'jinja2.ext.with_'],
        finalize=filterout_none,
        autoescape=autoescape_extensions,
    )
    defaults.update(kwargs)
    jenv = jinja2.Environment(**defaults)
    jenv.globals.update(
        len=len,
    )
    return jenv

def jinja2template(template, text=False, **kwargs):
    """Creates a Jinja2 ``Template`` from inlined source.

    :param template: the template content
    :param text: if set to `False`, the result of the variable
                 expansion will be XML/HTML escaped
    :param kwargs: additional arguments to pass to `jinja2env`. See
                   `jinja2.Environment` for supported arguments.
    """
    return jinja2env(autoescape=not text, **kwargs).from_string(template)
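
# Usage sketch (illustrative, not part of the original module): with the Trac
# conventions set up by `jinja2env`, variables are written as ${...} and line
# statements start with '#'.  Assumes a Jinja2 version that still provides
# jinja2.ext.with_:
#
#   >>> t = jinja2template(u'Hello ${name}!', text=True)
#   >>> t.render(name='World')
#   u'Hello World!'
#
# With the default ``text=False`` the output is auto-escaped, so rendering
# ``name='<b>'`` would yield ``u'Hello &lt;b&gt;!'``.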


# -- Unicode

def to_unicode(text, charset=None):
    """Convert input to an `unicode` object.

    For a `str` object, we'll first try to decode the bytes using the given
    `charset` encoding (or UTF-8 if none is specified), then we fall back to
    the latin1 encoding which might be correct or not, but at least preserves
    the original byte sequence by mapping each byte to the corresponding
    unicode code point in the range U+0000 to U+00FF.

    For anything else, a simple `unicode()` conversion is attempted,
    with special care taken with `Exception` objects.
    """
    if isinstance(text, str):
        try:
            return unicode(text, charset or 'utf-8')
        except UnicodeDecodeError:
            return unicode(text, 'latin1')
    elif isinstance(text, Exception):
        if os.name == 'nt' and isinstance(text, EnvironmentError):
            strerror = text.strerror
            filename = text.filename
            if isinstance(strerror, basestring) and \
                    isinstance(filename, basestring):
                try:
                    if not isinstance(strerror, unicode):
                        strerror = unicode(strerror, 'mbcs')
                    if not isinstance(filename, unicode):
                        filename = unicode(filename, 'mbcs')
                except UnicodeError:
                    pass
                else:
                    if isinstance(text, WindowsError):
                        return u"[Error %s] %s: '%s'" % (text.winerror,
                                                         strerror, filename)
                    else:
                        return u"[Errno %s] %s: '%s'" % (text.errno, strerror,
                                                         filename)
            # the exception might have a localized error string encoded with
            # ANSI codepage if OSError and IOError on Windows
            try:
                return unicode(str(text), 'mbcs')
            except UnicodeError:
                pass
        # two possibilities for storing unicode strings in exception data:
        try:
            # custom __str__ method on the exception (e.g. PermissionError)
            return unicode(text)
        except UnicodeError:
            # unicode arguments given to the exception (e.g. parse_date)
            return ' '.join(to_unicode(arg) for arg in text.args)
    return unicode(text)
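
# Usage sketch (illustrative, not from the Trac sources): byte strings are
# decoded as UTF-8 first, with a latin1 fallback that never raises:
#
#   >>> to_unicode('caf\xc3\xa9')        # valid UTF-8 bytes
#   u'caf\xe9'
#   >>> to_unicode('caf\xe9')            # not UTF-8, falls back to latin1
#   u'caf\xe9'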

def exception_to_unicode(e, traceback=False):
    """Convert an `Exception` to an `unicode` object.

    In addition to `to_unicode`, this representation of the exception
    also contains the class name and optionally the traceback.
    """
    message = '%s: %s' % (e.__class__.__name__, to_unicode(e))
    if traceback:
        from trac.util import get_last_traceback
        traceback_only = get_last_traceback().split('\n')[:-2]
        message = '\n%s\n%s' % (to_unicode('\n'.join(traceback_only)), message)
    return message

def path_to_unicode(path):
    """Convert a filesystem path to unicode, using the filesystem encoding."""
    if isinstance(path, str):
        try:
            return unicode(path, sys.getfilesystemencoding())
        except UnicodeDecodeError:
            return unicode(path, 'latin1')
    return unicode(path)


_ws_leading_re = re.compile(u'\\A[\\s\u200b]+', re.UNICODE)
_ws_trailing_re = re.compile(u'[\\s\u200b]+\\Z', re.UNICODE)

def stripws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    if leading:
        text = _ws_leading_re.sub('', text)
    if trailing:
        text = _ws_trailing_re.sub('', text)
    return text
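
# Usage sketch (illustrative, not from the Trac sources): both regular
# whitespace and zero-width spaces (U+200B) are stripped:
#
#   >>> stripws(u'\u200b  text  \u200b')
#   u'text'
#   >>> stripws(u'  text  ', trailing=False)
#   u'text  '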

def strip_line_ws(text, leading=True, trailing=True):
    """Strips unicode white-spaces and ZWSPs from each line of ``text``.

    :param leading: strips leading spaces from ``text`` unless ``leading`` is
                    `False`.
    :param trailing: strips trailing spaces from ``text`` unless ``trailing``
                     is `False`.
    """
    lines = re.compile(r'(\n|\r\n|\r)').split(text)
    if leading:
        lines[::2] = (_ws_leading_re.sub('', line) for line in lines[::2])
    if trailing:
        lines[::2] = (_ws_trailing_re.sub('', line) for line in lines[::2])
    return ''.join(lines)


_js_quote = {'\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f',
             '\n': '\\n', '\r': '\\r', '\t': '\\t', "'": "\\'"}
for i in list(xrange(0x20)) + [ord(c) for c in u'&<>\u2028\u2029']:
    _js_quote.setdefault(unichr(i), '\\u%04x' % i)
_js_quote_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t\'&<>' + u'\u2028\u2029]')
_js_string_re = re.compile(r'[\x00-\x1f\\"\b\f\n\r\t&<>' + u'\u2028\u2029]')


def javascript_quote(text):
    """Quote strings for inclusion in single or double quote delimited
    Javascript strings
    """
    if not text:
        return ''
    def replace(match):
        return _js_quote[match.group(0)]
    return _js_quote_re.sub(replace, text)

def to_js_string(text):
    """Embed the given string in a double quote delimited Javascript string
    (conforming to the JSON spec)
    """
    if not text:
        return '""'
    def replace(match):
        return _js_quote[match.group(0)]
    return '"%s"' % _js_string_re.sub(replace, text)
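
# Usage sketch (illustrative, not from the Trac sources): `javascript_quote`
# escapes the content of a JS string literal (including single quotes), while
# `to_js_string` also adds the surrounding double quotes, JSON-style:
#
#   >>> javascript_quote(u'"hi"\n')
#   u'\\"hi\\"\\n'
#   >>> to_js_string(u'a<b')
#   u'"a\\u003cb"'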

def unicode_quote(value, safe='/'):
    """A unicode aware version of `urllib.quote`

    :param value: anything that converts to a `str`. If `unicode`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote`, the characters that would otherwise be
                 quoted but shouldn't here (defaults to '/')
    """
    return quote(value.encode('utf-8') if isinstance(value, unicode)
                 else str(value), safe)

def unicode_quote_plus(value, safe=''):
    """A unicode aware version of `urllib.quote_plus`.

    :param value: anything that converts to a `str`. If `unicode`
                  input is given, it will be UTF-8 encoded.
    :param safe: as in `quote_plus`, the characters that would
                 otherwise be quoted but shouldn't here (defaults to
                 '')
    """
    return quote_plus(value.encode('utf-8') if isinstance(value, unicode)
                      else str(value), safe)

def unicode_unquote(value):
    """A unicode aware version of `urllib.unquote`.

    :param value: UTF-8 encoded `str` value (for example, as obtained by
                  `unicode_quote`).
    :rtype: `unicode`
    """
    return unquote(value).decode('utf-8')

def unicode_urlencode(params, safe=''):
    """A unicode aware version of `urllib.urlencode`.

    Values set to `empty` are converted to the key alone, without the
    equal sign.
    """
    if isinstance(params, dict):
        params = params.iteritems()
    l = []
    for k, v in params:
        if v is empty:
            l.append(unicode_quote_plus(k, safe))
        else:
            l.append(unicode_quote_plus(k, safe) + '=' +
                     unicode_quote_plus(v, safe))
    return '&'.join(l)


_qs_quote_safe = ''.join(chr(c) for c in xrange(0x21, 0x7f))
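
# Usage sketch (illustrative, not from the Trac sources): unicode input is
# UTF-8 encoded before percent-encoding:
#
#   >>> unicode_quote(u'r\xe9sum\xe9/cv')
#   'r%C3%A9sum%C3%A9/cv'
#   >>> unicode_quote_plus(u'a b&c')
#   'a+b%26c'
#   >>> unicode_urlencode([('q', u'caf\xe9'), ('all', empty)])
#   'q=caf%C3%A9&all'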

def quote_query_string(text):
    """Quote strings for use in a query string."""
    return unicode_quote_plus(text, _qs_quote_safe)

def to_utf8(text, charset='latin1'):
    """Convert input to a UTF-8 `str` object.

    If the input is not an `unicode` object, we assume the encoding is
    already UTF-8, ISO Latin-1, or as specified by the optional
    *charset* parameter.
    """
    if isinstance(text, str):
        try:
            u = unicode(text, 'utf-8')
        except UnicodeError:
            try:
                # Use the user supplied charset if possible
                u = unicode(text, charset)
            except UnicodeError:
                # This should always work
                u = unicode(text, 'latin1')
        else:
            # Do nothing if it's already utf-8
            return text
    else:
        u = to_unicode(text)
    return u.encode('utf-8')
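
# Usage sketch (illustrative, not from the Trac sources):
#
#   >>> to_utf8(u'caf\xe9')
#   'caf\xc3\xa9'
#   >>> to_utf8('caf\xe9')        # latin1 bytes are re-encoded as UTF-8
#   'caf\xc3\xa9'
#   >>> to_utf8('caf\xc3\xa9')    # already UTF-8, returned unchanged
#   'caf\xc3\xa9'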

class unicode_passwd(unicode):
    """Conceal the actual content of the string when `repr` is called."""
    def __repr__(self):
        return '*******'

def stream_encoding(stream):
    """Return the appropriate encoding for the given stream."""
    encoding = getattr(stream, 'encoding', None)
    # Windows returns 'cp0' to indicate no encoding
    return encoding if encoding not in (None, 'cp0') else 'utf-8'

def console_print(out, *args, **kwargs):
    """Output the given arguments to the console, encoding the output
    as appropriate.

    :param kwargs: ``newline`` controls whether a newline will be appended
                   (defaults to `True`)
    """
    cons_charset = stream_encoding(out)
    out.write(' '.join(to_unicode(a).encode(cons_charset, 'replace')
                       for a in args))
    if kwargs.get('newline', True):
        out.write('\n')

def printout(*args, **kwargs):
    """Do a `console_print` on `sys.stdout`."""
    console_print(sys.stdout, *args, **kwargs)

def printerr(*args, **kwargs):
    """Do a `console_print` on `sys.stderr`."""
    console_print(sys.stderr, *args, **kwargs)

def printfout(message, *args, **kwargs):
    """Format `message`, do a `console_print` on `sys.stdout` and flush
    the buffer.
    """
    if args:
        message %= args
    printout(message, **kwargs)
    sys.stdout.flush()

def printferr(message, *args, **kwargs):
    """Format `message`, do a `console_print` on `sys.stderr` and flush
    the buffer.
    """
    if args:
        message %= args
    printerr(message, **kwargs)
    sys.stderr.flush()

def raw_input(prompt):
    """Input one line from the console and convert it to unicode as
    appropriate.
    """
    printout(prompt, newline=False)
    return to_unicode(__builtin__.raw_input(), sys.stdin.encoding)


_preferredencoding = locale.getpreferredencoding()

def getpreferredencoding():
    """Return the user's preferred encoding, as retrieved once at import
    time.

    Use this instead of `locale.getpreferredencoding()`, which is not
    thread-safe.
    """
    return _preferredencoding


# -- Plain text formatting

def text_width(text, ambiwidth=1):
    """Determine the column width of `text` in Unicode characters.

    Characters in the East Asian Fullwidth (F) or East Asian Wide (W)
    categories have a column width of 2; characters in the East Asian
    Halfwidth (H) or East Asian Narrow (Na) categories have a column
    width of 1.

    The `ambiwidth` parameter sets the column width of the East Asian
    Ambiguous (A) characters. If `1`, they get the same width as
    US-ASCII characters, which is what most users expect. If `2`, they
    get twice the width of US-ASCII characters, which is what CJK users
    expect.

    cf. http://www.unicode.org/reports/tr11/.
    """
    twice = 'FWA' if ambiwidth == 2 else 'FW'
    return sum([2 if east_asian_width(chr) in twice else 1
                for chr in to_unicode(text)])
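
# Usage sketch (illustrative, not from the Trac sources):
#
#   >>> text_width(u'abc')
#   3
#   >>> text_width(u'\u3042\u3044')          # Hiragana, East Asian Wide
#   4
#   >>> text_width(u'\xb1', ambiwidth=2)     # PLUS-MINUS SIGN, Ambiguous
#   2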

def _get_default_ambiwidth():
    """Return the column width of East Asian Ambiguous characters, based
    on the locale environment variables or the Windows console codepage.
    """
    if os.name == 'nt':
        import ctypes
        codepage = ctypes.windll.kernel32.GetConsoleOutputCP()
        if codepage in (932,   # Japanese (Shift-JIS)
                        936,   # Chinese Simplified (GB2312)
                        949,   # Korean (Unified Hangul Code)
                        950):  # Chinese Traditional (Big5)
            return 2
    else:
        for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            value = os.environ.get(name) or ''
            if value:
                if name == 'LANGUAGE' and ':' in value:
                    value = value.split(':')[0]
                return 2 if value.lower().startswith(('zh', 'ja', 'ko')) else 1

    return 1


_default_ambiwidth = _get_default_ambiwidth()


def print_table(data, headers=None, sep='  ', out=None, ambiwidth=None):
    # The signature, docstring and argument handling of print_table are
    # missing from this listing; the setup below is a minimal, assumed
    # reconstruction providing the names used by the body that follows
    # (`out`, `charset`, `ambiwidth`, `to_text`).
    if out is None:
        out = sys.stdout
    charset = getattr(out, 'encoding', None) or 'utf-8'
    if ambiwidth is None:
        ambiwidth = _default_ambiwidth
    data = list(data)
    if headers:
        data.insert(0, headers)
    elif not data:
        return

    def to_text(val):
        # assumed helper: render None as an empty string
        return u'' if val is None else to_unicode(val)

    def tw(text):
        return text_width(text, ambiwidth=ambiwidth)

    def to_lines(data):
        lines = []
        for row in data:
            row = [to_text(cell) for cell in row]
            if any('\n' in cell for cell in row):
                row = [cell.splitlines() for cell in row]
                max_lines = max(len(cell) for cell in row)
                for cell in row:
                    if len(cell) < max_lines:
                        cell += [''] * (max_lines - len(cell))
                lines.extend([cell[idx] for cell in row]
                             for idx in xrange(max_lines))
            else:
                lines.append(row)
        return lines

    data = to_lines(data)

    num_cols = len(data[0])
    col_width = [max(tw(row[idx]) for row in data)
                 for idx in xrange(num_cols)]

    out.write('\n')
    for ridx, row in enumerate(data):
        for cidx, cell in enumerate(row):
            if cidx + 1 == num_cols:
                line = cell  # No separator after last column
            else:
                if headers and ridx == 0:
                    sp = ' ' * tw(sep)  # No separator in header
                else:
                    sp = sep
                line = u'%-*s%s' % (col_width[cidx] - tw(cell) + len(cell),
                                    cell, sp)
            line = line.encode(charset, 'replace')
            out.write(line)

        out.write('\n')
        if ridx == 0 and headers:
            out.write('-' * (tw(sep) * cidx + sum(col_width)))
            out.write('\n')
    out.write('\n')

def shorten_line(text, maxlen=75):
    """Truncates `text` to length less than or equal to `maxlen` characters.

    This tries to be (a bit) clever and attempts to find a proper word
    boundary for doing so.
    """
    if len(text or '') <= maxlen:
        return text
    suffix = ' ...'
    maxtextlen = maxlen - len(suffix)
    cut = max(text.rfind(' ', 0, maxtextlen), text.rfind('\n', 0, maxtextlen))
    if cut < 0:
        cut = maxtextlen
    return text[:cut] + suffix
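
# Usage sketch (illustrative, not from the Trac sources): the cut happens at
# the last space or newline before the limit, and ' ...' is appended:
#
#   >>> shorten_line(u'the quick brown fox jumps over the lazy dog', 20)
#   u'the quick brown ...'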

class UnicodeTextWrapper(textwrap.TextWrapper):
    breakable_char_ranges = [
        (0x1100, 0x11FF),   # Hangul Jamo
        (0x2E80, 0x2EFF),   # CJK Radicals Supplement
        (0x3000, 0x303F),   # CJK Symbols and Punctuation
        (0x3040, 0x309F),   # Hiragana
        (0x30A0, 0x30FF),   # Katakana
        (0x3130, 0x318F),   # Hangul Compatibility Jamo
        (0x3190, 0x319F),   # Kanbun
        (0x31C0, 0x31EF),   # CJK Strokes
        (0x3200, 0x32FF),   # Enclosed CJK Letters and Months
        (0x3300, 0x33FF),   # CJK Compatibility
        (0x3400, 0x4DBF),   # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),   # CJK Unified Ideographs
        (0xA960, 0xA97F),   # Hangul Jamo Extended-A
        (0xAC00, 0xD7AF),   # Hangul Syllables
        (0xD7B0, 0xD7FF),   # Hangul Jamo Extended-B
        (0xF900, 0xFAFF),   # CJK Compatibility Ideographs
        (0xFE30, 0xFE4F),   # CJK Compatibility Forms
        (0xFF00, 0xFFEF),   # Halfwidth and Fullwidth Forms
        (0x20000, 0x2FFFF, u'[\uD840-\uD87F][\uDC00-\uDFFF]'),  # Plane 2
        (0x30000, 0x3FFFF, u'[\uD880-\uD8BF][\uDC00-\uDFFF]'),  # Plane 3
    ]

    split_re = None
    breakable_re = None

    @classmethod
    def _init_patterns(cls):
        char_ranges = []
        surrogate_pairs = []
        for val in cls.breakable_char_ranges:
            try:
                low = unichr(val[0])
                high = unichr(val[1])
                char_ranges.append(u'%s-%s' % (low, high))
            except ValueError:
                # Narrow build, `re` cannot use characters >= 0x10000
                surrogate_pairs.append(val[2])
        char_ranges = u''.join(char_ranges)
        if surrogate_pairs:
            pattern = u'(?:[%s]|%s)+' % (char_ranges,
                                         u'|'.join(surrogate_pairs))
        else:
            pattern = u'[%s]+' % char_ranges

        cls.split_re = re.compile(
            r'(\s+|' +                                  # any whitespace
            pattern + u'|' +                            # breakable text
            r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' +   # hyphenated words
            r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))',     # em-dash
            re.UNICODE)
        cls.breakable_re = re.compile(r'\A' + pattern, re.UNICODE)

    def __init__(self, cols, replace_whitespace=0, break_long_words=0,
                 initial_indent='', subsequent_indent='', ambiwidth=1):
        textwrap.TextWrapper.__init__(
            self, cols, replace_whitespace=0, break_long_words=0,
            initial_indent=initial_indent,
            subsequent_indent=subsequent_indent)
        self.ambiwidth = ambiwidth
        if self.split_re is None:
            self._init_patterns()

    def _split(self, text):
        chunks = self.split_re.split(to_unicode(text))
        chunks = filter(None, chunks)
        return chunks

    def _text_width(self, text):
        return text_width(text, ambiwidth=self.ambiwidth)

    def _wrap_chunks(self, chunks):
        lines = []
        chunks.reverse()
        text_width = self._text_width

        while chunks:
            cur_line = []
            cur_width = 0

            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent
            width = self.width - text_width(indent)

            if chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk = chunks[-1]
                w = text_width(chunk)
                if cur_width + w <= width:
                    cur_line.append(chunks.pop())
                    cur_width += w
                elif self.breakable_re.match(chunk):
                    left_space = width - cur_width
                    for i in xrange(len(chunk)):
                        w = text_width(chunk[i])
                        if left_space < w:
                            break
                        left_space -= w
                    if i > 0:
                        cur_line.append(chunk[:i])
                        chunk = chunk[i:]
                        chunks[-1] = chunk
                        w = text_width(chunk)
                    break
                else:
                    break

            if chunks and w > width:
                self._handle_long_word(chunks, cur_line, cur_width, width)

            if cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

def wrap(t, cols=75, initial_indent='', subsequent_indent='',
         linesep=os.linesep, ambiwidth=1):
    """Wraps the single paragraph in `t`, which contains unicode characters.
    Every line is at most `cols` characters long.

    The `ambiwidth` parameter sets the column width of the East Asian
    Ambiguous (A) characters. If `1`, they get the same width as US-ASCII
    characters, which is what most users expect. If `2`, they get twice
    the width of US-ASCII characters, which is what CJK users expect.
    """
    t = t.strip().replace('\r\n', '\n').replace('\r', '\n')
    wrapper = UnicodeTextWrapper(cols, replace_whitespace=0,
                                 break_long_words=0,
                                 initial_indent=initial_indent,
                                 subsequent_indent=subsequent_indent,
                                 ambiwidth=ambiwidth)
    wrappedLines = []
    for line in t.split('\n'):
        wrappedLines += wrapper.wrap(line.rstrip()) or ['']
    return linesep.join(wrappedLines)
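
# Usage sketch (illustrative, not from the Trac sources):
#
#   >>> wrap(u'The quick brown fox jumps over the lazy dog', cols=20,
#   ...      linesep='\n')
#   u'The quick brown fox\njumps over the lazy\ndog'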


_obfuscation_char = u'@\u2026'

def obfuscate_email_address(address):
    """Replace anything looking like an e-mail address (``'@something'``)
    with a trailing ellipsis (``'@…'``)
    """
    if address:
        at = address.find('@')
        if at != -1:
            return address[:at] + _obfuscation_char + \
                   ('>' if address[-1] == '>' else '')
    return address

def is_obfuscated(word):
    """Returns `True` if the `word` looks like an obfuscated e-mail
    address.

    :since: 1.2
    """
    return _obfuscation_char in word
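
# Usage sketch (illustrative, not from the Trac sources):
#
#   >>> obfuscate_email_address(u'joe@example.com')
#   u'joe@\u2026'
#   >>> is_obfuscated(u'joe@\u2026')
#   True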

def breakable_path(path):
    """Make a path breakable after path separators, and conversely, avoid
    breaking at spaces.
    """
    if not path:
        return path
    prefix = ''
    if path.startswith('/'):    # Avoid breaking after a leading /
        prefix = '/'
        path = path[1:]
    return prefix + path.replace('/', u'/\u200b').replace('\\', u'\\\u200b') \
                        .replace(' ', u'\u00a0')

def normalize_whitespace(text, to_space=u'\u00a0', remove=u'\u200b'):
    """Normalize whitespace in a string, by replacing special spaces by
    normal spaces and removing zero-width spaces."""
    if not text:
        return text
    for each in to_space:
        text = text.replace(each, ' ')
    for each in remove:
        text = text.replace(each, '')
    return text
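
# Usage sketch (illustrative, not from the Trac sources): `breakable_path`
# inserts zero-width spaces (U+200B) after separators and replaces spaces
# with non-breaking spaces; `normalize_whitespace` undoes both:
#
#   >>> breakable_path(u'/wiki/Sub Page')
#   u'/wiki/\u200bSub\xa0Page'
#   >>> normalize_whitespace(u'/wiki/\u200bSub\xa0Page')
#   u'/wiki/Sub Page'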

def unquote_label(txt):
    """Remove (one level of) enclosing single or double quotes.

    .. versionadded:: 1.0
    """
    return txt[1:-1] if txt and txt[0] in "'\"" and txt[0] == txt[-1] else txt

def cleandoc(message):
    """Removes uniform indentation and leading/trailing whitespace."""
    from inspect import cleandoc
    return cleandoc(message).strip()


# -- Conversion

def pretty_size(size, format='%.1f'):
    """Pretty print content size information with appropriate unit.

    :param size: number of bytes
    :param format: can be used to adjust the precision shown
    """
    if size is None:
        return ''

    jump = 1024
    if size < jump:
        from trac.util.translation import ngettext
        return ngettext("%(num)d byte", "%(num)d bytes", num=size)

    units = ['KB', 'MB', 'GB', 'TB']
    i = 0
    while size >= jump and i < len(units):
        i += 1
        size /= 1024.

    return (format + ' %s') % (size, units[i - 1])
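
# Usage sketch (illustrative, not from the Trac sources; byte counts below
# 1024 go through ngettext, so their wording depends on the active
# translation catalog):
#
#   >>> pretty_size(1024)
#   '1.0 KB'
#   >>> pretty_size(1536, '%.2f')
#   '1.50 KB'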

def expandtabs(s, tabstop=8, ignoring=None):
    """Expand tab characters `'\\\\t'` into spaces.

    :param tabstop: number of space characters per tab
                    (defaults to the canonical 8)

    :param ignoring: if not `None`, the expansion will be "smart" and
                     go from one tabstop to the next. In addition,
                     this parameter lists characters which can be
                     ignored when computing the indent.
    """
    if '\t' not in s:
        return s
    if ignoring is None:
        return s.expandtabs(tabstop)

    outlines = []
    for line in s.split('\n'):
        if '\t' not in line:
            outlines.append(line)
            continue
        p = 0
        s = []
        for c in line:
            if c == '\t':
                n = tabstop - p % tabstop
                s.append(' ' * n)
                p += n
            elif not ignoring or c not in ignoring:
                p += 1
                s.append(c)
            else:
                s.append(c)
        outlines.append(''.join(s))
    return '\n'.join(outlines)
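
# Usage sketch (illustrative, not from the Trac sources): with ``ignoring``,
# the listed characters do not advance the column used for tab expansion:
#
#   >>> expandtabs('a\tb', tabstop=4)
#   'a   b'
#   >>> expandtabs('>\tfoo', tabstop=4, ignoring='>')
#   '>    foo'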

def fix_eol(text, eol):
    """Fix end-of-lines in a text."""
    lines = text.splitlines()
    lines.append('')
    return eol.join(lines)

def unicode_to_base64(text, strip_newlines=True):
    """Safe conversion of ``text`` to base64 representation using
    utf-8 bytes.

    Strips newlines from output unless ``strip_newlines`` is `False`.
    """
    text = to_unicode(text)
    if strip_newlines:
        return text.encode('utf-8').encode('base64').replace('\n', '')
    return text.encode('utf-8').encode('base64')

def unicode_from_base64(text):
    """Safe conversion of ``text`` to unicode based on utf-8 bytes."""
    return text.decode('base64').decode('utf-8')
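
# Usage sketch (illustrative, not from the Trac sources); these rely on the
# Python 2 'base64' codec:
#
#   >>> unicode_to_base64(u'caf\xe9')
#   'Y2Fmw6k='
#   >>> unicode_from_base64('Y2Fmw6k=')
#   u'caf\xe9'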


def levenshtein_distance(lhs, rhs):
    """Return the Levenshtein distance between two strings."""
    if len(lhs) > len(rhs):
        rhs, lhs = lhs, rhs
    if not lhs:
        return len(rhs)

    prev = xrange(len(rhs) + 1)
    for lidx, lch in enumerate(lhs):
        curr = [lidx + 1]
        for ridx, rch in enumerate(rhs):
            cost = (lch != rch) * 2
            curr.append(min(prev[ridx + 1] + 1,  # deletion
                            curr[ridx] + 1,      # insertion
                            prev[ridx] + cost))  # substitution
        prev = curr
    return prev[-1]
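
# Usage sketch (illustrative, not from the Trac sources): a substitution is
# weighted 2 (cost = (lch != rch) * 2), i.e. the same as a deletion plus an
# insertion:
#
#   >>> levenshtein_distance('cat', 'cats')
#   1
#   >>> levenshtein_distance('cat', 'cut')
#   2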


sub_vars_re = re.compile("[$]([A-Z_][A-Z0-9_]*)")

def sub_vars(text, args):
    """Substitute $XYZ-style variables in a string with provided values.

    :param text: string containing variables to substitute.
    :param args: dictionary with keys matching the variables to be
                 substituted. The keys should not be prefixed with the $
                 character."""
    def repl(match):
        key = match.group(1)
        return args[key] if key in args else '$' + key
    return sub_vars_re.sub(repl, text)
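
# Usage sketch (illustrative, not from the Trac sources): unknown variables
# are left untouched:
#
#   >>> sub_vars(u'Hello $NAME, see $OTHER', {'NAME': u'World'})
#   u'Hello World, see $OTHER'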