"""HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
distributed with the Python standard library, but are available from
http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

"""
| 27 |
|
|---|
# Public names exported by this module; the file-format-specific jars
# (LWPCookieJar, MozillaCookieJar) are included alongside the core classes.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
|
|---|
| 30 |
|
|---|
| 31 | import re, urlparse, copy, time, urllib
|
|---|
| 32 | try:
|
|---|
| 33 | import threading as _threading
|
|---|
| 34 | except ImportError:
|
|---|
| 35 | import dummy_threading as _threading
|
|---|
| 36 | import httplib # only for the default HTTP port
|
|---|
| 37 | from calendar import timegm
|
|---|
| 38 |
|
|---|
| 39 | debug = False # set to True to enable debugging via the logging module
|
|---|
| 40 | logger = None
|
|---|
| 41 |
|
|---|
| 42 | def _debug(*args):
|
|---|
| 43 | if not debug:
|
|---|
| 44 | return
|
|---|
| 45 | global logger
|
|---|
| 46 | if not logger:
|
|---|
| 47 | import logging
|
|---|
| 48 | logger = logging.getLogger("cookielib")
|
|---|
| 49 | return logger.debug(*args)
|
|---|
| 50 |
|
|---|
| 51 |
|
|---|
# Default port (as a string) used when a request URL carries no explicit port.
DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
# Message used in errors raised when a filename is required but none was given.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
|
|---|
| 55 |
|
|---|
def _warn_unhandled_exception():
    """Emit a UserWarning carrying the traceback of the current exception.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  Warn if any
    exceptions are caught there.
    """
    # Imports are local so the common (no-bug) path pays no import cost.
    import warnings, traceback, StringIO
    f = StringIO.StringIO()
    traceback.print_exc(None, f)
    msg = f.getvalue()
    warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
|
|---|
| 65 |
|
|---|
| 66 |
|
|---|
| 67 | # Date/time conversion
|
|---|
| 68 | # -----------------------------------------------------------------------------
|
|---|
| 69 |
|
|---|
| 70 | EPOCH_YEAR = 1970
|
|---|
| 71 | def _timegm(tt):
|
|---|
| 72 | year, month, mday, hour, min, sec = tt[:6]
|
|---|
| 73 | if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
|
|---|
| 74 | (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
|
|---|
| 75 | return timegm(tt)
|
|---|
| 76 | else:
|
|---|
| 77 | return None
|
|---|
| 78 |
|
|---|
# Abbreviated weekday / month names used when formatting cookie dates.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copies, for case-insensitive month-name lookup while parsing.
MONTHS_LOWER = [month.lower() for month in MONTHS]
|
|---|
| 84 |
|
|---|
def time2isoz(t=None):
    """Format t (seconds since epoch; default: the current time) as UTC text.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and always represents
    Universal Time (UTC, aka GMT), e.g.:

        1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    tt = time.gmtime(t)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % tt[:6]
|
|---|
| 101 |
|
|---|
def time2netscape(t=None):
    """Format t (seconds since epoch; default: the current time) as a
    Netscape cookie date.

    The result looks like "Wed 09-Feb-1994 22:23:32 GMT" and always
    represents UTC.
    """
    # NOTE(review): the format string emits no comma after the weekday,
    # although the classic Netscape expires format shows one -- confirm
    # this is intended before changing it, since the output is preserved
    # here exactly as in the original.
    if t is None:
        t = time.time()
    tt = time.gmtime(t)
    return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[tt.tm_wday], tt.tm_mday, MONTHS[tt.tm_mon - 1],
        tt.tm_year, tt.tm_hour, tt.tm_min, tt.tm_sec)
|
|---|
| 117 |
|
|---|
| 118 |
|
|---|
# Timezone names treated as zero offset from UTC.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric timezone: optional sign, 1-2 digit hours, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")

def offset_from_tz_string(tz):
    """Return the offset of timezone string tz from UTC, in seconds.

    Known UTC aliases ("GMT", "UTC", "UT", "Z") give 0; numeric forms like
    "-0800" or "+01:00" are converted; anything else yields None.
    """
    if tz in UTC_ZONES:
        return 0
    m = TIMEZONE_RE.search(tz)
    if m is None:
        return None
    offset = 3600 * int(m.group(2))
    if m.group(3):
        offset += 60 * int(m.group(3))
    if m.group(1) == '-':
        offset = -offset
    return offset
|
|---|
| 135 |
|
|---|
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert broken-down date/time strings to seconds since epoch.

    All arguments are strings as captured by the date regexps (hr, min,
    sec and tz may be None).  Returns None when the month, the resulting
    time tuple, or the timezone string cannot be interpreted.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # find "obvious" year: choose the century that puts yr closest
        # to the current year (so "94" becomes 1994, not 2094)
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted);
    # the tz string rides along in the tuple but _timegm validates only the
    # first six fields
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
|
|---|
| 188 |
|
|---|
# A strictly RFC-1123-conforming HTTP date, e.g.
# "Wed, 09 Feb 1994 22:23:32 GMT".  The second fragment was previously a
# non-raw string, leaving "\d" as invalid escape sequences; both fragments
# are now raw strings.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
# Leading weekday name (abbreviated or full), optionally followed by a comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
# Loose date parse: day month year, optional clock, optional timezone,
# optional parenthesised timezone name.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
|
|---|
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        mon = MONTHS_LOWER.index(g[1].lower()) + 1
        # strict form is always GMT, so no timezone adjustment is needed
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = [None]*7

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is not None:
        day, mon, yr, hr, min, sec, tz = m.groups()
    else:
        return None  # bad format

    return _str2time(day, mon, yr, hr, min, sec, tz)
|
|---|
| 266 |
|
|---|
# ISO 8601-ish date, e.g. "1994-02-03 14:15:29 -0100" or "19940203T141529Z".
# The pattern was previously a non-raw string, leaving every "\d", "\s" etc.
# as invalid escape sequences; it is now a raw string.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
|
|---|
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # ignore leading whitespace, then try the single loose ISO regexp
    m = ISO_DATE_RE.search(text.lstrip())
    if m is None:
        return None  # bad format
    # XXX there's an extra bit of the timezone (the trailing nested group)
    # that is deliberately ignored here: is this the right thing to do?
    yr, mon, day, hr, min, sec, tz, _ = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
|
|---|
| 311 |
|
|---|
| 312 |
|
|---|
| 313 | # Header parsing
|
|---|
| 314 | # -----------------------------------------------------------------------------
|
|---|
| 315 |
|
|---|
def unmatched(match):
    """Return the parts of match.string that lie outside the matched span."""
    i, j = match.span(0)
    subject = match.string
    return subject[:i] + subject[j:]
|
|---|
| 320 |
|
|---|
# Scanner pieces for split_header_words, each anchored at the string start:
# a token is everything up to '=', ';', ',' or whitespace...
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
# ...a quoted value is '=' followed by a double-quoted string with
# backslash escapes...
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# ...an unquoted value runs to the next ';', ',' or whitespace...
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
# ...and this undoes backslash-escaping inside a quoted value.
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
|
|---|
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text   # kept only for the bug-report message below
        pairs = []
        # consume the string token by token; each regexp is anchored at the
        # start, and unmatched() strips what was just consumed
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # undo backslash-escaping inside the quoted string
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
|
|---|
| 409 |
|
|---|
# Characters that must be backslash-escaped inside a quoted value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")

def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        parts = []
        for key, val in pairs:
            if val is None:
                # a bare token has no "=value" part
                parts.append(key)
                continue
            if re.search(r"^\w+$", val) is None:
                # not a simple word: escape '"' and '\', then quote
                val = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", val)
                val = '"%s"' % val
            parts.append("%s=%s" % (key, val))
        if parts:
            headers.append("; ".join(parts))
    return ", ".join(headers)
|
|---|
| 435 |
|
|---|
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                # attribute names (everything after the first "name=value"
                # pair) are case-normalised; the cookie name itself is not
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch,
                    # stripping any surrounding double quotes first
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                # a cookie with no version attribute is a Netscape cookie
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
|
|---|
| 487 |
|
|---|
| 488 |
|
|---|
# A trailing ".<digits>" component, as in a dotted-quad IP address.
IPV4_RE = re.compile(r"\.\d+$")

def is_HDN(text):
    """Return True if text is a host domain name.

    XXX This may well be wrong.  Which RFC is HDN defined in, if any (for
    the purposes of RFC 2965)?  For the current implementation, what about
    IPv6?  Remember to look at other uses of IPV4_RE also, if change this.
    """
    if IPV4_RE.search(text) is not None:
        return False
    if not text:
        return False
    return not (text.startswith(".") or text.endswith("."))
|
|---|
| 504 |
|
|---|
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         * their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
           name string, B has the form .B', and B' is a HDN string.  (So,
           x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # If A or B is an IP address, only the direct string-compare branch of
    # the domain-match algorithm can apply.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    i = A.rfind(B)
    if i <= 0:
        # A does not have form NB, or N is the empty string
        return False
    if not B.startswith("."):
        return False
    return is_HDN(B[1:])
|
|---|
| 543 |
|
|---|
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    For accepting/blocking domains.

    """
    # Anything that does not end in a dotted numeric component qualifies.
    return IPV4_RE.search(text) is None
|
|---|
| 553 |
|
|---|
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # at least one side looks like an IP address: only exact equality
        # counts
        return A == B
    if B.startswith("."):
        # a leading dot means "match any host ending in B"
        return A.endswith(B)
    return A == B
|
|---|
| 573 |
|
|---|
# Trailing ":<port>" in a netloc/host string.
cut_port_re = re.compile(r":\d+$")

def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    url = request.get_full_url()
    host = urlparse.urlparse(url)[1]  # the netloc component
    if host == "":
        # URL had no netloc: fall back to the Host header, if any
        host = request.get_header("Host", "")

    # remove port, if present
    host = cut_port_re.sub("", host, 1)
    return host.lower()
|
|---|
| 590 |
|
|---|
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.

    """
    req_host = request_host(request)
    erhn = req_host
    # a dotless non-IP host gets ".local" appended to form the effective name
    if "." not in req_host and not IPV4_RE.search(req_host):
        erhn = req_host + ".local"
    return req_host, erhn
|
|---|
| 601 |
|
|---|
def request_path(request):
    """Return the request-URI (path, parameters, query and fragment),
    as defined by RFC 2965."""
    url = request.get_full_url()
    #scheme, netloc, path, parameters, query, frag = urlparse.urlparse(url)
    #req_path = escape_path("".join(urlparse.urlparse(url)[2:]))
    path, parameters, query, frag = urlparse.urlparse(url)[2:]
    if parameters:
        # re-attach the ";parameters" segment that urlparse split off
        path = "%s;%s" % (path, parameters)
    path = escape_path(path)
    # reassemble everything after the netloc; scheme and netloc left empty
    req_path = urlparse.urlunparse(("", "", path, "", query, frag))
    if not req_path.startswith("/"):
        # fix bad RFC 2396 absoluteURI
        req_path = "/"+req_path
    return req_path
|
|---|
| 616 |
|
|---|
def request_port(request):
    """Return the port of the request's host, as a string.

    Falls back to DEFAULT_HTTP_PORT when the host carries no explicit
    port, and returns None for a non-numeric port.
    """
    host = request.get_host()
    sep = host.find(':')
    if sep < 0:
        return DEFAULT_HTTP_PORT
    port = host[sep + 1:]
    try:
        int(port)  # validation only; the string form is returned
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
|
|---|
| 630 |
|
|---|
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A single "%xx" escape; group 1 is the two hex digits.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    # re.sub callback: rewrite a "%xx" escape with upper-case hex digits
    return "%%%s" % match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    path = urllib.quote(path, HTTP_PATH_SAFE)
    path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path)
    return path
|
|---|
| 652 |
|
|---|
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
          then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    dot = h.find(".")
    if dot >= 0:
        # split H as A.B on the first dot; A is h[:dot] (unused), B follows
        b = h[dot + 1:]
        if is_HDN(h) and ("." in b or b == "local"):
            return "." + b
    return h
|
|---|
| 687 |
|
|---|
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(request_host(request), origin_reach)
|
|---|
| 703 |
|
|---|
| 704 |
|
|---|
| 705 | class Cookie:
|
|---|
| 706 | """HTTP Cookie.
|
|---|
| 707 |
|
|---|
| 708 | This class represents both Netscape and RFC 2965 cookies.
|
|---|
| 709 |
|
|---|
| 710 | This is deliberately a very simple class. It just holds attributes. It's
|
|---|
| 711 | possible to construct Cookie instances that don't comply with the cookie
|
|---|
| 712 | standards. CookieJar.make_cookies is the factory function for Cookie
|
|---|
| 713 | objects -- it deals with cookie parsing, supplying defaults, and
|
|---|
| 714 | normalising to the representation used in this class. CookiePolicy is
|
|---|
| 715 | responsible for checking them to see whether they should be accepted from
|
|---|
| 716 | and returned to the server.
|
|---|
| 717 |
|
|---|
| 718 | Note that the port may be present in the headers, but unspecified ("Port"
|
|---|
| 719 | rather than"Port=80", for example); if this is the case, port is None.
|
|---|
| 720 |
|
|---|
| 721 | """
|
|---|
| 722 |
|
|---|
    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store already-parsed cookie-attributes on this instance.

        version and expires are coerced to int when not None; domain is
        lower-cased; rest (a mapping of nonstandard cookie-attributes) is
        shallow-copied.  Raises ValueError if port_specified is True while
        port is None.
        """
        if version is not None: version = int(version)
        if expires is not None: expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # shallow copy, so later mutation of the caller's mapping does not
        # leak into this cookie
        self._rest = copy.copy(rest)
|
|---|
| 764 |
|
|---|
    def has_nonstandard_attr(self, name):
        """Return True if a nonstandard cookie-attribute name was present."""
        return name in self._rest
|
|---|
| 767 | def get_nonstandard_attr(self, name, default=None):
|
|---|
| |
|---|