00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 """BSON (Binary JSON) encoding and decoding.
00016 """
00017
00018 import calendar
00019 import datetime
00020 import re
00021 import struct
00022 import warnings
00023
00024 from bson.binary import Binary
00025 from bson.code import Code
00026 from bson.dbref import DBRef
00027 from bson.errors import (InvalidBSON,
00028 InvalidDocument,
00029 InvalidStringData)
00030 from bson.max_key import MaxKey
00031 from bson.min_key import MinKey
00032 from bson.objectid import ObjectId
00033 from bson.son import SON
00034 from bson.timestamp import Timestamp
00035 from bson.tz_util import utc
00036
00037
# Optional C accelerator: `_use_c` records whether `_cbson` could be imported.
# When it is True, the pure-Python implementations defined further down are
# rebound to their `_cbson` counterparts.
try:
    import _cbson
    _use_c = True
except ImportError:
    _use_c = False

# `uuid` may not be importable in every environment; `_use_uuid` records
# whether it is available so UUID handling can be skipped when it is not.
try:
    import uuid
    _use_uuid = True
except ImportError:
    _use_uuid = False



# Type of compiled regular expression objects, used for isinstance checks
# when encoding values.
RE_TYPE = type(re.compile(""))
00053
00054
def _get_int(data, as_class=None, tz_aware=False, unsigned=False):
    """Decode a little-endian 32-bit integer from the start of `data`.

    `as_class` and `tz_aware` are accepted so this function can be used as
    an element getter; they are not used. When `unsigned` is true the four
    bytes are interpreted as an unsigned integer.

    Returns a ``(value, remaining_data)`` tuple. Raises
    :class:`~bson.errors.InvalidBSON` if the bytes cannot be unpacked
    (e.g. fewer than four bytes are available).
    """
    if unsigned:
        layout = "<I"
    else:
        layout = "<i"

    try:
        (number,) = struct.unpack(layout, data[:4])
    except struct.error:
        raise InvalidBSON()

    return (number, data[4:])
00063
00064
def _get_c_string(data, length=None):
    """Decode a NUL-terminated UTF-8 string from the start of `data`.

    If `length` is ``None`` the string ends at the first ``"\\x00"`` in
    `data`; otherwise the first `length` bytes are taken as the string.
    In both cases the single byte following the string is skipped.

    Returns a ``(unicode_string, remaining_data)`` tuple. Raises
    :class:`~bson.errors.InvalidBSON` if `length` is not given and `data`
    contains no NUL byte.
    """
    end = length
    if end is None:
        try:
            end = data.index("\x00")
        except ValueError:
            raise InvalidBSON()

    text = unicode(data[:end], "utf-8")
    # Skip the terminator that sits right after the string bytes.
    remainder = data[end + 1:]
    return (text, remainder)
00073
00074
def _make_c_string(string, check_null=False):
    """Encode `string` as a NUL-terminated UTF-8 byte string.

    :Parameters:
      - `string`: a :class:`str` or :class:`unicode` instance
      - `check_null` (optional): if ``True``, reject strings that contain
        a NUL character

    Unicode input is encoded as UTF-8. Byte-string input is returned
    unchanged (plus the terminator) after checking that it decodes as
    UTF-8.

    Raises :class:`~bson.errors.InvalidDocument` if `check_null` is set and
    `string` contains ``"\\x00"``. Raises
    :class:`~bson.errors.InvalidStringData` if a byte string is not valid
    UTF-8.
    """
    if check_null and "\x00" in string:
        raise InvalidDocument("BSON keys / regex patterns must not "
                              "contain a NULL character")
    if isinstance(string, unicode):
        return string.encode("utf-8") + "\x00"

    # Only a decoding failure means "not valid UTF-8". A bare ``except``
    # here would also turn KeyboardInterrupt and genuine programming
    # errors into InvalidStringData.
    try:
        string.decode("utf-8")
    except UnicodeError:
        raise InvalidStringData("strings in documents must be valid "
                                "UTF-8: %r" % string)
    return string + "\x00"
00088
00089
def _get_number(data, as_class, tz_aware):
    """Decode a little-endian 64-bit float from the start of `data`.

    `as_class` and `tz_aware` are unused. Returns a
    ``(value, remaining_data)`` tuple.
    """
    (number,) = struct.unpack("<d", data[:8])
    return (number, data[8:])
00092
00093
def _get_string(data, as_class, tz_aware):
    """Decode a length-prefixed string from the start of `data`.

    The first four bytes hold a little-endian signed length; the string
    itself is read by `_get_c_string` using that length minus one (the
    stored length counts the trailing NUL). `as_class` and `tz_aware` are
    unused. Returns a ``(unicode_string, remaining_data)`` tuple.
    """
    (stored_length,) = struct.unpack("<i", data[:4])
    return _get_c_string(data[4:], stored_length - 1)
00096
00097
def _get_object(data, as_class, tz_aware):
    """Decode an embedded document from the start of `data`.

    The document is decoded with `_bson_to_dict`. If it contains a
    ``"$ref"`` key it is converted to a :class:`~bson.dbref.DBRef` built
    from ``"$ref"``, ``"$id"`` and the optional ``"$db"``; any remaining
    keys are passed along as the extra fields.

    Returns a ``(value, remaining_data)`` tuple.
    """
    (document, data) = _bson_to_dict(data, as_class, tz_aware)
    if "$ref" in document:
        # "$id" defaults to None so that a stored document which has "$ref"
        # but no "$id" does not abort decoding with a KeyError.
        return (DBRef(document.pop("$ref"), document.pop("$id", None),
                      document.pop("$db", None), document), data)
    return (document, data)
00104
00105
def _get_array(data, as_class, tz_aware):
    """Decode an array from the start of `data`.

    The array is first decoded as a document via `_get_object`; its values
    are then collected by looking up the keys ``"0"``, ``"1"``, ... in
    order, stopping at the first missing index.

    Returns a ``(list, remaining_data)`` tuple.
    """
    (indexed, data) = _get_object(data, as_class, tz_aware)
    items = []
    position = 0
    try:
        while True:
            items.append(indexed[str(position)])
            position += 1
    except KeyError:
        # First missing index marks the end of the array.
        pass
    return (items, data)
00117
00118
def _get_binary(data, as_class, tz_aware):
    """Decode a binary element from the start of `data`.

    Layout read here: a 32-bit length, a one-byte subtype, then the
    payload. For subtype 2 the payload starts with a second 32-bit length
    which must equal the outer length minus four. Subtype 3 is returned as
    a :class:`uuid.UUID` when the `uuid` module is available; everything
    else is returned as a :class:`~bson.binary.Binary`.

    Returns a ``(value, remaining_data)`` tuple. Raises
    :class:`~bson.errors.InvalidBSON` if the two subtype-2 lengths
    disagree.
    """
    (size, data) = _get_int(data)
    kind = ord(data[0])
    data = data[1:]

    if kind == 2:
        (inner_size, data) = _get_int(data)
        if inner_size != size - 4:
            raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
        size = inner_size

    payload = data[:size]
    remainder = data[size:]
    if kind == 3 and _use_uuid:
        return (uuid.UUID(bytes=payload), remainder)
    return (Binary(payload, kind), remainder)
00131
00132
def _get_oid(data, as_class, tz_aware):
    """Decode an ObjectId from the first twelve bytes of `data`.

    `as_class` and `tz_aware` are unused. Returns an
    ``(ObjectId, remaining_data)`` tuple.
    """
    oid_bytes = data[:12]
    return (ObjectId(oid_bytes), data[12:])
00135
00136
def _get_boolean(data, as_class, tz_aware):
    """Decode a boolean from the first byte of `data`.

    The value is ``True`` only when the first byte equals ``"\\x01"``.
    `as_class` and `tz_aware` are unused. Returns a
    ``(bool, remaining_data)`` tuple.
    """
    flag = data[0] == "\x01"
    return (flag, data[1:])
00139
00140
def _get_date(data, as_class, tz_aware):
    """Decode a datetime from the start of `data`.

    The first eight bytes are a little-endian signed 64-bit count of
    milliseconds relative to 1970-01-01 00:00:00 UTC.

    :Parameters:
      - `data`: the raw bytes to decode
      - `as_class`: unused
      - `tz_aware`: if true, return an aware datetime whose tzinfo is
        `utc`; otherwise return a naive datetime holding UTC

    Returns a ``(datetime, remaining_data)`` tuple. Raises
    :class:`OverflowError` if the value falls outside the range
    :class:`~datetime.datetime` can represent.
    """
    (millis,) = struct.unpack("<q", data[:8])

    # Exact integer arithmetic from the epoch: unlike
    # ``utcfromtimestamp(float)`` this does not depend on the range of the
    # platform's C time functions (pre-1970 or far-future dates) and does
    # not pass the millisecond part through a float.
    epoch = datetime.datetime(1970, 1, 1)
    if tz_aware:
        epoch = epoch.replace(tzinfo=utc)
    return (epoch + datetime.timedelta(milliseconds=millis), data[8:])
00146
00147
def _get_code_w_scope(data, as_class, tz_aware):
    """Decode a "code with scope" element from the start of `data`.

    Reads (and discards) a leading 32-bit integer, then a length-prefixed
    string holding the code, then an embedded document holding the scope.

    Returns a ``(Code, remaining_data)`` tuple.
    """
    (_, data) = _get_int(data)
    # `_get_string` takes (data, as_class, tz_aware); calling it with only
    # `data` raised TypeError for every code-with-scope element.
    (code, data) = _get_string(data, as_class, tz_aware)
    (scope, data) = _get_object(data, as_class, tz_aware)
    return (Code(code, scope), data)
00153
00154
def _get_null(data, as_class, tz_aware):
    """Decode an element that carries no payload.

    Nothing is consumed from `data`; `as_class` and `tz_aware` are unused.
    Returns ``(None, data)``.
    """
    value = None
    return (value, data)
00157
00158
def _get_regex(data, as_class, tz_aware):
    """Decode a regular expression from the start of `data`.

    Two NUL-terminated strings are read: the pattern and a string of flag
    letters. Each recognised letter (``i``, ``l``, ``m``, ``s``, ``u``,
    ``x``) switches on the corresponding :mod:`re` flag.

    Returns a ``(compiled_pattern, remaining_data)`` tuple.
    """
    (pattern, data) = _get_c_string(data)
    (bson_flags, data) = _get_c_string(data)

    letter_to_flag = (
        ("i", re.IGNORECASE),
        ("l", re.LOCALE),
        ("m", re.MULTILINE),
        ("s", re.DOTALL),
        ("u", re.UNICODE),
        ("x", re.VERBOSE),
    )
    flags = 0
    for (letter, flag) in letter_to_flag:
        if letter in bson_flags:
            flags |= flag

    return (re.compile(pattern, flags), data)
00176
00177
def _get_ref(data, as_class, tz_aware):
    """Decode a database reference element from the start of `data`.

    The first four bytes are skipped (presumably the length prefix of the
    collection name -- confirm against the BSON spec), then a
    NUL-terminated collection name and a twelve-byte ObjectId are read.

    Returns a ``(DBRef, remaining_data)`` tuple.
    """
    (collection, data) = _get_c_string(data[4:])
    # `_get_oid` takes (data, as_class, tz_aware); calling it with only
    # `data` raised TypeError for every element of this type.
    (oid, data) = _get_oid(data, as_class, tz_aware)
    return (DBRef(collection, oid), data)
00182
00183
def _get_timestamp(data, as_class, tz_aware):
    """Decode a Timestamp from the start of `data`.

    Two unsigned 32-bit integers are read: the increment first, then the
    time. `as_class` and `tz_aware` are unused. Returns a
    ``(Timestamp, remaining_data)`` tuple.
    """
    (increment, data) = _get_int(data, unsigned=True)
    (seconds, data) = _get_int(data, unsigned=True)
    return (Timestamp(seconds, increment), data)
00188
00189
def _get_long(data, as_class, tz_aware):
    """Decode a little-endian signed 64-bit integer from the start of `data`.

    `as_class` and `tz_aware` are unused. Returns a
    ``(value, remaining_data)`` tuple.
    """
    (number,) = struct.unpack("<q", data[:8])
    return (number, data[8:])
00192
00193
# Dispatch table used by `_element_to_dict`: maps the one-character element
# type tag to the function that decodes a value of that type. Every entry is
# called as ``getter(data, as_class, tz_aware)`` and returns a
# ``(value, remaining_data)`` tuple. Several tags share a decoder.
_element_getter = {
    "\x01": _get_number,
    "\x02": _get_string,
    "\x03": _get_object,
    "\x04": _get_array,
    "\x05": _get_binary,
    "\x06": _get_null,
    "\x07": _get_oid,
    "\x08": _get_boolean,
    "\x09": _get_date,
    "\x0A": _get_null,
    "\x0B": _get_regex,
    "\x0C": _get_ref,
    "\x0D": _get_string,
    "\x0E": _get_string,
    "\x0F": _get_code_w_scope,
    "\x10": _get_int,
    "\x11": _get_timestamp,
    "\x12": _get_long,
    # MinKey / MaxKey carry no payload: the data is returned untouched.
    "\xFF": lambda x, y, z: (MinKey(), x),
    "\x7F": lambda x, y, z: (MaxKey(), x)}
00215
00216
def _element_to_dict(data, as_class, tz_aware):
    """Decode one element (type tag, name, value) from the start of `data`.

    The first byte selects the value decoder from `_element_getter`; it is
    followed by the NUL-terminated element name and then the value.

    Returns a ``(name, value, remaining_data)`` tuple.
    """
    type_tag = data[0]
    (name, data) = _get_c_string(data[1:])
    decode_value = _element_getter[type_tag]
    (value, data) = decode_value(data, as_class, tz_aware)
    return (name, value, data)
00222
00223
def _elements_to_dict(data, as_class, tz_aware):
    """Decode a run of elements into a new `as_class` instance.

    Elements are decoded one after another with `_element_to_dict` until
    `data` is exhausted; each is stored under its name in the result.
    """
    document = as_class()
    while data:
        (name, value, data) = _element_to_dict(data, as_class, tz_aware)
        document[name] = value
    return document
00230
00231
def _bson_to_dict(data, as_class, tz_aware):
    """Decode the first BSON document found at the start of `data`.

    :Parameters:
      - `data`: the raw bytes, starting with a document
      - `as_class`: the class to instantiate for the decoded document
      - `tz_aware`: passed through to the element decoders

    Returns a ``(document, remaining_data)`` tuple, where `remaining_data`
    is whatever follows the document.

    Raises :class:`~bson.errors.InvalidBSON` if the size prefix cannot be
    read, is smaller than the smallest possible document, is larger than
    the available data, or if the document does not end with a NUL byte.
    """
    try:
        (obj_size,) = struct.unpack("<i", data[:4])
    except struct.error:
        # Same error style as `_get_int`: truncated input is invalid BSON.
        raise InvalidBSON("not enough data for a BSON document size")

    # A document is at least the 4-byte size plus the terminating NUL.
    # Without this check a zero or negative size passes the tests below and
    # returns all of `data` as the remainder, so `decode_all` never ends.
    if obj_size < 5:
        raise InvalidBSON("invalid object size")
    if len(data) < obj_size:
        raise InvalidBSON("objsize too large")
    if data[obj_size - 1] != "\x00":
        raise InvalidBSON("bad eoo")

    elements = data[4:obj_size - 1]
    return (_elements_to_dict(elements, as_class, tz_aware), data[obj_size:])
# Rebind to the C implementation when the extension was imported.
if _use_c:
    _bson_to_dict = _cbson._bson_to_dict
00242
00243
def _element_to_bson(key, value, check_keys):
    """Encode a single ``key: value`` pair as a BSON element.

    :Parameters:
      - `key`: the element name; must be a :class:`basestring`
      - `value`: the value to encode
      - `check_keys`: if true, reject keys that start with ``$`` or
        contain ``.``

    Returns the encoded element: a one-byte type tag, the NUL-terminated
    name, then the type-specific payload.

    Raises :class:`~bson.errors.InvalidDocument` if `key` is not a string,
    fails the `check_keys` test, or if `value` is of a type that cannot be
    encoded. Raises :class:`OverflowError` for integers that do not fit in
    eight bytes.

    The order of the type tests matters: subclasses are tested before the
    types they derive from (``True``/``False`` before the integer test,
    and `Binary`/`Code` before the plain string test).
    """
    if not isinstance(key, basestring):
        raise InvalidDocument("documents must have only string keys, "
                              "key was %r" % key)

    if check_keys:
        if key.startswith("$"):
            raise InvalidDocument("key %r must not start with '$'" % key)
        if "." in key:
            raise InvalidDocument("key %r must not contain '.'" % key)

    name = _make_c_string(key, True)
    if isinstance(value, float):
        return "\x01" + name + struct.pack("<d", value)

    # Use the module-level availability flag instead of re-running
    # ``import uuid`` inside a try/except for every encoded element.
    if _use_uuid and isinstance(value, uuid.UUID):
        value = Binary(value.bytes, subtype=3)

    if isinstance(value, Binary):
        subtype = value.subtype
        if subtype == 2:
            # Subtype 2 carries its own inner length before the payload.
            value = struct.pack("<i", len(value)) + value
        return "\x05%s%s%s%s" % (name, struct.pack("<i", len(value)),
                                 chr(subtype), value)
    if isinstance(value, Code):
        cstring = _make_c_string(value)
        scope = _dict_to_bson(value.scope, False, False)
        # Total length counts both 4-byte length fields, the code string
        # and the scope document.
        full_length = struct.pack("<i", 8 + len(cstring) + len(scope))
        length = struct.pack("<i", len(cstring))
        return "\x0F" + name + full_length + length + cstring + scope
    if isinstance(value, basestring):
        cstring = _make_c_string(value)
        length = struct.pack("<i", len(cstring))
        return "\x02" + name + length + cstring
    if isinstance(value, dict):
        return "\x03" + name + _dict_to_bson(value, check_keys, False)
    if isinstance(value, (list, tuple)):
        # Arrays are encoded as documents keyed by "0", "1", ...
        as_dict = SON(zip([str(i) for i in range(len(value))], value))
        return "\x04" + name + _dict_to_bson(as_dict, check_keys, False)
    if isinstance(value, ObjectId):
        return "\x07" + name + value.binary
    if value is True:
        return "\x08" + name + "\x01"
    if value is False:
        return "\x08" + name + "\x00"
    if isinstance(value, (int, long)):
        if value > 2 ** 63 - 1 or value < -2 ** 63:
            raise OverflowError("BSON can only handle up to 8-byte ints")
        if value > 2 ** 31 - 1 or value < -2 ** 31:
            return "\x12" + name + struct.pack("<q", value)
        return "\x10" + name + struct.pack("<i", value)
    if isinstance(value, datetime.datetime):
        if value.utcoffset() is not None:
            # Normalise aware datetimes to UTC before taking the timestamp.
            value = value - value.utcoffset()
        millis = int(calendar.timegm(value.timetuple()) * 1000 +
                     value.microsecond // 1000)
        return "\x09" + name + struct.pack("<q", millis)
    if isinstance(value, Timestamp):
        time = struct.pack("<I", value.time)
        inc = struct.pack("<I", value.inc)
        # The increment is written before the time.
        return "\x11" + name + inc + time
    if value is None:
        return "\x0A" + name
    if isinstance(value, RE_TYPE):
        pattern = value.pattern
        flags = ""
        if value.flags & re.IGNORECASE:
            flags += "i"
        if value.flags & re.LOCALE:
            flags += "l"
        if value.flags & re.MULTILINE:
            flags += "m"
        if value.flags & re.DOTALL:
            flags += "s"
        if value.flags & re.UNICODE:
            flags += "u"
        if value.flags & re.VERBOSE:
            flags += "x"
        return "\x0B" + name + _make_c_string(pattern, True) + \
            _make_c_string(flags)
    if isinstance(value, DBRef):
        # Encoded as its document form; "$"-prefixed keys must be allowed.
        return _element_to_bson(key, value.as_doc(), False)
    if isinstance(value, MinKey):
        return "\xFF" + name
    if isinstance(value, MaxKey):
        return "\x7F" + name

    raise InvalidDocument("cannot convert value of type %s to bson" %
                          type(value))
00344
00345
def _dict_to_bson(dict, check_keys, top_level=True):
    """Encode a mapping as a BSON document.

    :Parameters:
      - `dict`: the mapping to encode
      - `check_keys`: passed to `_element_to_bson` for every key except a
        top-level ``"_id"``
      - `top_level` (optional): if true, an ``"_id"`` key (when present) is
        written first

    Returns the encoded document: a 4-byte length, the elements, and a
    terminating NUL. Raises :class:`TypeError` if an
    :class:`AttributeError` occurs while encoding (reported as `dict` not
    being a mapping type).
    """
    try:
        pieces = []
        if top_level and "_id" in dict:
            pieces.append(_element_to_bson("_id", dict["_id"], False))
        for (key, value) in dict.iteritems():
            if not top_level or key != "_id":
                pieces.append(_element_to_bson(key, value, check_keys))
    except AttributeError:
        raise TypeError("encoder expected a mapping type but got: %r" % dict)

    body = "".join(pieces)
    # The length covers its own 4 bytes and the trailing NUL.
    return struct.pack("<i", len(body) + 5) + body + "\x00"
# Rebind to the C implementation when the extension was imported.
if _use_c:
    _dict_to_bson = _cbson._dict_to_bson
00361
00362
def _to_dicts(data, as_class=dict, tz_aware=True):
    """DEPRECATED - `_to_dicts` has been renamed to `decode_all`.

    Emits a :class:`DeprecationWarning` and forwards all arguments to
    :meth:`decode_all`.

    .. versionchanged:: 1.9
       Deprecated in favor of :meth:`decode_all`.
    .. versionadded:: 1.7
       The `as_class` parameter.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than
    # to this wrapper, so users can find the code that needs updating.
    warnings.warn("`_to_dicts` has been renamed to `decode_all`",
                  DeprecationWarning, stacklevel=2)
    return decode_all(data, as_class, tz_aware)
00374
00375
def decode_all(data, as_class=dict, tz_aware=True):
    """Decode BSON data to multiple documents.

    `data` must be a string of concatenated, valid, BSON-encoded
    documents. Documents are decoded one after another until `data` is
    exhausted.

    :Parameters:
      - `data`: BSON data
      - `as_class` (optional): the class to use for the resulting
        documents
      - `tz_aware` (optional): if ``True``, return timezone-aware
        :class:`~datetime.datetime` instances

    .. versionadded:: 1.9
    """
    documents = []
    remaining = data
    while len(remaining) > 0:
        (document, remaining) = _bson_to_dict(remaining, as_class, tz_aware)
        documents.append(document)
    return documents
# Rebind to the C implementation when the extension was imported.
if _use_c:
    decode_all = _cbson.decode_all
00398
00399
def is_valid(bson):
    """Check that the given string represents valid :class:`BSON` data.

    Raises :class:`TypeError` if `bson` is not an instance of
    :class:`str`. Returns ``True`` if `bson` decodes as exactly one
    document with nothing left over, ``False`` otherwise.

    :Parameters:
      - `bson`: the data to be validated
    """
    if not isinstance(bson, str):
        raise TypeError("BSON data must be an instance of a subclass of str")

    # Any decoding error means "not valid". ``except Exception`` (rather
    # than a bare ``except``) lets KeyboardInterrupt and SystemExit
    # propagate instead of being reported as invalid data.
    try:
        (_, remainder) = _bson_to_dict(bson, dict, True)
    except Exception:
        return False
    return remainder == ""
00418
00419
class BSON(str):
    """BSON (Binary JSON) data.

    A :class:`str` subclass holding one encoded document; use
    :meth:`encode` to create an instance from a mapping and :meth:`decode`
    to turn it back into a document.
    """

    @classmethod
    def from_dict(cls, dct, check_keys=False):
        """DEPRECATED - `from_dict` has been renamed to `encode`.

        .. versionchanged:: 1.9
           Deprecated in favor of :meth:`encode`
        """
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn("`from_dict` has been renamed to `encode`",
                      DeprecationWarning, stacklevel=2)
        return cls.encode(dct, check_keys)

    @classmethod
    def encode(cls, document, check_keys=False):
        """Encode a document to a new :class:`BSON` instance.

        A document can be any mapping type (like :class:`dict`).

        Raises :class:`TypeError` if `document` is not a mapping type,
        or contains keys that are not instances of
        :class:`basestring`. Raises
        :class:`~bson.errors.InvalidDocument` if `document` cannot be
        converted to :class:`BSON`.

        :Parameters:
          - `document`: mapping type representing a document
          - `check_keys` (optional): check if keys start with '$' or
            contain '.', raising :class:`~bson.errors.InvalidDocument` in
            either case

        .. versionadded:: 1.9
        """
        return cls(_dict_to_bson(document, check_keys))

    def to_dict(self, as_class=dict, tz_aware=False):
        """DEPRECATED - `to_dict` has been renamed to `decode`.

        .. versionchanged:: 1.9
           Deprecated in favor of :meth:`decode`
        .. versionadded:: 1.8
           The `tz_aware` parameter.
        .. versionadded:: 1.7
           The `as_class` parameter.
        """
        # stacklevel=2 attributes the warning to the caller's line.
        warnings.warn("`to_dict` has been renamed to `decode`",
                      DeprecationWarning, stacklevel=2)
        return self.decode(as_class, tz_aware)

    def decode(self, as_class=dict, tz_aware=False):
        """Decode this BSON data.

        The default type to use for the resultant document is
        :class:`dict`. Any other class that supports
        :meth:`__setitem__` can be used instead by passing it as the
        `as_class` parameter.

        If `tz_aware` is ``True`` (recommended), any
        :class:`~datetime.datetime` instances returned will be
        timezone-aware, with their timezone set to
        :attr:`bson.tz_util.utc`. Otherwise (default), all
        :class:`~datetime.datetime` instances will be naive (but
        contain UTC).

        :Parameters:
          - `as_class` (optional): the class to use for the resulting
            document
          - `tz_aware` (optional): if ``True``, return timezone-aware
            :class:`~datetime.datetime` instances

        .. versionadded:: 1.9
        """
        # Only the first document is returned; trailing data is ignored.
        (document, _) = _bson_to_dict(self, as_class, tz_aware)
        return document
00496
00497
def has_c():
    """Is the C extension installed?

    Returns ``True`` if ``_cbson`` can be imported from the ``bson``
    package, ``False`` if that import raises :class:`ImportError`.

    .. versionadded:: 1.9
    """
    try:
        from bson import _cbson
    except ImportError:
        return False
    return True