00015 """Tools for representing files stored in GridFS."""
00016
00017 import datetime
00018 import math
00019 import os
00020 try:
00021 from cStringIO import StringIO
00022 except ImportError:
00023 from StringIO import StringIO
00024
00025 from bson.binary import Binary
00026 from bson.objectid import ObjectId
00027 from gridfs.errors import (CorruptGridFile,
00028 FileExists,
00029 NoFile,
00030 UnsupportedAPI)
00031 from pymongo import ASCENDING
00032 from pymongo.collection import Collection
00033 from pymongo.errors import DuplicateKeyError
00034
00035 try:
00036 _SEEK_SET = os.SEEK_SET
00037 _SEEK_CUR = os.SEEK_CUR
00038 _SEEK_END = os.SEEK_END
00039
00040 except AttributeError:
00041 _SEEK_SET = 0
00042 _SEEK_CUR = 1
00043 _SEEK_END = 2
00044
00045
00046 """Default chunk size, in bytes."""
00047 DEFAULT_CHUNK_SIZE = 256 * 1024
00048
00049


def _create_property(field_name, docstring,
                     read_only=False, closed_only=False):
    """Helper for creating properties to read/write to files.
    """
    def getter(self):
        if closed_only and not self._closed:
            raise AttributeError("can only get %r on a closed file" %
                                 field_name)
        return self._file.get(field_name, None)

    def setter(self, value):
        if self._closed:
            self._coll.files.update({"_id": self._file["_id"]},
                                    {"$set": {field_name: value}}, safe=True)
        self._file[field_name] = value

    if read_only:
        docstring = docstring + "\n\nThis attribute is read-only."
    elif closed_only:
        docstring = "%s\n\n%s" % (docstring, "This attribute is read-only and "
                                  "can only be read after :meth:`close` "
                                  "has been called.")

    if not read_only and not closed_only:
        return property(getter, setter, doc=docstring)
    return property(getter, doc=docstring)
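
# A quick illustration of the three property flavors this helper produces
# (a sketch, not part of the module; ``f`` stands for a GridIn instance):
#
#   f.filename = "renamed.txt"   # plain property: settable any time
#   f.chunk_size = 1024          # read_only property: setting raises
#                                # AttributeError
#   f.length                     # closed_only property: AttributeError until
#                                # close() has been called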


class GridIn(object):
    """Class to write data to GridFS.
    """
    def __init__(self, root_collection, **kwargs):
        """Write a file to GridFS

        Application developers should generally not need to
        instantiate this class directly - instead see the methods
        provided by :class:`~gridfs.GridFS`.

        Raises :class:`TypeError` if `root_collection` is not an
        instance of :class:`~pymongo.collection.Collection`.

        Any of the file level options specified in the `GridFS Spec
        <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as
        keyword arguments. Any additional keyword arguments will be
        set as additional fields on the file document. Valid keyword
        arguments include:

          - ``"_id"``: unique ID for this file (default:
            :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must
            not have already been used for another file

          - ``"filename"``: human name for the file

          - ``"contentType"`` or ``"content_type"``: valid mime-type
            for the file

          - ``"chunkSize"`` or ``"chunk_size"``: size of each of the
            chunks, in bytes (default: 256 kb)

          - ``"encoding"``: encoding used for this file - any
            :class:`unicode` that is written to the file will be
            converted to a :class:`str` with this encoding

        :Parameters:
          - `root_collection`: root collection to write to
          - `**kwargs` (optional): file level options (see above)
        """
        if not isinstance(root_collection, Collection):
            raise TypeError("root_collection must be an "
                            "instance of Collection")

        # Handle alternative naming
        if "content_type" in kwargs:
            kwargs["contentType"] = kwargs.pop("content_type")
        if "chunk_size" in kwargs:
            kwargs["chunkSize"] = kwargs.pop("chunk_size")

        # Defaults
        kwargs["_id"] = kwargs.get("_id", ObjectId())
        kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)

        root_collection.chunks.ensure_index([("files_id", ASCENDING),
                                             ("n", ASCENDING)],
                                            unique=True)
        object.__setattr__(self, "_coll", root_collection)
        object.__setattr__(self, "_chunks", root_collection.chunks)
        object.__setattr__(self, "_file", kwargs)
        object.__setattr__(self, "_buffer", StringIO())
        object.__setattr__(self, "_position", 0)
        object.__setattr__(self, "_chunk_number", 0)
        object.__setattr__(self, "_closed", False)
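
    # A minimal usage sketch (not part of the class; assumes a reachable
    # local mongod and uses the hypothetical database name ``example``):
    #
    #   from pymongo import Connection
    #   db = Connection().example
    #   f = GridIn(db.fs, filename="hello.txt", content_type="text/plain")
    #   f.write("hello world")
    #   f.close()
    #   print f._id, f.length, f.md5   # length and md5 readable once closed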

    @property
    def closed(self):
        """Is this file closed?
        """
        return self._closed

    _id = _create_property("_id", "The ``'_id'`` value for this file.",
                           read_only=True)
    filename = _create_property("filename", "Name of this file.")
    content_type = _create_property("contentType", "Mime-type for this file.")
    length = _create_property("length", "Length (in bytes) of this file.",
                              closed_only=True)
    chunk_size = _create_property("chunkSize", "Chunk size for this file.",
                                  read_only=True)
    upload_date = _create_property("uploadDate",
                                   "Date that this file was uploaded.",
                                   closed_only=True)
    md5 = _create_property("md5", "MD5 of the contents of this file "
                           "(generated on the server).",
                           closed_only=True)

    def __getattr__(self, name):
        if name in self._file:
            return self._file[name]
        raise AttributeError("GridIn object has no attribute '%s'" % name)

    def __setattr__(self, name, value):
        object.__setattr__(self, name, value)
        if self._closed:
            self._coll.files.update({"_id": self._file["_id"]},
                                    {"$set": {name: value}}, safe=True)

    def __flush_data(self, data):
        """Flush `data` to a chunk.
        """
        if not data:
            return
        assert(len(data) <= self.chunk_size)

        chunk = {"files_id": self._file["_id"],
                 "n": self._chunk_number,
                 "data": Binary(data)}

        self._chunks.insert(chunk)
        self._chunk_number += 1
        self._position += len(data)

    def __flush_buffer(self):
        """Flush the buffer contents out to a chunk.
        """
        self.__flush_data(self._buffer.getvalue())
        self._buffer.close()
        self._buffer = StringIO()

    def __flush(self):
        """Flush the file to the database.
        """
        self.__flush_buffer()

        md5 = self._coll.database.command("filemd5", self._id,
                                          root=self._coll.name)["md5"]

        self._file["md5"] = md5
        self._file["length"] = self._position
        self._file["uploadDate"] = datetime.datetime.utcnow()

        try:
            return self._coll.files.insert(self._file, safe=True)
        except DuplicateKeyError:
            raise FileExists("file with _id %r already exists" % self._id)

    def close(self):
        """Flush the file and close it.

        A closed file cannot be written any more. Calling
        :meth:`close` more than once is allowed.
        """
        if not self._closed:
            self.__flush()
            self._closed = True

    def write(self, data):
        """Write data to the file. There is no return value.

        `data` can be either a string of bytes or a file-like object
        (implementing :meth:`read`). If the file has an
        :attr:`encoding` attribute, `data` can also be a
        :class:`unicode` instance, which will be encoded as
        :attr:`encoding` before being written.

        Due to buffering, the data may not actually be written to the
        database until the :meth:`close` method is called. Raises
        :class:`ValueError` if this file is already closed. Raises
        :class:`TypeError` if `data` is not an instance of
        :class:`str`, a file-like object, or an instance of
        :class:`unicode` (only allowed if the file has an
        :attr:`encoding` attribute).

        :Parameters:
          - `data`: string of bytes or file-like object to be written
            to the file

        .. versionadded:: 1.9
           The ability to write :class:`unicode`, if the file has an
           :attr:`encoding` attribute.
        """
        if self._closed:
            raise ValueError("cannot write to a closed file")

        try:
            # file-like object: read and flush one full chunk at a time
            if self._buffer.tell() > 0:
                space = self.chunk_size - self._buffer.tell()
                self._buffer.write(data.read(space))
                self.__flush_buffer()
            to_write = data.read(self.chunk_size)
            while to_write and len(to_write) == self.chunk_size:
                self.__flush_data(to_write)
                to_write = data.read(self.chunk_size)
            self._buffer.write(to_write)
        except AttributeError:
            # string: encode unicode if possible, then buffer chunk by chunk
            if not isinstance(data, basestring):
                raise TypeError("can only write strings or file-like objects")

            if isinstance(data, unicode):
                try:
                    data = data.encode(self.encoding)
                except AttributeError:
                    raise TypeError("must specify an encoding for file in "
                                    "order to write unicode")

            while data:
                space = self.chunk_size - self._buffer.tell()

                if len(data) <= space:
                    self._buffer.write(data)
                    break
                else:
                    self._buffer.write(data[:space])
                    self.__flush_buffer()
                    data = data[space:]
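
    # A sketch of the accepted ``data`` types (assumes ``f`` is an open
    # GridIn created with ``encoding="utf-8"``; names here are illustrative):
    #
    #   f.write("plain bytes")                   # str
    #   f.write(u"snowman: \u2603")              # unicode, encoded as utf-8
    #   f.write(open("/tmp/payload.bin", "rb"))  # any object with .read()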

    def writelines(self, sequence):
        """Write a sequence of strings to the file.

        Does not add separators.
        """
        for line in sequence:
            self.write(line)

    def __enter__(self):
        """Support for the context manager protocol.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Support for the context manager protocol.

        Close the file and allow exceptions to propagate.
        """
        self.close()

        # propagate exceptions
        return False
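
    # Context-manager sketch (hypothetical root collection ``db.fs``): the
    # file is closed, and therefore flushed to the server, when the block
    # exits:
    #
    #   with GridIn(db.fs, filename="log.txt") as f:
    #       f.writelines(["line one\n", "line two\n"])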


class GridOut(object):
    """Class to read data out of GridFS.
    """
    def __init__(self, root_collection, file_id=None, file_document=None):
        """Read a file from GridFS

        Application developers should generally not need to
        instantiate this class directly - instead see the methods
        provided by :class:`~gridfs.GridFS`.

        Either `file_id` or `file_document` must be specified;
        `file_document` is given priority if present. Raises
        :class:`TypeError` if `root_collection` is not an instance of
        :class:`~pymongo.collection.Collection`.

        :Parameters:
          - `root_collection`: root collection to read from
          - `file_id`: value of ``"_id"`` for the file to read
          - `file_document`: file document from `root_collection.files`

        .. versionadded:: 1.9
           The `file_document` parameter.
        """
        if not isinstance(root_collection, Collection):
            raise TypeError("root_collection must be an "
                            "instance of Collection")

        self.__chunks = root_collection.chunks

        files = root_collection.files
        self._file = file_document or files.find_one({"_id": file_id})

        if not self._file:
            raise NoFile("no file in gridfs collection %r with _id %r" %
                         (files, file_id))

        self.__buffer = ""
        self.__position = 0
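
    # A minimal read sketch (assumes the same hypothetical ``db.fs`` root
    # collection used above and an ``_id`` saved from a previous GridIn):
    #
    #   out = GridOut(db.fs, file_id)
    #   print out.name, out.length, out.content_type
    #   contents = out.read()
    #
    # Passing ``file_document=db.fs.files.find_one(...)`` instead of an id
    # skips the extra ``find_one`` query.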

    _id = _create_property("_id", "The ``'_id'`` value for this file.", True)
    name = _create_property("filename", "Name of this file.", True)
    content_type = _create_property("contentType", "Mime-type for this file.",
                                    True)
    length = _create_property("length", "Length (in bytes) of this file.",
                              True)
    chunk_size = _create_property("chunkSize", "Chunk size for this file.",
                                  True)
    upload_date = _create_property("uploadDate",
                                   "Date that this file was first uploaded.",
                                   True)
    aliases = _create_property("aliases", "List of aliases for this file.",
                               True)
    metadata = _create_property("metadata", "Metadata attached to this file.",
                                True)
    md5 = _create_property("md5", "MD5 of the contents of this file "
                           "(generated on the server).", True)

    def __getattr__(self, name):
        if name in self._file:
            return self._file[name]
        raise AttributeError("GridOut object has no attribute '%s'" % name)

    def read(self, size=-1):
        """Read at most `size` bytes from the file (less if there
        isn't enough data).

        The bytes are returned as an instance of :class:`str`. If
        `size` is negative or omitted all data is read.

        :Parameters:
          - `size` (optional): the number of bytes to read
        """
        if size == 0:
            return ""

        remainder = int(self.length) - self.__position
        if size < 0 or size > remainder:
            size = remainder

        received = len(self.__buffer)
        chunk_number = (received + self.__position) / self.chunk_size
        chunks = []

        while received < size:
            chunk = self.__chunks.find_one({"files_id": self._id,
                                            "n": chunk_number})
            if not chunk:
                raise CorruptGridFile("no chunk #%d" % chunk_number)

            if received:
                chunk_data = chunk["data"]
            else:
                chunk_data = chunk["data"][self.__position % self.chunk_size:]

            received += len(chunk_data)
            chunks.append(chunk_data)
            chunk_number += 1

        data = "".join([self.__buffer] + chunks)
        self.__position += size
        to_return = data[:size]
        self.__buffer = data[size:]
        return to_return
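
    # Read sketch (``out`` is a GridOut as in the example above):
    #
    #   header = out.read(16)   # first 16 bytes
    #   rest = out.read()       # everything that remains
    #   out.read()              # "" - already at end of file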

    def readline(self, size=-1):
        """Read one line or up to `size` bytes from the file.

        :Parameters:
          - `size` (optional): the maximum number of bytes to read

        .. versionadded:: 1.9
        """
        bytes = ""
        while len(bytes) != size:
            byte = self.read(1)
            bytes += byte
            if byte == "" or byte == "\n":
                break
        return bytes

    def tell(self):
        """Return the current position of this file.
        """
        return self.__position

    def seek(self, pos, whence=_SEEK_SET):
        """Set the current position of this file.

        :Parameters:
          - `pos`: the position (or offset if using relative
            positioning) to seek to
          - `whence` (optional): where to seek
            from. :attr:`os.SEEK_SET` (``0``) for absolute file
            positioning, :attr:`os.SEEK_CUR` (``1``) to seek relative
            to the current position, :attr:`os.SEEK_END` (``2``) to
            seek relative to the file's end.
        """
        if whence == _SEEK_SET:
            new_pos = pos
        elif whence == _SEEK_CUR:
            new_pos = self.__position + pos
        elif whence == _SEEK_END:
            new_pos = int(self.length) + pos
        else:
            raise IOError(22, "Invalid value for `whence`")

        if new_pos < 0:
            raise IOError(22, "Invalid value for `pos` - must be non-negative")

        self.__position = new_pos
        self.__buffer = ""
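
    # Positioning sketch (``out`` is a GridOut instance; the os.SEEK_* values
    # match the module-level _SEEK_* fallbacks):
    #
    #   out.seek(0)                  # rewind to the start
    #   out.seek(-10, os.SEEK_END)   # position 10 bytes before the end
    #   out.tell()                   # current byte offset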

    def __iter__(self):
        """Return an iterator over all of this file's data.

        The iterator will return chunk-sized instances of
        :class:`str`. This can be useful when serving files using a
        webserver that handles such an iterator efficiently.
        """
        return GridOutIterator(self, self.__chunks)
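
    # Streaming sketch: a WSGI-style handler (hypothetical) can hand the
    # iterator straight to the server, one chunk-sized str per iteration:
    #
    #   def application(environ, start_response):
    #       out = GridOut(db.fs, file_id)
    #       start_response("200 OK", [("Content-Length", str(out.length))])
    #       return iter(out)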


class GridOutIterator(object):
    def __init__(self, grid_out, chunks):
        self.__id = grid_out._id
        self.__chunks = chunks
        self.__current_chunk = 0
        self.__max_chunk = math.ceil(float(grid_out.length) /
                                     grid_out.chunk_size)

    def __iter__(self):
        return self

    def next(self):
        if self.__current_chunk >= self.__max_chunk:
            raise StopIteration
        chunk = self.__chunks.find_one({"files_id": self.__id,
                                        "n": self.__current_chunk})
        if not chunk:
            raise CorruptGridFile("no chunk #%d" % self.__current_chunk)
        self.__current_chunk += 1
        return str(chunk["data"])


class GridFile(object):
    """No longer supported.

    .. versionchanged:: 1.6
       The GridFile class is no longer supported.
    """
    def __init__(self, *args, **kwargs):
        raise UnsupportedAPI("The GridFile class is no longer supported. "
                             "Please use GridIn or GridOut instead.")