00015 """Tools for representing files stored in GridFS."""
00016
00017 import datetime
00018 import math
00019 import os
00020 try:
00021 from cStringIO import StringIO
00022 except ImportError:
00023 from StringIO import StringIO
00024
00025 from bson.binary import Binary
00026 from bson.objectid import ObjectId
00027 from gridfs.errors import (CorruptGridFile,
00028 FileExists,
00029 NoFile,
00030 UnsupportedAPI)
00031 from pymongo import ASCENDING
00032 from pymongo.collection import Collection
00033 from pymongo.errors import DuplicateKeyError
00034
00035 try:
00036 _SEEK_SET = os.SEEK_SET
00037 _SEEK_CUR = os.SEEK_CUR
00038 _SEEK_END = os.SEEK_END
00039
00040 except AttributeError:
00041 _SEEK_SET = 0
00042 _SEEK_CUR = 1
00043 _SEEK_END = 2
00044
00045
00046 """Default chunk size, in bytes."""
00047 DEFAULT_CHUNK_SIZE = 256 * 1024
00048
00049


def _create_property(field_name, docstring,
                     read_only=False, closed_only=False):
    """Helper for creating properties to read/write to files.
    """
    def getter(self):
        if closed_only and not self._closed:
            raise AttributeError("can only get %r on a closed file" %
                                 field_name)
        return self._file.get(field_name, None)

    def setter(self, value):
        if self._closed:
            self._coll.files.update({"_id": self._file["_id"]},
                                    {"$set": {field_name: value}}, safe=True)
        self._file[field_name] = value

    if read_only:
        docstring = docstring + "\n\nThis attribute is read-only."
    elif closed_only:
        docstring = "%s\n\n%s" % (docstring, "This attribute is read-only and "
                                  "can only be read after :meth:`close` "
                                  "has been called.")

    if not read_only and not closed_only:
        return property(getter, setter, doc=docstring)
    return property(getter, doc=docstring)
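
# A quick illustration of the three property flavors this helper produces
# (a sketch, not part of the module; ``f`` stands for a GridIn instance):
#
#   f.filename = "renamed.txt"   # plain property: settable any time
#   f.chunk_size = 1024          # read_only property: setting raises
#                                # AttributeError
#   f.length                     # closed_only property: AttributeError until
#                                # close() has been called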


class GridIn(object):
    """Class to write data to GridFS.
    """
    def __init__(self, root_collection, **kwargs):
        """Write a file to GridFS

        Application developers should generally not need to
        instantiate this class directly - instead see the methods
        provided by :class:`~gridfs.GridFS`.

        Raises :class:`TypeError` if `root_collection` is not an
        instance of :class:`~pymongo.collection.Collection`.

        Any of the file level options specified in the `GridFS Spec
        <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as
        keyword arguments. Any additional keyword arguments will be
        set as additional fields on the file document. Valid keyword
        arguments include:

          - ``"_id"``: unique ID for this file (default:
            :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must
            not have already been used for another file

          - ``"filename"``: human name for the file

          - ``"contentType"`` or ``"content_type"``: valid mime-type
            for the file

          - ``"chunkSize"`` or ``"chunk_size"``: size of each of the
            chunks, in bytes (default: 256 kb)

          - ``"encoding"``: encoding used for this file - any
            :class:`unicode` that is written to the file will be
            converted to a :class:`str` with this encoding

        :Parameters:
          - `root_collection`: root collection to write to
          - `**kwargs` (optional): file level options (see above)
        """
        if not isinstance(root_collection, Collection):
            raise TypeError("root_collection must be an "
                            "instance of Collection")

        # Handle alternative naming
        if "content_type" in kwargs:
            kwargs["contentType"] = kwargs.pop("content_type")
        if "chunk_size" in kwargs:
            kwargs["chunkSize"] = kwargs.pop("chunk_size")

        # Defaults
        kwargs["_id"] = kwargs.get("_id", ObjectId())
        kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)

        root_collection.chunks.ensure_index([("files_id", ASCENDING),
                                             ("n", ASCENDING)],
                                            unique=True)
        object.__setattr__(self, "_coll", root_collection)
        object.__setattr__(self, "_chunks", root_collection.chunks)
        object.__setattr__(self, "_file", kwargs)
        object.__setattr__(self, "_buffer", StringIO())
        object.__setattr__(self, "_position", 0)
        object.__setattr__(self, "_chunk_number", 0)
        object.__setattr__(self, "_closed", False)
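
    # A minimal usage sketch (not part of the class; assumes a reachable
    # local mongod and uses the hypothetical database name ``example``):
    #
    #   from pymongo import Connection
    #   db = Connection().example
    #   f = GridIn(db.fs, filename="hello.txt", content_type="text/plain")
    #   f.write("hello world")
    #   f.close()
    #   print f._id, f.length, f.md5   # length and md5 readable once closed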

    @property
    def closed(self):
        """Is this file closed?
        """
        return self._closed

    _id = _create_property("_id", "The ``'_id'`` value for this file.",
                           read_only=True)
    filename = _create_property("filename", "Name of this file.")
    content_type = _create_property("contentType", "Mime-type for this file.")
    length = _create_property("length", "Length (in bytes) of this file.",
                              closed_only=True)
    chunk_size = _create_property("chunkSize", "Chunk size for this file.",
                                  read_only=True)
    upload_date = _create_property("uploadDate",
                                   "Date that this file was uploaded.",
                                   closed_only=True)
    md5 = _create_property("md5", "MD5 of the contents of this file "
                           "(generated on the server).",
                           closed_only=True)

    def __getattr__(self, name):
        if name in self._file:
            return self._file[name]
        raise AttributeError("GridIn object has no attribute '%s'" % name)

    def __setattr__(self, name, value):
        object.__setattr__(self, name, value)
        if self._closed:
            self._coll.files.update({"_id": self._file["_id"]},
                                    {"$set": {name: value}}, safe=True)

    def __flush_data(self, data):
        """Flush `data` to a chunk.
        """
        if not data:
            return
        assert(len(data) <= self.chunk_size)

        chunk = {"files_id": self._file["_id"],
                 "n": self._chunk_number,
                 "data": Binary(data)}

        self._chunks.insert(chunk)
        self._chunk_number += 1
        self._position += len(data)

    def __flush_buffer(self):
        """Flush the buffer contents out to a chunk.
        """
        self.__flush_data(self._buffer.getvalue())
        self._buffer.close()
        self._buffer = StringIO()

    def __flush(self):
        """Flush the file to the database.
        """
        self.__flush_buffer()

        md5 = self._coll.database.command("filemd5", self._id,
                                          root=self._coll.name)["md5"]

        self._file["md5"] = md5
        self._file["length"] = self._position
        self._file["uploadDate"] = datetime.datetime.utcnow()

        try:
            return self._coll.files.insert(self._file, safe=True)
        except DuplicateKeyError:
            raise FileExists("file with _id %r already exists" % self._id)

    def close(self):
        """Flush the file and close it.

        A closed file cannot be written any more. Calling
        :meth:`close` more than once is allowed.
        """
        if not self._closed:
            self.__flush()
            self._closed = True

    def write(self, data):
        """Write data to the file. There is no return value.

        `data` can be either a string of bytes or a file-like object
        (implementing :meth:`read`). If the file has an
        :attr:`encoding` attribute, `data` can also be a
        :class:`unicode` instance, which will be encoded as
        :attr:`encoding` before being written.

        Due to buffering, the data may not actually be written to the
        database until the :meth:`close` method is called. Raises
        :class:`ValueError` if this file is already closed. Raises
        :class:`TypeError` if `data` is not an instance of
        :class:`str`, a file-like object, or an instance of
        :class:`unicode` (only allowed if the file has an
        :attr:`encoding` attribute).

        :Parameters:
          - `data`: string of bytes or file-like object to be written
            to the file

        .. versionadded:: 1.9
           The ability to write :class:`unicode`, if the file has an
           :attr:`encoding` attribute.
        """
        if self._closed:
            raise ValueError("cannot write to a closed file")

        try:
            # file-like object: read and flush one full chunk at a time
            if self._buffer.tell() > 0:
                space = self.chunk_size - self._buffer.tell()
                self._buffer.write(data.read(space))
                self.__flush_buffer()
            to_write = data.read(self.chunk_size)
            while to_write and len(to_write) == self.chunk_size:
                self.__flush_data(to_write)
                to_write = data.read(self.chunk_size)
            self._buffer.write(to_write)
        except AttributeError:
            # string: encode unicode if possible, then buffer chunk by chunk
            if not isinstance(data, basestring):
                raise TypeError("can only write strings or file-like objects")

            if isinstance(data, unicode):
                try:
                    data = data.encode(self.encoding)
                except AttributeError:
                    raise TypeError("must specify an encoding for file in "
                                    "order to write unicode")

            while data:
                space = self.chunk_size - self._buffer.tell()

                if len(data) <= space:
                    self._buffer.write(data)
                    break
                else:
                    self._buffer.write(data[:space])
                    self.__flush_buffer()
                    data = data[space:]
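
    # A sketch of the accepted ``data`` types (assumes ``f`` is an open
    # GridIn created with ``encoding="utf-8"``; names here are illustrative):
    #
    #   f.write("plain bytes")                   # str
    #   f.write(u"snowman: \u2603")              # unicode, encoded as utf-8
    #   f.write(open("/tmp/payload.bin", "rb"))  # any object with .read()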

    def writelines(self, sequence):
        """Write a sequence of strings to the file.

        Does not add separators.
        """
        for line in sequence:
            self.write(line)

    def __enter__(self):
        """Support for the context manager protocol.
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Support for the context manager protocol.

        Close the file and allow exceptions to propagate.
        """
        self.close()

        # propagate exceptions
        return False
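
    # Context-manager sketch (hypothetical root collection ``db.fs``): the
    # file is closed, and therefore flushed to the server, when the block
    # exits:
    #
    #   with GridIn(db.fs, filename="log.txt") as f:
    #       f.writelines(["line one\n", "line two\n"])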


class GridOut(object):
    """Class to read data out of GridFS.
    """
    def __init__(self, root_collection, file_id=None, file_document=None):
        """Read a file from GridFS

        Application developers should generally not need to
        instantiate this class directly - instead see the methods
        provided by :class:`~gridfs.GridFS`.

        Either `file_id` or `file_document` must be specified;
        `file_document` is given priority if present. Raises
        :class:`TypeError` if `root_collection` is not an instance of
        :class:`~pymongo.collection.Collection`.

        :Parameters:
          - `root_collection`: root collection to read from
          - `file_id`: value of ``"_id"`` for the file to read
          - `file_document`: file document from `root_collection.files`

        .. versionadded:: 1.9
           The `file_document` parameter.
        """
        if not isinstance(root_collection, Collection):
            raise TypeError("root_collection must be an "
                            "instance of Collection")

        self.__chunks = root_collection.chunks

        files = root_collection.files
        self._file = file_document or files.find_one({"_id": file_id})

        if not self._file:
            raise NoFile("no file in gridfs collection %r with _id %r" %
                         (files, file_id))

        self.__buffer = ""
        self.__position = 0
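
    # A minimal read sketch (assumes the same hypothetical ``db.fs`` root
    # collection used above and an ``_id`` saved from a previous GridIn):
    #
    #   out = GridOut(db.fs, file_id)
    #   print out.name, out.length, out.content_type
    #   contents = out.read()
    #
    # Passing ``file_document=db.fs.files.find_one(...)`` instead of an id
    # skips the extra ``find_one`` query.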

    _id = _create_property("_id", "The ``'_id'`` value for this file.", True)
    name = _create_property("filename", "Name of this file.", True)
    content_type = _create_property("contentType", "Mime-type for this file.",
                                    True)
    length = _create_property("length", "Length (in bytes) of this file.",
                              True)
    chunk_size = _create_property("chunkSize", "Chunk size for this file.",
                                  True)
    upload_date = _create_property("uploadDate",
                                   "Date that this file was first uploaded.",
                                   True)
    aliases = _create_property("aliases", "List of aliases for this file.",
                               True)
    metadata = _create_property("metadata", "Metadata attached to this file.",
                                True)
    md5 = _create_property("md5", "MD5 of the contents of this file "
                           "(generated on the server).", True)

    def __getattr__(self, name):
        if name in self._file:
            return self._file[name]
        raise AttributeError("GridOut object has no attribute '%s'" % name)

    def read(self, size=-1):
        """Read at most `size` bytes from the file (less if there
        isn't enough data).

        The bytes are returned as an instance of :class:`str`. If
        `size` is negative or omitted all data is read.

        :Parameters:
          - `size` (optional): the number of bytes to read
        """
        if size == 0:
            return ""

        remainder = int(self.length) - self.__position
        if size < 0 or size > remainder:
            size = remainder

        received = len(self.__buffer)
        chunk_number = (received + self.__position) / self.chunk_size
        chunks = []

        while received < size:
            chunk = self.__chunks.find_one({"files_id": self._id,
                                            "n": chunk_number})
            if not chunk:
                raise CorruptGridFile("no chunk #%d" % chunk_number)

            if received:
                chunk_data = chunk["data"]
            else:
                chunk_data = chunk["data"][self.__position % self.chunk_size:]

            received += len(chunk_data)
            chunks.append(chunk_data)
            chunk_number += 1

        data = "".join([self.__buffer] + chunks)
        self.__position += size
        to_return = data[:size]
        self.__buffer = data[size:]
        return to_return
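
    # Read sketch (``out`` is a GridOut as in the example above):
    #
    #   header = out.read(16)   # first 16 bytes
    #   rest = out.read()       # everything that remains
    #   out.read()              # "" - already at end of file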

    def readline(self, size=-1):
        """Read one line or up to `size` bytes from the file.

        :Parameters:
          - `size` (optional): the maximum number of bytes to read

        .. versionadded:: 1.9
        """
        bytes = ""
        while len(bytes) != size:
            byte = self.read(1)
            bytes += byte
            if byte == "" or byte == "\n":
                break
        return bytes

    def tell(self):
        """Return the current position of this file.
        """
        return self.__position

    def seek(self, pos, whence=_SEEK_SET):
        """Set the current position of this file.

        :Parameters:
          - `pos`: the position (or offset if using relative
            positioning) to seek to
          - `whence` (optional): where to seek
            from. :attr:`os.SEEK_SET` (``0``) for absolute file
            positioning, :attr:`os.SEEK_CUR` (``1``) to seek relative
            to the current position, :attr:`os.SEEK_END` (``2``) to
            seek relative to the file's end.
        """
        if whence == _SEEK_SET:
            new_pos = pos
        elif whence == _SEEK_CUR:
            new_pos = self.__position + pos
        elif whence == _SEEK_END:
            new_pos = int(self.length) + pos
        else:
            raise IOError(22, "Invalid value for `whence`")

        if new_pos < 0:
            raise IOError(22, "Invalid value for `pos` - must be non-negative")

        self.__position = new_pos
        self.__buffer = ""
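
    # Positioning sketch (``out`` is a GridOut instance; the os.SEEK_* values
    # match the module-level _SEEK_* fallbacks):
    #
    #   out.seek(0)                  # rewind to the start
    #   out.seek(-10, os.SEEK_END)   # position 10 bytes before the end
    #   out.tell()                   # current byte offset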

    def __iter__(self):
        """Return an iterator over all of this file's data.

        The iterator will return chunk-sized instances of
        :class:`str`. This can be useful when serving files using a
        webserver that handles such an iterator efficiently.
        """
        return GridOutIterator(self, self.__chunks)
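
    # Streaming sketch: a WSGI-style handler (hypothetical) can hand the
    # iterator straight to the server, one chunk-sized str per iteration:
    #
    #   def application(environ, start_response):
    #       out = GridOut(db.fs, file_id)
    #       start_response("200 OK", [("Content-Length", str(out.length))])
    #       return iter(out)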


class GridOutIterator(object):
    def __init__(self, grid_out, chunks):
        self.__id = grid_out._id
        self.__chunks = chunks
        self.__current_chunk = 0
        self.__max_chunk = math.ceil(float(grid_out.length) /
                                     grid_out.chunk_size)

    def __iter__(self):
        return self

    def next(self):
        if self.__current_chunk >= self.__max_chunk:
            raise StopIteration
        chunk = self.__chunks.find_one({"files_id": self.__id,
                                        "n": self.__current_chunk})
        if not chunk:
            raise CorruptGridFile("no chunk #%d" % self.__current_chunk)
        self.__current_chunk += 1
        return str(chunk["data"])


class GridFile(object):
    """No longer supported.

    .. versionchanged:: 1.6
       The GridFile class is no longer supported.
    """
    def __init__(self, *args, **kwargs):
        raise UnsupportedAPI("The GridFile class is no longer supported. "
                             "Please use GridIn or GridOut instead.")