Context Navigation

Back to Ticket #2070

Ticket #2070: 5070-streaming-file-upload.diff

File 5070-streaming-file-upload.diff, 26.7 KB (added by Øyvind Saltvik <oyvind@…>, 18 years ago)
Updated to trunk, without changes

django/http/__init__.py

 import os
+import os, pickle
 from Cookie import SimpleCookie
 from pprint import pformat
 from urllib import urlencode, quote
 from django.utils.datastructures import MultiValueDict
+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
 RESERVED_CHARS="!*'();:@&=+$,/?%#[]"
 try:
 …
     def is_secure(self):
         return os.environ.get("HTTPS") == "on"
+def parse_file_upload(header_dict, post_data):
+    "Returns a tuple of (POST MultiValueDict, FILES MultiValueDict)"
+    import email, email.Message
+    from cgi import parse_header
+    raw_message = '\r\n'.join(['%s:%s' % pair for pair in header_dict.items()])
+    raw_message += '\r\n\r\n' + post_data
+    msg = email.message_from_string(raw_message)
+    POST = MultiValueDict()
+    FILES = MultiValueDict()
+    for submessage in msg.get_payload():
+        if submessage and isinstance(submessage, email.Message.Message):
+            name_dict = parse_header(submessage['Content-Disposition'])[1]
+            # name_dict is something like {'name': 'file', 'filename': 'test.txt'} for file uploads
+            # or {'name': 'blah'} for POST fields
+            # We assume all uploaded files have a 'filename' set.
+            if name_dict.has_key('filename'):
+                assert type([]) != type(submessage.get_payload()), "Nested MIME messages are not supported"
+                if not name_dict['filename'].strip():
+                    continue
+                # IE submits the full path, so trim everything but the basename.
+                # (We can't use os.path.basename because it expects Linux paths.)
+                filename = name_dict['filename'][name_dict['filename'].rfind("\\")+1:]
+                FILES.appendlist(name_dict['name'], {
+                    'filename': filename,
+                    'content-type': (submessage.has_key('Content-Type') and submessage['Content-Type'] or None),
+                    'content': submessage.get_payload(),
+                })
+def parse_file_upload(headers, input):
+    # Parse a multipart request read from `input` with MultiPartParser and
+    # return the parser's (POST, FILES) result. A MultiPartParserError is not
+    # propagated; its message is returned under the '_file_upload_error' POST
+    # key, paired with an empty plain dict in place of FILES.
+    from django.conf import settings
+    # Only stream files to disk if FILE_UPLOAD_DIR is set
+    file_upload_dir = getattr(settings, 'FILE_UPLOAD_DIR', None)
+    # Falls back to 100000 bytes when FILE_UPLOAD_MIN_SIZE is not configured.
+    file_upload_min_size = getattr(settings, 'FILE_UPLOAD_MIN_SIZE', 100000)
+    try:
+        parser = MultiPartParser(headers, input, file_upload_dir, file_upload_min_size)
+        return parser.parse()
+    except MultiPartParserError, e:
+        return MultiValueDict({ '_file_upload_error': [e.message] }), {}
+class MultiPartParserError(Exception):
+    # Error type raised by MultiPartParser; carries a human-readable message
+    # in the `message` attribute.
+    def __init__(self, message):
+        self.message = message
+    def __str__(self):
+        # repr() is used, so the string form of the error includes quotes.
+        return repr(self.message)
+class MultiPartParser(object):
+    """
+    An RFC 2388 multipart/form-data parser.
+    parse() reads the input stream in chunk_size chunks and returns a
+    tuple of (POST MultiValueDict, FILES MultiValueDict). If
+    file_upload_dir is defined files will be streamed to temporary
+    files in the specified directory.
+    The FILES dictionary will have 'filename', 'content-type',
+    'content' and 'content-length' entries. For streamed files it will
+    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
+    only be read from disk when referenced for streamed files.
+    If the header X-Progress-ID is sent with a 32 character alphanumeric
+    string, a temporary file with the same name will be created in
+    ``file_upload_dir`` with a pickled { 'received', 'size' }
+    dictionary with the number of bytes received and the size expected
+    respectively. The file will be unlinked when the parser finishes.
+    """
+    def __init__(self, headers, input, file_upload_dir=None, file_upload_min_size=None, chunk_size=1024*64):
+        try:
+            content_length = int(headers['Content-Length'])
+        except:
+            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))
+        content_type = headers.get('Content-Type')
+        if not content_type or not content_type.startswith('multipart/'):
+            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)
+        ctype, opts = self.parse_header(content_type)
+        boundary = opts.get('boundary')
+        from cgi import valid_boundary
+        if not boundary or not valid_boundary(boundary):
+            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)
+        # check if we got a valid X-Progress-ID id
+        progress_id = headers.get('X-Progress-ID')
+        if file_upload_dir and progress_id:
+            import re
+            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
+                self._progress_filename = os.path.join(file_upload_dir, progress_id)
             else:
+                POST.appendlist(name_dict['name'], submessage.get_payload())
+    return POST, FILES
+                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
+        else:
+            self._progress_filename = None
+        self._boundary = '--' + boundary
+        self._input = input
+        self._size = content_length
+        self._received = 0
+        self._file_upload_dir = file_upload_dir
+        self._chunk_size = chunk_size
+        self._state = 'PREAMBLE'
+        self._partial = ''
+        self._post = MultiValueDict()
+        self._files = MultiValueDict()
+        if file_upload_min_size is not None and content_length < file_upload_min_size:
+            self._file_upload_dir = None # disable file streaming for small request
+        try:
+            # use mx fast string search if available
+            from mx.TextTools import FS
+            self._fs = FS(self._boundary)
+        except ImportError:
+            self._fs = None
+    def parse(self):
+        # Run the parser over the whole input and return the
+        # (self._post, self._files) pair. Exceptions from _parse() propagate.
+        try:
+            self._parse()
+        finally:
+            # Always remove the progress file, whether parsing succeeded or
+            # not; a failure to unlink it is deliberately ignored.
+            if self._progress_filename:
+                try:
+                    os.unlink(self._progress_filename)
+                except OSError:
+                    pass
+        return self._post, self._files
+    def _parse(self):
+        # Feed the input to _read() in pieces of at most self._chunk_size
+        # bytes until self._size bytes have been consumed or _read() reports
+        # that no more data is available.
+        size = self._size
+        try:
+            while size > 0:
+                n = self._read(self._input, min(self._chunk_size, size))
+                if not n:
+                    break
+                size -= n
+        except:
+            # consume any remaining data so we don't generate a "Connection Reset" error
+            # NOTE(review): if read() returns an empty string before `size`
+            # reaches 0, `size` never decreases and this loop does not end.
+            size = self._size - self._received
+            while size > 0:
+                data = self._input.read(min(self._chunk_size, size))
+                size -= len(data)
+            # Re-raise the original exception once the input is drained.
+            raise
+    def _find_boundary(self, data, start, stop):
+        """
+        Find the next boundary and return the end of current part
+        and start of next part.
+
+        Searches data[start:stop] for self._boundary and returns a
+        (found, end, next) tuple. When no boundary is found the result
+        is (False, stop, stop).
+        """
+        # Use the self._fs searcher when one is set, else plain str.find().
+        if self._fs:
+            boundary = self._fs.find(data, start, stop)
+        else:
+            boundary = data.find(self._boundary, start, stop)
+        if boundary >= 0:
+            end = boundary
+            next = boundary + len(self._boundary)
+            # backup over CRLF
+            if end > 0 and data[end-1] == '\n': end -= 1
+            if end > 0 and data[end-1] == '\r': end -= 1
+            # skip over --CRLF
+            # Each character is optional, so both a separating boundary
+            # (followed by CRLF) and a closing one (followed by "--") are
+            # stepped over.
+            if next < stop and data[next] == '-': next += 1
+            if next < stop and data[next] == '-': next += 1
+            if next < stop and data[next] == '\r': next += 1
+            if next < stop and data[next] == '\n': next += 1
+            return True, end, next
+        else:
+            return False, stop, stop
+    class TemporaryFile(object):
+        "A temporary file that tries to delete itself when garbage collected."
+        def __init__(self, dir):
+            # Create a uniquely named '*.upload' file inside `dir` and wrap
+            # its descriptor in a binary read/write file object.
+            import tempfile
+            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
+            self.file = os.fdopen(fd, 'w+b')
+            self.name = name
+        def __getattr__(self, name):
+            # Delegate unknown attributes to the wrapped file. The file is
+            # looked up through __dict__ so that a missing 'file' entry
+            # cannot re-enter __getattr__.
+            a = getattr(self.__dict__['file'], name)
+            # Cache everything except ints on the wrapper so later lookups
+            # bypass __getattr__; ints are fetched from the file every time
+            # (presumably because such values can change -- TODO confirm).
+            if type(a) != type(0):
+                setattr(self, name, a)
+            return a
+        def __del__(self):
+            # Best-effort removal; the file may already have been moved or
+            # deleted, so OSError is ignored.
+            try:
+                os.unlink(self.name)
+            except OSError:
+                pass
+    class LazyContent(dict):
+        """
+        A lazy FILES dictionary entry that reads the contents from
+        tmpfile only when referenced.
+        """
+        def __init__(self, data):
+            dict.__init__(self, data)
+        def __getitem__(self, key):
+            # On the first lookup of 'content', rewind the 'tmpfile' entry,
+            # read it completely and store the result under 'content'; later
+            # lookups are served from the stored value.
+            if key == 'content' and not self.has_key(key):
+                self['tmpfile'].seek(0)
+                self['content'] = self['tmpfile'].read()
+            return dict.__getitem__(self, key)
+    def _read(self, input, size):
+        # Read up to `size` bytes from `input` and advance the parser state
+        # machine (PREAMBLE -> HEADER -> FIELD/FILE -> HEADER ...) over them.
+        # Data that cannot be processed yet is kept in self._partial and
+        # prepended to the next chunk. Returns the number of bytes read from
+        # `input` by this call, or 0 when no data was available.
+        data = input.read(size)
+        if not data:
+            return 0
+        read_size = len(data)
+        self._received += read_size
+        if self._partial:
+            data = self._partial + data
+        start = 0
+        stop = len(data)
+        while start < stop:
+            boundary, end, next = self._find_boundary(data, start, stop)
+            if not boundary and read_size:
+                # make sure we don't treat a partial boundary (and its separators) as data
+                # NOTE(review): the extra 16 bytes look like a safety margin
+                # for the CR/LF/dash separators -- confirm the intended size.
+                stop -= len(self._boundary) + 16
+                end = next = stop
+                if end <= start:
+                    break # need more data
+            if self._state == 'PREAMBLE':
+                # Preamble, just ignore it
+                self._state = 'HEADER'
+            elif self._state == 'HEADER':
+                # Beginning of header, look for end of header and parse it if found.
+                header_end = data.find('\r\n\r\n', start, stop)
+                if header_end == -1:
+                    break # need more data
+                header = data[start:header_end]
+                self._fieldname = None
+                self._filename = None
+                self._content_type = None
+                for line in header.split('\r\n'):
+                    ctype, opts = self.parse_header(line)
+                    # NOTE(review): these comparisons use lowercase text, so
+                    # they rely on parse_header() returning a lowercased
+                    # key -- confirm for the targeted Python version.
+                    if ctype == 'content-disposition: form-data':
+                        self._fieldname = opts.get('name')
+                        self._filename = opts.get('filename')
+                    elif ctype.startswith('content-type: '):
+                        # 14 == len('content-type: '), i.e. keep the value only.
+                        self._content_type = ctype[14:]
+                if self._filename is not None:
+                    # cleanup filename from IE full paths:
+                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()
+                    if self._filename: # ignore files without filenames
+                        if self._file_upload_dir:
+                            try:
+                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
+                            except:
+                                raise MultiPartParserError("Failed to create temporary file.")
+                        else:
+                            self._file = StringIO()
+                    else:
+                        # No filename: the part is still consumed in FILE
+                        # state below, but its data is not stored.
+                        self._file = None
+                    self._filesize = 0
+                    self._state = 'FILE'
+                else:
+                    self._field = StringIO()
+                    self._state = 'FIELD'
+                # Continue right after the blank line that ends the headers.
+                next = header_end + 4
+            elif self._state == 'FIELD':
+                # In a field, collect data until a boundary is found.
+                self._field.write(data[start:end])
+                if boundary:
+                    if self._fieldname:
+                        self._post.appendlist(self._fieldname, self._field.getvalue())
+                    self._field.close()
+                    self._state = 'HEADER'
+            elif self._state == 'FILE':
+                # In a file, collect data until a boundary is found.
+                if self._file:
+                    try:
+                        self._file.write(data[start:end])
+                    except IOError, e:
+                        raise MultiPartParserError("Failed to write to temporary file.")
+                    self._filesize += end-start
+                    if self._progress_filename:
+                        # Rewrite the progress file with the current byte counts.
+                        # NOTE(review): self._progress_filename is already
+                        # joined with file_upload_dir in __init__, and
+                        # self._file_upload_dir may have been reset to None
+                        # there for small requests -- verify this join.
+                        f = open(os.path.join(self._file_upload_dir, self._progress_filename), 'w')
+                        pickle.dump({ 'received': self._received, 'size': self._size }, f)
+                        f.close()
+                if boundary:
+                    if self._file:
+                        if self._file_upload_dir:
+                            # Streamed upload: hand out the temporary file
+                            # and let 'content' be read on demand.
+                            self._file.seek(0)
+                            file = self.LazyContent({
+                                'filename': self._filename,
+                                'content-type':  self._content_type,
+                                # 'content': is read on demand
+                                'content-length': self._filesize,
+                                'tmpfilename': self._file.name,
+                                'tmpfile': self._file
+                            })
+                        else:
+                            # In-memory upload: copy the buffer out before
+                            # closing it.
+                            file = {
+                                'filename': self._filename,
+                                'content-type':  self._content_type,
+                                'content': self._file.getvalue(),
+                                'content-length': self._filesize
+                            }
+                            self._file.close()
+                        self._files.appendlist(self._fieldname, file)
+                    self._state = 'HEADER'
+            start = next
+        # Keep whatever was not consumed for the next call.
+        self._partial = data[start:]
+        return read_size
+    def parse_header(self, line):
+        # Thin wrapper that passes `line` to cgi.parse_header() and returns
+        # its result unchanged.
+        from cgi import parse_header
+        return parse_header(line)
 class QueryDict(MultiValueDict):
     """A specialized MultiValueDict that takes a query string when initialized.
     This is immutable unless you create a copy of it."""
 …
     if not host:
         host = request.META.get('HTTP_HOST', '')
     return host

django/db/models/base.py

     def _get_FIELD_size(self, field):
         return os.path.getsize(self._get_FIELD_filename(field))
     def _save_FIELD_file(self, field, filename, raw_contents, save=True):
+    def _save_FIELD_file(self, field, filename, raw_field):
         directory = field.get_directory_name()
         try: # Create the date-based directory if it doesn't exist.
             os.makedirs(os.path.join(settings.MEDIA_ROOT, directory))
 …
         setattr(self, field.attname, filename)
         full_filename = self._get_FIELD_filename(field)
+        fp = open(full_filename, 'wb')
+        fp.write(raw_contents)
+        fp.close()
+        if raw_field.has_key('tmpfilename'):
+            raw_field['tmpfile'].close()
+            os.rename(raw_field['tmpfilename'], full_filename)
+        else:
+            fp = open(full_filename, 'wb')
+            fp.write(raw_field['content'])
+            fp.close()
         # Save the width and/or height, if applicable.
         if isinstance(field, ImageField) and (field.width_field or field.height_field):

django/db/models/fields/__init__.py

         setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self))
         setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self))
         setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self))
         setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_contents, save=True: instance._save_FIELD_file(self, filename, raw_contents, save))
+        setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field: instance._save_FIELD_file(self, filename, raw_field))
         dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls)
     def delete_file(self, instance):
 …
         if new_data.get(upload_field_name, False):
             func = getattr(new_object, 'save_%s_file' % self.name)
             if rel:
                 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0]["content"], save)
+                func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0])
             else:
                 func(new_data[upload_field_name]["filename"], new_data[upload_field_name]["content"], save)
+                func(new_data[upload_field_name]["filename"], new_data[upload_field_name])
     def get_directory_name(self):
         return os.path.normpath(datetime.datetime.now().strftime(self.upload_to))

django/oldforms/__init__.py

         self.validator_list = [self.isNonEmptyFile] + validator_list
     def isNonEmptyFile(self, field_data, all_data):
         try:
             content = field_data['content']
         except TypeError:
+        if field_data.has_key('_file_upload_error'):
+            raise validators.CriticalValidationError, field_data['_file_upload_error']
+        if not field_data.has_key('filename'):
             raise validators.CriticalValidationError, gettext("No file was submitted. Check the encoding type on the form.")
         if not content:
+        if not field_data['content-length']:
             raise validators.CriticalValidationError, gettext("The submitted file is empty.")
     def render(self, data):
         return '<input type="file" id="%s" class="v%s" name="%s" />' % \
             (self.get_id(), self.__class__.__name__, self.field_name)
+    def prepare(self, new_data):
+        # If new_data carries a '_file_upload_error' entry, copy it into
+        # this field's own slot; otherwise new_data is left untouched.
+        if new_data.has_key('_file_upload_error'):
+            # pretend we got something in the field to raise a validation error later
+            new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }
     def html2python(data):
         if data is None:
             raise EmptyValue

django/core/handlers/wsgi.py

             if self.environ.get('CONTENT_TYPE', '').startswith('multipart'):
                 header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')])
                 header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '')
+                self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data)
+                header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '')
+                header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '')
+                try:
+                    self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input'])
+                except:
+                    self._post, self._files = {}, {} # make sure we dont read the input stream again
+                    raise
+                self._raw_post_data = None # raw data is not available for streamed multipart messages
             else:
                 self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
         else:

django/core/handlers/modpython.py

     def _load_post_and_files(self):
         "Populates self._post and self._files"
         if self._req.headers_in.has_key('content-type') and self._req.headers_in['content-type'].startswith('multipart'):
+            self._post, self._files = http.parse_file_upload(self._req.headers_in, self.raw_post_data)
+            self._raw_post_data = None # raw data is not available for streamed multipart messages
+            try:
+                self._post, self._files = http.parse_file_upload(self._req.headers_in, self._req)
+            except:
+                self._post, self._files = {}, {} # make sure we dont read the input stream again
+                raise
         else:
             self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()

tests/modeltests/test_client/views.py

     return HttpResponse(t.render(c))
+def post_file_view(request):
+    "A view that expects a multipart post and returns a file in the context"
+    t = Template('File {{ file.filename }} received', name='POST Template')
+    # NOTE(review): the upload is read from the 'file_file' key while the
+    # accompanying test posts it as 'file' -- presumably the test client
+    # renames the field; confirm.
+    c = Context({'file': request.FILES['file_file']})
+    return HttpResponse(t.render(c))
 def redirect_view(request):
     "A view that redirects all requests to the GET view"
     return HttpResponseRedirect('/test_client/get_view/')

tests/modeltests/test_client/models.py

         self.assertEqual(response.template.name, "Book template")
         self.assertEqual(response.content, "Blink - Malcolm Gladwell")
+    def test_post_file_view(self):
+        "POST this python file to a view"
+        import os, tempfile
+        from django.conf import settings
+        # __file__ may point at the compiled module; upload the source file.
+        file = __file__.replace('.pyc', '.py')
+        # Run once with FILE_UPLOAD_DIR unset and once with it set to the
+        # system temporary directory.
+        # NOTE(review): settings.FILE_UPLOAD_DIR is not restored afterwards
+        # and the handle from open(file) is never closed explicitly.
+        for upload_dir in [None, tempfile.gettempdir()]:
+            settings.FILE_UPLOAD_DIR = upload_dir
+            post_data = { 'name': file, 'file': open(file) }
+            response = self.client.post('/test_client/post_file_view/', post_data)
+            self.failUnless('models.py' in response.context['file']['filename'])
+            self.failUnless(len(response.context['file']['content']) == os.path.getsize(file))
+            if upload_dir:
+                # 'tmpfilename' is only checked when an upload dir is set.
+                self.failUnless(response.context['file']['tmpfilename'])
     def test_redirect(self):
         "GET a URL that redirects elsewhere"
         response = self.client.get('/test_client/redirect_view/')

tests/modeltests/test_client/urls.py

     (r'^get_view/$', views.get_view),
     (r'^post_view/$', views.post_view),
     (r'^raw_post_view/$', views.raw_post_view),
+    (r'^post_file_view/$', views.post_file_view),
     (r'^redirect_view/$', views.redirect_view),
     (r'^form_view/$', views.form_view),
     (r'^login_protected_view/$', views.login_protected_view),

docs/request_response.txt

 ``FILES``
     A dictionary-like object containing all uploaded files. Each key in
     ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each
     value in ``FILES`` is a standard Python dictionary with the following three
+    value in ``FILES`` is a standard Python dictionary with the following four
     keys:
         * ``filename`` -- The name of the uploaded file, as a Python string.
         * ``content-type`` -- The content type of the uploaded file.
         * ``content`` -- The raw content of the uploaded file.
+        * ``content-length`` -- The length of the content in bytes.
+    If streaming file uploads are enabled, two additional keys
+    describing the uploaded file will be present:
+        * ``tmpfilename`` -- The filename for the temporary file.
+        * ``tmpfile`` -- An open file object for the temporary file.
+    The temporary file will be removed when the request finishes.
+    Note that accessing ``content`` when streaming uploads are enabled
+    will read the whole file into memory, which may not be what you want.
     Note that ``FILES`` will only contain data if the request method was POST
     and the ``<form>`` that posted to the request had
     ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank

docs/settings.txt

 .. _Testing Django Applications: ../testing/
+FILE_UPLOAD_DIR
+---------------
+Default: Not defined
+Path to a directory where temporary files should be written during
+file uploads. If this is left unset, uploaded files are read into memory.
+FILE_UPLOAD_MIN_SIZE
+--------------------
+Default: 100000
+An integer specifying the minimum number of bytes that have to be
+received for file upload streaming to take place. Any request smaller
+than this will be handled in memory. Note: ``FILE_UPLOAD_DIR`` has to
+be defined to enable streaming.
 IGNORABLE_404_ENDS
 ------------------

docs/forms.txt

    new_data = request.POST.copy()
    new_data.update(request.FILES)
+Streaming file uploads
+----------------------
+File uploads will be read into memory by default. This works fine for
+small to medium-sized uploads (from 1MB to 100MB depending on your
+setup and usage). If you want to support larger uploads, you can enable
+upload streaming, where only a small part of the file will be in memory
+at any time. To do this, you need to specify the ``FILE_UPLOAD_DIR``
+setting (see the settings_ document for more details).
+See `request object`_ for more details about ``request.FILES`` objects
+with streaming file uploads enabled.
 Validators
 ==========
 …
     At validation time, the XML fragment is validated against the schema using
     the executable specified in the ``JING_PATH`` setting (see the settings_
     document for more details).
+.. _request object: ../request_response/#httprequest-objects
 .. _`generic views`: ../generic_views/
 .. _`models API`: ../model-api/

Download in other formats:

Original Format

Issues

Context Navigation

Ticket #2070: 5070-streaming-file-upload.diff

django/http/__init__.py

django/db/models/base.py

django/db/models/fields/__init__.py

django/oldforms/__init__.py

django/core/handlers/wsgi.py

django/core/handlers/modpython.py

tests/modeltests/test_client/views.py

tests/modeltests/test_client/models.py

tests/modeltests/test_client/urls.py

docs/request_response.txt

docs/settings.txt

docs/forms.txt

Download in other formats:

Django Links

Learn More

Get Involved

Get Help

Follow Us

Support Us