Ticket #2070: 5089-streaming_file_upload_with_safe_file_move.2.diff
File 5089-streaming_file_upload_with_safe_file_move.2.diff, 29.1 KB (added by , 18 years ago) |
---|
-
django/http/__init__.py
import os, pickle
from Cookie import SimpleCookie
from pprint import pformat
from urllib import urlencode, quote
from django.utils.datastructures import MultiValueDict

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

RESERVED_CHARS="!*'();:@&=+$,/?%#[]"

# ... (unchanged lines of django/http/__init__.py elided by the diff; the
#      context ends with the tail of a class outside this view:
#          def is_secure(self):
#              return os.environ.get("HTTPS") == "on")

def parse_file_upload(headers, input):
    """
    Parses a multipart request body read from the file-like object `input`.

    Returns a tuple of (POST MultiValueDict, FILES MultiValueDict). If the
    parser raises MultiPartParserError, the tuple is instead a POST
    MultiValueDict holding the message under '_file_upload_error' and an
    empty dictionary.
    """
    import sys
    from django.conf import settings

    # Files are only streamed to disk if FILE_UPLOAD_DIR is set
    file_upload_dir = settings.FILE_UPLOAD_DIR
    streaming_min_post_size = settings.STREAMING_MIN_POST_SIZE

    try:
        parser = MultiPartParser(headers, input, file_upload_dir, streaming_min_post_size)
        return parser.parse()
    except MultiPartParserError:
        # sys.exc_info() is used to fetch the exception because it is valid
        # syntax on every Python version.
        e = sys.exc_info()[1]
        return MultiValueDict({ '_file_upload_error': [e.message] }), {}

class MultiPartParserError(Exception):
    "Raised when a multipart request cannot be parsed; see `message`."
    def __init__(self, message):
        Exception.__init__(self, message)
        self.message = message
    def __str__(self):
        return repr(self.message)

class MultiPartParser(object):
    """
    A rfc2388 multipart/form-data parser.

    parse() reads the input stream in chunk_size chunks and returns a
    tuple of (POST MultiValueDict, FILES MultiValueDict). If
    file_upload_dir is defined files will be streamed to temporary
    files in the specified directory.

    The FILES dictionary will have 'filename', 'content-type',
    'content' and 'content-length' entries. For streamed files it will
    also have 'tmpfilename' and 'tmpfile'. The 'content' entry will
    only be read from disk when referenced for streamed files.

    If the header X-Progress-ID is sent with a string of 32
    alphanumeric characters, a temporary file with the same name will
    be created in ``file_upload_dir`` with a pickled
    { 'received', 'size' } dictionary with the number of bytes received
    and the size expected respectively. The file will be unlinked when
    the parser finishes.
    """

    def __init__(self, headers, input, file_upload_dir=None, streaming_min_post_size=None, chunk_size=1024*64):
        """
        Validates the request headers and sets up the parser state.

        Raises MultiPartParserError for a missing or non-integer
        Content-Length, a Content-Type that is not multipart/*, an invalid
        boundary or (when file_upload_dir is set) an invalid X-Progress-ID.
        """
        try:
            content_length = int(headers['Content-Length'])
        except (KeyError, ValueError, TypeError):
            raise MultiPartParserError('Invalid Content-Length: %s' % headers.get('Content-Length'))

        content_type = headers.get('Content-Type')

        if not content_type or not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        ctype, opts = self.parse_header(content_type)
        boundary = opts.get('boundary')
        from cgi import valid_boundary
        if not boundary or not valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart form: %s' % boundary)

        # check if we got a valid X-Progress-ID id
        progress_id = headers.get('X-Progress-ID')
        if file_upload_dir and progress_id:
            import re
            # The id becomes a file name inside file_upload_dir, so only plain
            # alphanumerics are accepted (no path separators or dots).
            if re.match(r'^[0-9a-zA-Z]{32}$', progress_id):
                self._progress_filename = os.path.join(file_upload_dir, progress_id)
            else:
                raise MultiPartParserError('Invalid X-Progress-ID: %s' % progress_id)
        else:
            self._progress_filename = None
        self._boundary = '--' + boundary
        self._input = input
        self._size = content_length
        self._received = 0
        self._file_upload_dir = file_upload_dir
        self._chunk_size = chunk_size
        self._state = 'PREAMBLE'
        self._partial = ''
        self._post = MultiValueDict()
        self._files = MultiValueDict()

        if streaming_min_post_size is not None and content_length < streaming_min_post_size:
            self._file_upload_dir = None # disable file streaming for small request

        try:
            # use mx fast string search if available
            from mx.TextTools import FS
            self._fs = FS(self._boundary)
        except ImportError:
            self._fs = None

    def parse(self):
        """
        Parses the whole input stream and returns the tuple
        (POST MultiValueDict, FILES MultiValueDict).

        The progress file, if any, is removed whether or not parsing succeeds.
        """
        try:
            self._parse()
        finally:
            if self._progress_filename:
                try:
                    os.unlink(self._progress_filename)
                except OSError:
                    pass

        return self._post, self._files

    def _parse(self):
        "Feeds the input to _read() chunk by chunk until Content-Length bytes are consumed."
        size = self._size

        try:
            while size > 0:
                n = self._read(self._input, min(self._chunk_size, size))
                if not n:
                    break
                size -= n
        except:
            # consume any remaining data so we dont generate a "Connection Reset" error
            size = self._size - self._received
            while size > 0:
                data = self._input.read(min(self._chunk_size, size))
                if not data:
                    # The stream ended before Content-Length bytes arrived;
                    # without this check the loop would never terminate.
                    break
                size -= len(data)
            raise

    def _find_boundary(self, data, start, stop):
        """
        Find the next boundary and return the end of current part
        and start of next part.

        Returns (True, end, next) when a boundary lies in data[start:stop],
        otherwise (False, stop, stop).
        """
        if self._fs:
            boundary = self._fs.find(data, start, stop)
        else:
            boundary = data.find(self._boundary, start, stop)
        if boundary >= 0:
            end = boundary
            next_start = boundary + len(self._boundary)

            # backup over CRLF
            if end > 0 and data[end-1] == '\n': end -= 1
            if end > 0 and data[end-1] == '\r': end -= 1
            # skip over --CRLF
            if next_start < stop and data[next_start] == '-': next_start += 1
            if next_start < stop and data[next_start] == '-': next_start += 1
            if next_start < stop and data[next_start] == '\r': next_start += 1
            if next_start < stop and data[next_start] == '\n': next_start += 1

            return True, end, next_start
        else:
            return False, stop, stop

    class TemporaryFile(object):
        "A temporary file that tries to delete itself when garbage collected."
        def __init__(self, dir):
            import tempfile
            (fd, name) = tempfile.mkstemp(suffix='.upload', dir=dir)
            self.file = os.fdopen(fd, 'w+b')
            self.name = name

        def __getattr__(self, name):
            # Delegate everything else to the wrapped file object. Non-integer
            # attributes are cached on the instance so later lookups skip
            # __getattr__.
            a = getattr(self.__dict__['file'], name)
            if type(a) != type(0):
                setattr(self, name, a)
            return a

        def __del__(self):
            # The unlink fails harmlessly if the file was already moved away.
            try:
                os.unlink(self.name)
            except OSError:
                pass

    class LazyContent(dict):
        """
        A lazy FILES dictionary entry that reads the contents from
        tmpfile only when referenced.
        """
        def __init__(self, data):
            dict.__init__(self, data)

        def __getitem__(self, key):
            if key == 'content' and key not in self:
                self['tmpfile'].seek(0)
                self['content'] = self['tmpfile'].read()
            return dict.__getitem__(self, key)

    def _read(self, input, size):
        """
        Reads up to `size` bytes from `input` and advances the state machine
        (PREAMBLE -> HEADER -> FIELD or FILE -> HEADER ...) over them.

        Data that cannot be processed yet (an incomplete header or a possible
        partial boundary) is kept in self._partial for the next call. Returns
        the number of bytes read, 0 at end of input. Raises
        MultiPartParserError if a temporary file cannot be created or written.
        """
        data = input.read(size)

        if not data:
            return 0

        read_size = len(data)
        self._received += read_size

        if self._partial:
            data = self._partial + data

        start = 0
        stop = len(data)

        while start < stop:
            boundary, end, next_start = self._find_boundary(data, start, stop)

            if not boundary and read_size:
                # make sure we dont treat a partial boundary (and its separators) as data
                stop -= len(self._boundary) + 16
                end = next_start = stop
                if end <= start:
                    break # need more data

            if self._state == 'PREAMBLE':
                # Preamble, just ignore it
                self._state = 'HEADER'

            elif self._state == 'HEADER':
                # Beginning of header, look for end of header and parse it if found.

                header_end = data.find('\r\n\r\n', start, stop)
                if header_end == -1:
                    break # need more data

                header = data[start:header_end]

                self._fieldname = None
                self._filename = None
                self._content_type = None

                for line in header.split('\r\n'):
                    ctype, opts = self.parse_header(line)
                    if ctype == 'content-disposition: form-data':
                        self._fieldname = opts.get('name')
                        self._filename = opts.get('filename')
                    elif ctype.startswith('content-type: '):
                        self._content_type = ctype[14:]

                if self._filename is not None:
                    # cleanup filename from IE full paths:
                    self._filename = self._filename[self._filename.rfind("\\")+1:].strip()

                    if self._filename: # ignore files without filenames
                        if self._file_upload_dir:
                            try:
                                self._file = self.TemporaryFile(dir=self._file_upload_dir)
                            except EnvironmentError:
                                raise MultiPartParserError("Failed to create temporary file.")
                        else:
                            self._file = StringIO()
                    else:
                        self._file = None
                    self._filesize = 0
                    self._state = 'FILE'
                else:
                    self._field = StringIO()
                    self._state = 'FIELD'
                next_start = header_end + 4

            elif self._state == 'FIELD':
                # In a field, collect data until a boundary is found.

                self._field.write(data[start:end])
                if boundary:
                    if self._fieldname:
                        self._post.appendlist(self._fieldname, self._field.getvalue())
                    self._field.close()
                    self._state = 'HEADER'

            elif self._state == 'FILE':
                # In a file, collect data until a boundary is found.

                if self._file is not None:
                    try:
                        self._file.write(data[start:end])
                    except IOError:
                        raise MultiPartParserError("Failed to write to temporary file.")
                    self._filesize += end-start

                    if self._progress_filename:
                        # _progress_filename already includes the upload
                        # directory (see __init__), and _file_upload_dir may
                        # have been reset to None for small requests, so the
                        # path must not be joined with it again.
                        f = open(self._progress_filename, 'w')
                        try:
                            pickle.dump({ 'received': self._received, 'size': self._size }, f)
                        finally:
                            f.close()

                if boundary:
                    if self._file is not None:
                        if self._file_upload_dir:
                            self._file.seek(0)
                            file_info = self.LazyContent({
                                'filename': self._filename,
                                'content-type': self._content_type,
                                # 'content': is read on demand
                                'content-length': self._filesize,
                                'tmpfilename': self._file.name,
                                'tmpfile': self._file
                            })
                        else:
                            file_info = {
                                'filename': self._filename,
                                'content-type': self._content_type,
                                'content': self._file.getvalue(),
                                'content-length': self._filesize
                            }
                            self._file.close()

                        self._files.appendlist(self._fieldname, file_info)

                    self._state = 'HEADER'

            start = next_start

        self._partial = data[start:]

        return read_size

    def parse_header(self, line):
        "Splits a header line into (main value, dict of parameters) via cgi.parse_header."
        from cgi import parse_header
        return parse_header(line)


class QueryDict(MultiValueDict):
    """A specialized MultiValueDict that takes a query string when initialized.
    This is immutable unless you create a copy of it."""
    # ... (rest of QueryDict and further unchanged lines elided by the diff;
    #      the context ends with the tail of a function outside this view:
    #          if not host:
    #              host = request.META.get('HTTP_HOST', '')
    #          return host)
django/oldforms/__init__.py
# Methods of a form field class in django/oldforms/__init__.py whose class
# header lies outside the diff context.
# ... (preceding diff context, the tail of an earlier method:
#      self.validator_list = [self.isNonEmptyFile] + validator_list)

def isNonEmptyFile(self, field_data, all_data):
    """
    Validator: raises CriticalValidationError unless `field_data` describes
    a non-empty uploaded file.

    `field_data` is expected to be a FILES entry (a dictionary). Anything
    else, e.g. a plain string submitted by a form without the multipart
    encoding type, is reported as "no file submitted" rather than failing
    with an AttributeError.
    """
    if not isinstance(field_data, dict):
        raise validators.CriticalValidationError(gettext("No file was submitted. Check the encoding type on the form."))
    if '_file_upload_error' in field_data:
        raise validators.CriticalValidationError(field_data['_file_upload_error'])
    if 'filename' not in field_data:
        raise validators.CriticalValidationError(gettext("No file was submitted. Check the encoding type on the form."))
    # Entries built without 'content-length' (only 'content') are still
    # accepted: fall back to the length of the content itself.
    size = field_data.get('content-length')
    if size is None:
        size = len(field_data.get('content') or '')
    if not size:
        raise validators.CriticalValidationError(gettext("The submitted file is empty."))

def render(self, data):
    "Returns the HTML <input type=\"file\"> element for this field."
    return '<input type="file" id="%s" class="v%s" name="%s" />' % \
        (self.get_id(), self.__class__.__name__, self.field_name)

def prepare(self, new_data):
    "Copies a file upload error found in `new_data` into this field's entry."
    if '_file_upload_error' in new_data:
        # pretend we got something in the field to raise a validation error later
        new_data[self.field_name] = { '_file_upload_error': new_data['_file_upload_error'] }

def html2python(data):
    if data is None:
        raise EmptyValue
    # ... (remainder of html2python lies outside the diff context)
django/db/models/base.py
12 12 from django.dispatch import dispatcher 13 13 from django.utils.datastructures import SortedDict 14 14 from django.utils.functional import curry 15 from django.utils.file import file_move_safe 15 16 from django.conf import settings 16 17 from itertools import izip 17 18 import types 18 19 import sys 19 20 import os 20 21 22 21 23 class ModelBase(type): 22 24 "Metaclass for all models" 23 25 def __new__(cls, name, bases, attrs): … … 361 363 def _get_FIELD_size(self, field): 362 364 return os.path.getsize(self._get_FIELD_filename(field)) 363 365 364 def _save_FIELD_file(self, field, filename, raw_ contents, save=True):366 def _save_FIELD_file(self, field, filename, raw_field, save=True): 365 367 directory = field.get_directory_name() 366 368 try: # Create the date-based directory if it doesn't exist. 367 369 os.makedirs(os.path.join(settings.MEDIA_ROOT, directory)) … … 383 385 setattr(self, field.attname, filename) 384 386 385 387 full_filename = self._get_FIELD_filename(field) 386 fp = open(full_filename, 'wb') 387 fp.write(raw_contents) 388 fp.close() 388 if raw_field.has_key('tmpfilename'): 389 raw_field['tmpfile'].close() 390 file_move_safe(raw_field['tmpfilename'], full_filename) 391 else: 392 fp = open(full_filename, 'wb') 393 fp.write(raw_field['content']) 394 fp.close() 389 395 390 396 # Save the width and/or height, if applicable. 391 397 if isinstance(field, ImageField) and (field.width_field or field.height_field): -
django/db/models/fields/__init__.py
636 636 setattr(cls, 'get_%s_filename' % self.name, curry(cls._get_FIELD_filename, field=self)) 637 637 setattr(cls, 'get_%s_url' % self.name, curry(cls._get_FIELD_url, field=self)) 638 638 setattr(cls, 'get_%s_size' % self.name, curry(cls._get_FIELD_size, field=self)) 639 setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_ contents, save=True: instance._save_FIELD_file(self, filename, raw_contents, save))639 setattr(cls, 'save_%s_file' % self.name, lambda instance, filename, raw_field, save=True: instance._save_FIELD_file(self, filename, raw_field, save)) 640 640 dispatcher.connect(self.delete_file, signal=signals.post_delete, sender=cls) 641 641 642 642 def delete_file(self, instance): … … 659 659 if new_data.get(upload_field_name, False): 660 660 func = getattr(new_object, 'save_%s_file' % self.name) 661 661 if rel: 662 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0] ["content"], save)662 func(new_data[upload_field_name][0]["filename"], new_data[upload_field_name][0], save) 663 663 else: 664 func(new_data[upload_field_name]["filename"], new_data[upload_field_name] ["content"], save)664 func(new_data[upload_field_name]["filename"], new_data[upload_field_name], save) 665 665 666 666 def get_directory_name(self): 667 667 return os.path.normpath(datetime.datetime.now().strftime(self.upload_to)) -
django/conf/global_settings.py
240 240 # isExistingURL validator. 241 241 URL_VALIDATOR_USER_AGENT = "Django/0.96pre (http://www.djangoproject.com)" 242 242 243 # The directory to place streamed file uploads. The web server needs write 244 # permissions on this directory. 245 # If this is None, streaming uploads are disabled. 246 FILE_UPLOAD_DIR = None 247 248 249 # The minimum size of a POST before file uploads are streamed to disk. 250 # Any less than this number, and the file is uploaded to memory. 251 # Size is in bytes. 252 STREAMING_MIN_POST_SIZE = 512 * (2**10) 253 254 255 256 243 257 ############## 244 258 # MIDDLEWARE # 245 259 ############## … … 335 349 336 350 # The list of directories to search for fixtures 337 351 FIXTURE_DIRS = () 352 353 -
django/core/handlers/wsgi.py
111 111 if self.environ.get('CONTENT_TYPE', '').startswith('multipart'): 112 112 header_dict = dict([(k, v) for k, v in self.environ.items() if k.startswith('HTTP_')]) 113 113 header_dict['Content-Type'] = self.environ.get('CONTENT_TYPE', '') 114 self._post, self._files = http.parse_file_upload(header_dict, self.raw_post_data) 114 header_dict['Content-Length'] = self.environ.get('CONTENT_LENGTH', '') 115 header_dict['X-Progress-ID'] = self.environ.get('HTTP_X_PROGRESS_ID', '') 116 try: 117 self._post, self._files = http.parse_file_upload(header_dict, self.environ['wsgi.input']) 118 except: 119 self._post, self._files = {}, {} # make sure we dont read the input stream again 120 raise 121 self._raw_post_data = None # raw data is not available for streamed multipart messages 115 122 else: 116 123 self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict() 117 124 else: -
django/core/handlers/modpython.py
# Method of the request class in django/core/handlers/modpython.py; the class
# header lies outside the diff context.
def _load_post_and_files(self):
    "Fills in self._post and self._files from the request body"
    headers = self._req.headers_in
    is_multipart = headers.has_key('content-type') and headers['content-type'].startswith('multipart')

    if not is_multipart:
        self._post, self._files = http.QueryDict(self.raw_post_data), datastructures.MultiValueDict()
        return

    # The body is streamed straight from the request object, so no raw copy
    # of the POST data is kept for multipart messages.
    self._raw_post_data = None
    try:
        self._post, self._files = http.parse_file_upload(headers, self._req)
    except:
        # Leave both attributes set so the input stream is not read again,
        # then let the error propagate.
        self._post, self._files = {}, {}
        raise
django/utils/file.py
import os

try:
    import shutil
    file_move = shutil.move
except ImportError:
    file_move = os.rename

def file_move_safe(old_file_name, new_file_name, chunk_size = 1024*64):
    """
    Moves a file from one location to another in the safest way possible.

    First, it tries using shutil.move, which is OS-dependent but doesn't
    break with change of filesystems (os.rename, which does break across
    filesystems, is only used if shutil cannot be imported). If that fails,
    it streams the file manually from one file to another in python,
    reading `chunk_size` bytes at a time, and then removes the original.

    Errors raised by the manual copy (IOError/OSError) are propagated.
    """

    try:
        file_move(old_file_name, new_file_name)
        return
    except EnvironmentError:
        # Covers IOError, OSError and shutil.Error; fall back to a manual copy.
        pass

    # Open the source first so that a missing source does not leave an
    # empty (truncated) destination file behind.
    old_file = open(old_file_name, 'rb')
    try:
        new_file = open(new_file_name, 'wb')
        try:
            while True:
                current_chunk = old_file.read(chunk_size)
                if not current_chunk:
                    break
                new_file.write(current_chunk)
        finally:
            new_file.close()
    finally:
        old_file.close()

    os.remove(old_file_name)
tests/modeltests/test_client/views.py
# ... (preceding diff context, the tail of the previous view:
#      return HttpResponse(t.render(c)))

def post_file_view(request):
    "Renders the name of the file uploaded in a multipart POST as 'file_file'"
    template = Template('File {{ file.filename }} received', name='POST Template')
    uploaded = request.FILES['file_file']
    context = Context({'file': uploaded})
    rendered = template.render(context)
    return HttpResponse(rendered)

def redirect_view(request):
    "Sends every request on to the GET view"
    target = '/test_client/get_view/'
    return HttpResponseRedirect(target)
tests/modeltests/test_client/models.py
# Test methods of a test case class in tests/modeltests/test_client/models.py;
# the class header lies outside the diff context.
# ... (preceding diff context, the tail of the previous test method:
#      self.assertEqual(response.template.name, "Book template")
#      self.assertEqual(response.content, "Blink - Malcolm Gladwell"))

def test_post_file_view(self):
    "POST this python file to a view"
    import os, tempfile
    from django.conf import settings
    source_path = __file__.replace('.pyc', '.py')
    # Both settings are changed for the duration of the test and restored
    # afterwards so that other tests are not affected.
    original_upload_dir = settings.FILE_UPLOAD_DIR
    original_min_post_size = settings.STREAMING_MIN_POST_SIZE
    try:
        # This file is smaller than the default STREAMING_MIN_POST_SIZE, so
        # the threshold is dropped to make the streaming case actually stream.
        settings.STREAMING_MIN_POST_SIZE = 0
        for upload_dir in [None, tempfile.gettempdir()]:
            settings.FILE_UPLOAD_DIR = upload_dir
            # Binary mode keeps the uploaded byte count equal to
            # os.path.getsize() on every platform.
            upload = open(source_path, 'rb')
            try:
                post_data = { 'name': source_path, 'file': upload }
                response = self.client.post('/test_client/post_file_view/', post_data)
            finally:
                upload.close()
            self.failUnless('models.py' in response.context['file']['filename'])
            self.failUnless(len(response.context['file']['content']) == os.path.getsize(source_path))
            if upload_dir:
                self.failUnless(response.context['file']['tmpfilename'])
    finally:
        settings.FILE_UPLOAD_DIR = original_upload_dir
        settings.STREAMING_MIN_POST_SIZE = original_min_post_size


def test_redirect(self):
    "GET a URL that redirects elsewhere"
    response = self.client.get('/test_client/redirect_view/')
    # ... (remainder of test_redirect lies outside the diff context)
tests/modeltests/test_client/urls.py
4 4 urlpatterns = patterns('', 5 5 (r'^get_view/$', views.get_view), 6 6 (r'^post_view/$', views.post_view), 7 (r'^post_file_view/$', views.post_file_view), 7 8 (r'^raw_post_view/$', views.raw_post_view), 8 9 (r'^redirect_view/$', views.redirect_view), 9 10 (r'^form_view/$', views.form_view), -
docs/request_response.txt
72 72 ``FILES`` 73 73 A dictionary-like object containing all uploaded files. Each key in 74 74 ``FILES`` is the ``name`` from the ``<input type="file" name="" />``. Each 75 value in ``FILES`` is a standard Python dictionary with the following three75 value in ``FILES`` is a standard Python dictionary with the following four 76 76 keys: 77 77 78 78 * ``filename`` -- The name of the uploaded file, as a Python string. 79 79 * ``content-type`` -- The content type of the uploaded file. 80 80 * ``content`` -- The raw content of the uploaded file. 81 * ``content-length`` -- The length of the content in bytes. 81 82 83 If streaming file uploads are enabled, two additional keys 84 describing the uploaded file will be present: 85 86 * ``tmpfilename`` -- The filename for the temporary file. 87 * ``tmpfile`` -- An open file object for the temporary file. 88 89 The temporary file will be removed when the request finishes. 90 91 Note that accessing ``content`` when streaming uploads are enabled 92 will read the whole file into memory, which may not be what you want. 93 82 94 Note that ``FILES`` will only contain data if the request method was POST 83 95 and the ``<form>`` that posted to the request had 84 96 ``enctype="multipart/form-data"``. Otherwise, ``FILES`` will be a blank -
docs/settings.txt
437 437 438 438 .. _Testing Django Applications: ../testing/ 439 439 440 FILE_UPLOAD_DIR 441 --------------- 442 443 Default: ``None`` 444 445 Path to a directory where temporary files should be written during 446 file uploads. Leaving this as ``None`` will disable streaming file uploads, 447 and cause all uploaded files to be stored (temporarily) in memory. 448 440 449 IGNORABLE_404_ENDS 441 450 ------------------ 442 451 … … 774 783 775 784 .. _site framework docs: ../sites/ 776 785 786 STREAMING_MIN_POST_SIZE 787 ----------------------- 788 789 Default: 524288 (``512*1024``) 790 791 An integer specifying the minimum number of bytes that has to be 792 received (in a POST) for file upload streaming to take place. Any 793 request smaller than this will be handled in memory. 794 Note: ``FILE_UPLOAD_DIR`` has to be defined to enable streaming. 795 777 796 TEMPLATE_CONTEXT_PROCESSORS 778 797 --------------------------- 779 798 -
docs/forms.txt
475 475 new_data = request.POST.copy() 476 476 new_data.update(request.FILES) 477 477 478 Streaming file uploads 479 ---------------------- 480 481 File uploads will be read into memory by default. This works fine for 482 small to medium-sized uploads (from 1MB to 100MB depending on your 483 setup and usage). If you want to support larger uploads, you can enable 484 upload streaming, where only a small part of the file will be in memory 485 at any time. To do this, you need to specify the ``FILE_UPLOAD_DIR`` 486 setting (see the settings_ document for more details). 487 488 See `request object`_ for more details about ``request.FILES`` objects 489 with streaming file uploads enabled. 490 478 491 Validators 479 492 ========== 480 493 … … 693 706 .. _`generic views`: ../generic_views/ 694 707 .. _`models API`: ../model-api/ 695 708 .. _settings: ../settings/ 709 .. _request object: ../request_response/#httprequest-objects