Ticket #5253: csv_serializer.patch

File csv_serializer.patch, 12.1 KB (added by Etienne Robillard, 16 years ago)

Patch based on the previous ones and adding support for 0.96.3

  • django/core/serializers/__init__.py

    diff -r c3ff2697c472 django/core/serializers/__init__.py
    a b  
    2323    "xml"    : "django.core.serializers.xml_serializer",
    2424    "python" : "django.core.serializers.python",
    2525    "json"   : "django.core.serializers.json",
     26    "csv"    : "django.core.serializers.csv_serializer"
    2627}
    2728
    2829# Check for PyYaml and register the serializer if it's available.
     
    8788        register_serializer(format, BUILTIN_SERIALIZERS[format])
    8889    if hasattr(settings, "SERIALIZATION_MODULES"):
    8990        for format in settings.SERIALIZATION_MODULES:
    90             register_serializer(format, settings.SERIALIZATION_MODULES[format])
    91  No newline at end of file
     91            register_serializer(format, settings.SERIALIZATION_MODULES[format])
  • new file django/core/serializers/csv_serializer.py

    diff -r c3ff2697c472 django/core/serializers/csv_serializer.py
    - +  
     1"""
     2Serialize multiple table data to and from a single csv stream, using the
     3standard csv module.
     4
      5The format of csv is only loosely standardized in rfc4180, which notes
      6that there are many implementations, even incompatible ones.  It treats headers as
     7optional where column names are separated the same way as field values.
     8It leaves some important questions open,
     9 - how to handle null values as opposed to empty strings,
     10 - how to handle relations, such as foreign keys or many-to-many
     11   relations,
     12 - how to represent multiple tables in a single csv file.
     13
     14The latter issue is addressed in Creativyst's ctx format at
     15http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
     16header is used to describe metadata.  I didn't want to use their
     17approach because it conflicts with existing csv tools (such as the
     18python csv module) for simpler cases.
     19
     20Let's start with an example what csv this module produces and
     21understands.
     22
     23news_author:registration_number,name
     24555001,Jack
     25555002,Jill
     26
     27news_article:id,authors,title,text,published
     281,"[1, 2]",On Life And Chees,Once upon a time...,2001-05-30
     292,[2],,I should write this,
     30
     31Here is a summary of how values are represented.
     32 - Tables are separated by two lineterminators because it's not
     33   intrusive and gives a good visual guidance.  It's simply parsed as an
     34   empty line by csv tools, preserving the structure.  A single csv file
     35   is also easy to split by the separator using csplit for example.
     36 - Headers are mandatory, containing the column names separated by
     37   commas.
     38 - The first header field is special, it has the form '<table name>:<pk
     39   name>'.  This doesn't conflict with other parsers; and the colon as
     40   separator is widely used in the Unix world and it cannot be part of
     41   the table or column name.  The usage of <pk name> instead of just
     42   'pk' is intentional, although it differs from the constant usage of
     43   'pk' in the json and xml serializer modules -- this is how database
     44   dumps work, for example in sqlite.
     45 - None is represented as an empty string.
     46 - Foreign keys are represented as integers.
     47 - Many-to-many relations are represented as a list of foreign keys.
     48 - Strings are represented as they are, except for strings that
     49   consist of only zero or more spaces.
     50 - Strings of only zero or more spaces are prepended an extra leading
     51   space, so '' becomes ' ', ' ' becomes '  ', etc.  This may look
     52   strange first but this is how None (represented as '') and ''
     53   (represented as ' ') are distinguished.  Space-only strings are a
     54   rare beast, leading and trailing spaces are also frequently trimmed
     55   by csv parsers, so I find this a fair compromise.
     56"""
     57import codecs
     58import csv
     59try:
     60    from cStringIO import StringIO
     61except ImportError:
     62    from StringIO import StringIO
     63import os
     64import re
     65
     66from django.core.serializers import base
     67from django.db import models
     68# These fields should all extend CharField since they all work with
     69# string data
     70from django.db.models.fields import CharField, FilePathField, SlugField, TextField
     71
     72# FileField and USStateField are only available in Django 1.0.X
     73#from django.db.models.fields.files import FileField
     74#from django.contrib.localflavor.us.models import USStateField
     75
     76spaces_re = re.compile('^[ ]*$')
     77
     78class Serializer(base.Serializer):
     79    "Serialize to csv"
     80
     81    def start_serialization(self):
     82        self.last_model = None
     83        # By default, csv module uses '\r\n' as lineterminator
     84        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)
     85
     86    def start_object(self, obj):
     87        if not hasattr(obj, "_meta"):
     88            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
     89        if self.last_model != obj._meta:
     90            meta = obj._meta
     91            self.last_model = meta
     92            fields = self.selected_fields
     93            if fields:
     94                fields = list(fields)
     95            else:
     96                fields = \
     97                    [field.name for field in meta.fields] + \
     98                    [field.name for field in meta.many_to_many]
     99            if meta.pk.attname in fields:
     100                fields.remove(meta.pk.attname)
     101            header = ['%s:%s' % (meta, meta.pk.attname)]
     102            for field_name in fields:
     103                header.append(field_name)
     104            # Table separator is an empty row
     105            self.output.writerow([])
     106            self.output.writerow(header)
     107        self.row = [str(obj._get_pk_val())]
     108
     109    def end_object(self, obj):
     110        self.output.writerow(self.row)
     111
     112    def handle_field(self, obj, field):
     113        self.row.append(self.get_string_value(obj, field))
     114
     115    def handle_fk_field(self, obj, field):
     116        related = getattr(obj, field.name)
     117        if related is None:
     118            repr = ''
     119        else:
     120            if field.rel.field_name == related._meta.pk.name:
     121                # relation via pk
     122                repr = str(related._get_pk_val())
     123            else:
     124                # relation via other field
     125                repr = str(getattr(related, field.rel.field_name))
     126        self.row.append(repr)
     127
     128    def handle_m2m_field(self, obj, field):
     129        """Represented as a tuple of related ids, or empty string of there
     130        are no related objects"""
     131        related = [related._get_pk_val() for related in getattr(obj, field.name).iterator()]
     132        if related:
     133            self.row.append(str(related))
     134        else:
     135            self.row.append('')
     136
     137    def get_string_value(self, obj, field):
     138        """
     139        None always becomes ''.  For string values prepend a leading
     140        space if the string contains only spaces so '' becomes ' ' and '
     141        ' becomes '  ', etc.  Other values are handled normally.
     142        """
     143        value = getattr(obj, field.name)
     144        if value is None:
     145            return ''
     146        elif is_string_field(field):
     147            if spaces_re.match(value):
     148                return ' ' + value
     149            else:
     150                return value
     151        else:
     152            return super(Serializer, self).get_string_value(obj, field)
     153
     154
     155class Deserializer(base.Deserializer):
     156    "Deserialize from csv"
     157
     158    def __init__(self, stream_or_string, **options):
     159        super(Deserializer, self).__init__(stream_or_string, **options)
     160        self.next = self.__iter__().next
     161
     162    def __iter__(self):
     163        header_coming = True
     164        for values in UnicodeReader(self.stream):
     165            if not values:
     166                header_coming = True
     167            else:
     168                if header_coming:
     169                    # Model
     170                    model, first_field = values[0].split(':', 2)
     171                    try:
     172                        self.model = models.get_model(*model.split("."))
     173                    except TypeError:
     174                        raise base.DeserializationError("No model %s in db" % model)
     175                    # Field names
     176                    self.field_names = [first_field] + values[1:]
     177                    header_coming = False
     178                else:
     179                    # An object
     180                    meta = self.model._meta
     181                    data = {meta.pk.attname: meta.pk.to_python(values[0])}
     182                    m2m_data = {}
     183                    for i in range(1, len(values)):
     184                        name = self.field_names[i]
     185                        value = values[i]
     186                        field = meta.get_field(name)
     187                        if field.rel and isinstance(field.rel, models.ManyToManyRel):
     188                            m2m_data[field.name] = self.handle_m2m_field(value, field)
     189                        elif field.rel and isinstance(field.rel, models.ManyToOneRel):
     190                            data[field.attname] = self.handle_fk_field(value, field)
     191                        else:
     192                            data[field.name] = self.handle_field(value, field)
     193                    yield base.DeserializedObject(self.model(**data), m2m_data)
     194
     195    def handle_field(self, raw, field):
     196        if raw == '':
     197            raw = None
     198        elif is_string_field(field):
     199            if spaces_re.match(raw):
     200                raw = raw[1:]
     201        return field.to_python(raw)
     202
     203    def handle_fk_field(self, raw, field):
     204        if raw == '':
     205            return None
     206        related_field = field.rel.to._meta.get_field(field.rel.field_name)
     207        return related_field.to_python(raw)
     208
     209    def handle_m2m_field(self, raw, field):
     210        if raw:
     211            return eval(raw)
     212        else:
     213            return []
     214
     215
     216def is_string_field(field):
     217    """If all field classes working with strings extended CharField, we
     218    wouldn't need this method"""
     219 
     220    string_types = ('CharField', 'FileField', 'FilePathField', 'SlugField',
     221        'TextField', 'USStateField')
     222   
     223    for s in string_types:
     224        if field.__class__ == s:
     225            #print "%s is of type %s" % (field, s)
     226            return True
     227    return False   
     228
     229    #return bool(isinstance(field,
     230    #    (CharField, FileField, FilePathField, SlugField, TextField,
     231    #    USStateField)))
     232    return NotImplementedError
     233
     234
     235
     236# Copied from csv module examples with some modifications
     237# - getincrementalencoder replaced with getencoder because it works with
     238# python < 2.5
     239
     240class UTF8Recoder:
     241    """
     242    Iterator that reads an encoded stream and reencodes the input to UTF-8
     243    """
     244    def __init__(self, f, encoding):
     245        self.reader = codecs.getreader(encoding)(f)
     246
     247    def __iter__(self):
     248        return self
     249
     250    def next(self):
     251        return self.reader.next().encode("utf-8")
     252
     253class UnicodeReader:
     254    """
     255    A CSV reader which will iterate over lines in the CSV file "f",
     256    which is encoded in the given encoding.
     257    """
     258
     259    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     260        f = UTF8Recoder(f, encoding)
     261        self.reader = csv.reader(f, dialect=dialect, **kwds)
     262
     263    def next(self):
     264        row = self.reader.next()
     265        return [unicode(s, "utf-8") for s in row]
     266
     267    def __iter__(self):
     268        return self
     269
     270class UnicodeWriter:
     271    """
     272    A CSV writer which will write rows to CSV file "f",
     273    which is encoded in the given encoding.
     274    """
     275
     276    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     277        # Redirect output to a queue
     278        self.queue = StringIO()
     279        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
     280        self.stream = f
     281        self.encode = codecs.getencoder(encoding)
     282
     283    def writerow(self, row):
     284        #self.writer.writerow([s.encode("utf-8") for s in row])
     285        self.writer.writerow([s for s in row])
     286        # Fetch UTF-8 output from the queue ...
     287        data = self.queue.getvalue()
     288        data = data.decode("utf-8")
     289        # ... and reencode it into the target encoding
     290        data = self.encode(data)[0]
     291        # write to the target stream
     292        self.stream.write(data)
     293        # empty queue
     294        self.queue.truncate(0)
     295
     296    def writerows(self, rows):
     297        for row in rows:
     298            self.writerow(row)
Back to Top