Ticket #5253: csv_serializer_5253.diff

File csv_serializer_5253.diff, 11.3 KB (added by Michael Newman, 16 years ago)

version of old (incomplete) patch that works with r9232

  • django/core/serializers/__init__.py

     
    2323    "xml"    : "django.core.serializers.xml_serializer",
    2424    "python" : "django.core.serializers.python",
    2525    "json"   : "django.core.serializers.json",
     26    "csv"    : "django.core.serializers.csv_serializer",
    2627}
    2728
    2829# Check for PyYaml and register the serializer if it's available.
  • django/core/serializers/csv_serializer.py

     
     1"""
     2Serialize multiple table data to and from a single csv stream, using the
     3standard csv module.
     4
     5The format of csv is sort of standardized in rfc4180, which notes that
     6there are multiple implementations, even incompatible ones.  It treats
     7headers as optional, with column names separated the same way as field
     8values.  It leaves some important questions open,
     9 - how to handle null values as opposed to empty strings,
     10 - how to handle relations, such as foreign keys or many-to-many
     11   relations,
     12 - how to represent multiple tables in a single csv file.
     13
     14The latter issue is addressed in Creativyst's ctx format at
     15http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
     16header is used to describe metadata.  I didn't want to use their
     17approach because it conflicts with existing csv tools (such as the
     18python csv module) for simpler cases.
     19
     20Let's start with an example of the csv this module produces and
     21understands.
     22
     23news_author:registration_number,name
     24555001,Jack
     25555002,Jill
     26
     27news_article:id,authors,title,text,published
     281,"[1, 2]",On Life And Chees,Once upon a time...,2001-05-30
     292,[2],,I should write this,
     30
     31Here is a summary of how values are represented.
     32 - Tables are separated by two lineterminators because it's not
     33   intrusive and gives a good visual guidance.  It's simply parsed as an
     34   empty line by csv tools, preserving the structure.  A single csv file
     35   is also easy to split by the separator using csplit for example.
     36 - Headers are mandatory, containing the column names separated by
     37   commas.
     38 - The first header field is special, it has the form '<table name>:<pk
     39   name>'.  This doesn't conflict with other parsers; and the colon as
     40   separator is widely used in the Unix world and it cannot be part of
     41   the table or column name.  The usage of <pk name> instead of just
     42   'pk' is intentional, although it differs from the constant usage of
     43   'pk' in the json and xml serializer modules -- this is how database
     44   dumps work, for example in sqlite.
     45 - None is represented as an empty string.
     46 - Foreign keys are represented as integers.
     47 - Many-to-many relations are represented as a list of foreign keys.
     48 - Strings are represented as they are except for strings that contain
     49   only zero or more spaces.
     50 - Strings of only zero or more spaces get an extra leading space
     51   prepended, so '' becomes ' ', ' ' becomes '  ', etc.  This may look
     52   strange at first, but this is how None (represented as '') and ''
     53   (represented as ' ') are distinguished.  Space-only strings are a
     54   rare beast, leading and trailing spaces are also frequently trimmed
     55   by csv parsers, so I find this a fair compromise.
     56"""
     57import codecs
     58import csv
     59try:
     60    from cStringIO import StringIO
     61except ImportError:
     62    from StringIO import StringIO
     63import os
     64import re
     65
     66from django.core.serializers import base
     67from django.db import models
     68# These fields should all extend CharField since they all work with
     69# string data
     70from django.db.models.fields import CharField, FilePathField, SlugField, TextField
     71from django.db.models.fields.files import FileField
     72from django.contrib.localflavor.us.models import USStateField
     73
     74
     75spaces_re = re.compile('^[ ]*$')
     76
     77class Serializer(base.Serializer):
     78    "Serialize to csv"
     79
     80    def start_serialization(self):
     81        self.last_model = None
     82        # By default, csv module uses '\r\n' as lineterminator
     83        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)
     84
     85    def start_object(self, obj):
     86        if not hasattr(obj, "_meta"):
     87            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
     88        if self.last_model != obj._meta:
     89            meta = obj._meta
     90            self.last_model = meta
     91            fields = self.selected_fields
     92            if fields:
     93                fields = list(fields)
     94            else:
     95                fields = \
     96                    [field.name for field in meta.fields] + \
     97                    [field.name for field in meta.many_to_many]
     98            if meta.pk.attname in fields:
     99                fields.remove(meta.pk.attname)
     100            header = ['%s:%s' % (meta, meta.pk.attname)]
     101            for field_name in fields:
     102                header.append(field_name)
     103            # Table separator is an empty row
     104            self.output.writerow([])
     105            self.output.writerow(header)
     106        self.row = [str(obj._get_pk_val())]
     107
     108    def end_object(self, obj):
     109        self.output.writerow(self.row)
     110
     111    def handle_field(self, obj, field):
     112        self.row.append(self.get_string_value(obj, field))
     113
     114    def handle_fk_field(self, obj, field):
     115        related = getattr(obj, field.name)
     116        if related is None:
     117            repr = ''
     118        else:
     119            if field.rel.field_name == related._meta.pk.name:
     120                # relation via pk
     121                repr = str(related._get_pk_val())
     122            else:
     123                # relation via other field
     124                repr = str(getattr(related, field.rel.field_name))
     125        self.row.append(repr)
     126
     127    def handle_m2m_field(self, obj, field):
     128        """Represented as a tuple of related ids, or empty string of there
     129        are no related objects"""
     130        related = [related._get_pk_val() for related in getattr(obj, field.name).iterator()]
     131        if related:
     132            self.row.append(str(related))
     133        else:
     134            self.row.append('')
     135
     136    def get_string_value(self, obj, field):
     137        """
     138        None always becomes ''.  For string values prepend a leading
     139        space if the string contains only spaces so '' becomes ' ' and '
     140        ' becomes '  ', etc.  Other values are handled normally.
     141        """
     142        value = getattr(obj, field.name)
     143        if value is None:
     144            return ''
     145        elif is_string_field(field):
     146            if spaces_re.match(value):
     147                return ' ' + value
     148            else:
     149                return value
     150        else:
     151            return super(Serializer, self).get_string_value(obj, field)
     152
     153
     154class Deserializer(base.Deserializer):
     155    "Deserialize from csv"
     156
     157    def __init__(self, stream_or_string, **options):
     158        super(Deserializer, self).__init__(stream_or_string, **options)
     159        self.next = self.__iter__().next
     160
     161    def __iter__(self):
     162        header_coming = True
     163        for values in UnicodeReader(self.stream):
     164            if not values:
     165                header_coming = True
     166            else:
     167                if header_coming:
     168                    # Model
     169                    model, first_field = values[0].split(':', 2)
     170                    try:
     171                        self.model = models.get_model(*model.split("."))
     172                    except TypeError:
     173                        raise base.DeserializationError("No model %s in db" % model)
     174                    # Field names
     175                    self.field_names = [first_field] + values[1:]
     176                    header_coming = False
     177                else:
     178                    # An object
     179                    meta = self.model._meta
     180                    data = {meta.pk.attname: meta.pk.to_python(values[0])}
     181                    m2m_data = {}
     182                    for i in range(1, len(values)):
     183                        name = self.field_names[i]
     184                        value = values[i]
     185                        field = meta.get_field(name)
     186                        if field.rel and isinstance(field.rel, models.ManyToManyRel):
     187                            m2m_data[field.name] = self.handle_m2m_field(value, field)
     188                        elif field.rel and isinstance(field.rel, models.ManyToOneRel):
     189                            data[field.attname] = self.handle_fk_field(value, field)
     190                        else:
     191                            data[field.name] = self.handle_field(value, field)
     192                    yield base.DeserializedObject(self.model(**data), m2m_data)
     193
     194    def handle_field(self, raw, field):
     195        if raw == '':
     196            raw = None
     197        elif is_string_field(field):
     198            if spaces_re.match(raw):
     199                raw = raw[1:]
     200        return field.to_python(raw)
     201
     202    def handle_fk_field(self, raw, field):
     203        if raw == '':
     204            return None
     205        related_field = field.rel.to._meta.get_field(field.rel.field_name)
     206        return related_field.to_python(raw)
     207
     208    def handle_m2m_field(self, raw, field):
     209        if raw:
     210            return eval(raw)
     211        else:
     212            return []
     213
     214
     215def is_string_field(field):
     216    """If all field classes working with strings extended CharField, we
     217    wouldn't need this method"""
     218    return bool(isinstance(field,
     219        (CharField, FileField, FilePathField, SlugField, TextField,
     220        USStateField)))
     221
     222
     223# Copied from csv module examples with some modifications
     224# - getincrementalencoder replaced with getencoder because it works with
     225# python < 2.5
     226
     227class UTF8Recoder:
     228    """
     229    Iterator that reads an encoded stream and reencodes the input to UTF-8
     230    """
     231    def __init__(self, f, encoding):
     232        self.reader = codecs.getreader(encoding)(f)
     233
     234    def __iter__(self):
     235        return self
     236
     237    def next(self):
     238        return self.reader.next().encode("utf-8")
     239
     240class UnicodeReader:
     241    """
     242    A CSV reader which will iterate over lines in the CSV file "f",
     243    which is encoded in the given encoding.
     244    """
     245
     246    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     247        f = UTF8Recoder(f, encoding)
     248        self.reader = csv.reader(f, dialect=dialect, **kwds)
     249
     250    def next(self):
     251        row = self.reader.next()
     252        return [unicode(s, "utf-8") for s in row]
     253
     254    def __iter__(self):
     255        return self
     256
     257class UnicodeWriter:
     258    """
     259    A CSV writer which will write rows to CSV file "f",
     260    which is encoded in the given encoding.
     261    """
     262
     263    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
     264        # Redirect output to a queue
     265        self.queue = StringIO()
     266        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
     267        self.stream = f
     268        self.encode = codecs.getencoder(encoding)
     269
     270    def writerow(self, row):
     271        self.writer.writerow([s.encode("utf-8") for s in row])
     272        # Fetch UTF-8 output from the queue ...
     273        data = self.queue.getvalue()
     274        data = data.decode("utf-8")
     275        # ... and reencode it into the target encoding
     276        data = self.encode(data)[0]
     277        # write to the target stream
     278        self.stream.write(data)
     279        # empty queue
     280        self.queue.truncate(0)
     281
     282    def writerows(self, rows):
     283        for row in rows:
     284            self.writerow(row)
Back to Top