| 1 | """ |
| 2 | Serialize multiple table data to and from a single csv stream, using the |
| 3 | standard csv module. |
| 4 | |
The csv format is only loosely standardized, in RFC 4180, which
acknowledges that multiple implementations exist, some of them mutually
incompatible. It treats headers as optional, with column names separated
the same way as field values. It leaves some important questions open:
- how to handle null values as opposed to empty strings,
- how to handle relations, such as foreign keys or many-to-many
  relations,
- how to represent multiple tables in a single csv file.

The latter issue is addressed by Creativyst's ctx format at
http://www.creativyst.com/Doc/Std/ctx/ctx.htm where a multiple-line
header is used to describe metadata. I didn't want to use their
approach because it conflicts with existing csv tools (such as the
python csv module) even for simpler cases.

Let's start with an example of the csv this module produces and
understands (table names take Django's 'app_label.model_name' form):

news.author:registration_number,name
555001,Jack
555002,Jill

news.article:id,authors,title,text,published
1,"[1, 2]",On Life And Cheese,Once upon a time...,2001-05-30
2,[2],,I should write this,

Here is a summary of how values are represented.
- Tables are separated by two line terminators, because that is
  unobtrusive and gives good visual guidance. Csv tools simply parse
  the separator as an empty line, preserving the structure. A single
  csv file is also easy to split at the separator, for example with
  csplit.
- Headers are mandatory, containing the column names separated by
  commas.
- The first header field is special: it has the form '<table
  name>:<pk name>'. This doesn't conflict with other parsers, and the
  colon as a separator is widely used in the Unix world and cannot be
  part of a table or column name. The use of <pk name> instead of just
  'pk' is intentional, although it differs from the constant 'pk' used
  by the json and xml serializer modules -- this is how database dumps
  work, for example in sqlite.
- None is represented as an empty string.
- Foreign keys are represented as integers.
- Many-to-many relations are represented as a list of foreign keys.
- Strings are represented as they are, except for strings that
  consist of zero or more spaces only.
- Space-only strings get an extra leading space prepended, so ''
  becomes ' ', ' ' becomes '  ', etc. This may look strange at first,
  but it is how None (represented as '') and '' (represented as ' ')
  are distinguished. Space-only strings are a rare beast, and leading
  and trailing spaces are frequently trimmed by csv parsers anyway, so
  I find this a fair compromise.
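
A minimal usage sketch (assuming this module is registered as the
'csv' format in settings.SERIALIZATION_MODULES; NewsAuthor is a
hypothetical model used for illustration only):

    from django.core import serializers
    data = serializers.serialize('csv', NewsAuthor.objects.all())
    for deserialized in serializers.deserialize('csv', data):
        deserialized.save()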
| 56 | """ |
| 57 | import codecs |
| 58 | import csv |
| 59 | try: |
| 60 | from cStringIO import StringIO |
| 61 | except ImportError: |
| 62 | from StringIO import StringIO |
| 63 | import os |
| 64 | import re |
| 65 | |
from django.core.serializers import base
from django.db import models
# These fields should all extend CharField since they all work with
# string data
from django.db.models.fields import CharField, FilePathField, SlugField, TextField
from django.db.models.fields.files import FileField
from django.contrib.localflavor.us.models import USStateField


# Matches strings consisting of zero or more spaces; such strings need
# special treatment to distinguish None ('') from the empty string (' ')
spaces_re = re.compile('^[ ]*$')

class Serializer(base.Serializer):
    "Serialize to csv"

    def start_serialization(self):
        self.last_model = None
        # By default, the csv module uses '\r\n' as the lineterminator
        self.output = UnicodeWriter(self.stream, lineterminator=os.linesep)

    def start_object(self, obj):
        if not hasattr(obj, "_meta"):
            raise base.SerializationError("Non-model object (%s) encountered during serialization" % type(obj))
        if self.last_model != obj._meta:
            meta = obj._meta
            self.last_model = meta
            fields = self.selected_fields
            if fields:
                fields = list(fields)
            else:
                fields = \
                    [field.name for field in meta.fields] + \
                    [field.name for field in meta.many_to_many]
            if meta.pk.attname in fields:
                fields.remove(meta.pk.attname)
            header = ['%s:%s' % (meta, meta.pk.attname)]
            for field_name in fields:
                header.append(field_name)
            # Table separator is an empty row
            self.output.writerow([])
            self.output.writerow(header)
        self.row = [str(obj._get_pk_val())]

    def end_object(self, obj):
        self.output.writerow(self.row)

    def handle_field(self, obj, field):
        self.row.append(self.get_string_value(obj, field))

    def handle_fk_field(self, obj, field):
        related = getattr(obj, field.name)
        if related is None:
            value = ''
        else:
            if field.rel.field_name == related._meta.pk.name:
                # relation via pk
                value = str(related._get_pk_val())
            else:
                # relation via another field
                value = str(getattr(related, field.rel.field_name))
        self.row.append(value)

    def handle_m2m_field(self, obj, field):
        """Represented as a list of related ids, or an empty string if
        there are no related objects"""
        related = [related._get_pk_val() for related in getattr(obj, field.name).iterator()]
        if related:
            self.row.append(str(related))
        else:
            self.row.append('')

    def get_string_value(self, obj, field):
        """
        None always becomes ''. For string values, prepend a leading
        space if the string contains only spaces, so '' becomes ' ',
        ' ' becomes '  ', etc. Other values are handled normally.
        """
        value = getattr(obj, field.name)
        if value is None:
            return ''
        elif is_string_field(field):
            if spaces_re.match(value):
                return ' ' + value
            else:
                return value
        else:
            return super(Serializer, self).get_string_value(obj, field)


class Deserializer(base.Deserializer):
    "Deserialize from csv"

    def __init__(self, stream_or_string, **options):
        super(Deserializer, self).__init__(stream_or_string, **options)
        # Python 2 iterator protocol: bind the next() method of a single
        # generator, so repeated calls continue the same iteration
        self.next = self.__iter__().next

    def __iter__(self):
        header_coming = True
        for values in UnicodeReader(self.stream):
            if not values:
                header_coming = True
            else:
                if header_coming:
                    # Model
                    model, first_field = values[0].split(':', 1)
                    try:
                        self.model = models.get_model(*model.split("."))
                    except TypeError:
                        self.model = None
                    if self.model is None:
                        # get_model returns None for unknown models
                        raise base.DeserializationError("No model %s in db" % model)
                    # Field names
                    self.field_names = [first_field] + values[1:]
                    header_coming = False
                else:
                    # An object
                    meta = self.model._meta
                    data = {meta.pk.attname: meta.pk.to_python(values[0])}
                    m2m_data = {}
                    for i in range(1, len(values)):
                        name = self.field_names[i]
                        value = values[i]
                        field = meta.get_field(name)
                        if field.rel and isinstance(field.rel, models.ManyToManyRel):
                            m2m_data[field.name] = self.handle_m2m_field(value, field)
                        elif field.rel and isinstance(field.rel, models.ManyToOneRel):
                            data[field.attname] = self.handle_fk_field(value, field)
                        else:
                            data[field.name] = self.handle_field(value, field)
                    yield base.DeserializedObject(self.model(**data), m2m_data)

    def handle_field(self, raw, field):
        if raw == '':
            raw = None
        elif is_string_field(field):
            if spaces_re.match(raw):
                raw = raw[1:]
        return field.to_python(raw)

    def handle_fk_field(self, raw, field):
        if raw == '':
            return None
        related_field = field.rel.to._meta.get_field(field.rel.field_name)
        return related_field.to_python(raw)

    def handle_m2m_field(self, raw, field):
        if raw:
            # The list of pk values was serialized with str(), e.g.
            # '[1, 2]'; eval() parses it back. Note that this assumes
            # trusted input, since eval executes arbitrary expressions.
            return eval(raw)
        else:
            return []


def is_string_field(field):
    """If all field classes working with strings extended CharField, we
    wouldn't need this function"""
    return isinstance(field,
        (CharField, FileField, FilePathField, SlugField, TextField,
        USStateField))


# Copied from the csv module examples, with some modifications:
# - getincrementalencoder replaced with getencoder, because the latter
#   also works with python < 2.5

class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        return self.reader.next().encode("utf-8")

class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encode = codecs.getencoder(encoding)

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encode(data)[0]
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
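

# A minimal, self-contained round-trip sketch of the UnicodeWriter and
# UnicodeReader helpers above; the sample rows are made up for
# illustration only.
if __name__ == '__main__':
    buffer = StringIO()
    writer = UnicodeWriter(buffer)
    # Write two rows containing non-ASCII text, encoded as UTF-8
    writer.writerows([[u'name', u'city'], [u'J\xe1nos', u'Budapest']])
    buffer.seek(0)
    # Read the rows back as unicode objects
    for row in UnicodeReader(buffer):
        print row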