Context Navigation

Back to Ticket #6422

Ticket #6422: distinct_on.11.diff

File distinct_on.11.diff, 23.5 KB (added by Anssi Kääriäinen, 13 years ago)
Possible approach for using correct aliases

AUTHORS

diff --git a/AUTHORS b/AUTHORS
index cec5db1..06ba219 100644

                answer newbie questions, and generally made Django that much better:
     Marc Garcia <marc.garcia@accopensys.com>
     Andy Gayton <andy-django@thecablelounge.com>
     geber@datacollect.com
+    Jeffrey Gelens <jeffrey@gelens.org>
     Baishampayan Ghose
     Joshua Ginsberg <jag@flowtheory.net>
     Dimitris Glezos <dimitris@glezos.com>

django/db/backends/__init__.py

diff --git a/django/db/backends/__init__.py b/django/db/backends/__init__.py
index f2bde84..126b5fc 100644

                class BaseDatabaseFeatures(object):
     supports_stddev = None
     can_introspect_foreign_keys = None
+    # Support for the DISTINCT ON clause
+    can_distinct_on_fields = False
     def __init__(self, connection):
         self.connection = connection
-…
+               class BaseDatabaseOperations(object):
         """
         raise NotImplementedError('Full-text search is not implemented for this database backend')
+    def distinct(self, fields):
+        """
+        Builds the SQL DISTINCT clause used to drop duplicate rows from the
+        result set. A non-empty ``fields`` asks for duplicates to be detected
+        on those fields only; this base implementation rejects that request
+        with NotImplementedError.
+        """
+        if not fields:
+            return 'DISTINCT'
+        raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
     def last_executed_query(self, cursor, sql, params):
         """
         Returns a string of the query last executed by the given cursor, with

django/db/backends/postgresql_psycopg2/base.py

diff --git a/django/db/backends/postgresql_psycopg2/base.py b/django/db/backends/postgresql_psycopg2/base.py
index c816237..cff3fa5 100644

                class DatabaseFeatures(BaseDatabaseFeatures):
     has_select_for_update_nowait = True
     has_bulk_insert = True
     supports_tablespaces = True
+    can_distinct_on_fields = True
 class DatabaseWrapper(BaseDatabaseWrapper):
     vendor = 'postgresql'

django/db/backends/postgresql_psycopg2/operations.py

diff --git a/django/db/backends/postgresql_psycopg2/operations.py b/django/db/backends/postgresql_psycopg2/operations.py
index acfeeaf..fe01566 100644

                class DatabaseOperations(BaseDatabaseOperations):
         return 63
+    def distinct(self, fields):
+        """
+        Returns 'DISTINCT ON (...)' listing the given (already quoted) fields,
+        or a plain 'DISTINCT' when no fields are given.
+        """
+        if not fields:
+            return 'DISTINCT'
+        column_list = ', '.join(fields)
+        return 'DISTINCT ON (%s)' % column_list
     def last_executed_query(self, cursor, sql, params):
         # http://initd.org/psycopg/docs/cursor.html#cursor.query
         # The query attribute is a Psycopg extension to the DB API 2.0.

django/db/models/query.py

diff --git a/django/db/models/query.py b/django/db/models/query.py
index c752049..43b3954 100644

                class QuerySet(object):
         obj.query.add_ordering(*field_names)
         return obj
-    def distinct(self, true_or_false=True):
+    def distinct(self, *field_names):
         """
         Returns a new QuerySet instance that will select only distinct results.
         """
+        assert self.query.can_filter(), \
+                "Cannot create distinct fields once a slice has been taken."
         obj = self._clone()
-        obj.query.distinct = true_or_false
+        obj.query.add_distinct_fields(*field_names)
         return obj
     def extra(self, select=None, where=None, params=None, tables=None,
-…
+               class EmptyQuerySet(QuerySet):
         """
         return self
-    def distinct(self, true_or_false=True):
+    def distinct(self, fields=None):
         """
         Always returns EmptyQuerySet.
         """

django/db/models/sql/compiler.py

diff --git a/django/db/models/sql/compiler.py b/django/db/models/sql/compiler.py
index cebd77f..62a38ec 100644

                class SQLCompiler(object):
         Does any necessary class setup immediately prior to producing SQL. This
         is for things that can't necessarily be done in __init__ because we
         might not have all the pieces in place at that time.
+        # TODO: after the query has been executed, the altered state should be
+        # cleaned. We are not using a clone() of the query here.
         """
         if not self.query.tables:
             self.query.join((None, self.query.model._meta.db_table, None, None))
-…
+               class SQLCompiler(object):
             return '', ()
         self.pre_sql_setup()
+        # After executing the query, we must get rid of any joins the query
+        # setup created. So, take note of alias counts before the query ran.
+        # However we do not want to get rid of stuff done in pre_sql_setup(),
+        # as the pre_sql_setup will modify query state in a way that forbids
+        # another run of it.
+        self.refcounts_before = self.query.alias_refcount.copy()
         out_cols = self.get_columns(with_col_aliases)
         ordering, ordering_group_by = self.get_ordering()
+        # This must come after 'select' and 'ordering' -- see docstring of
+        # get_from_clause() for details.
+        distinct_fields = self.get_distinct()
+        # This must come after 'select', 'ordering' and 'distinct' -- see
+        # docstring of get_from_clause() for details.
         from_, f_params = self.get_from_clause()
         qn = self.quote_name_unless_alias
-…
+               class SQLCompiler(object):
             params.extend(val[1])
         result = ['SELECT']
         if self.query.distinct:
-            result.append('DISTINCT')
+            result.append(self.connection.ops.distinct(distinct_fields))
         result.append(', '.join(out_cols + self.query.ordering_aliases))
         result.append('FROM')
-…
+               class SQLCompiler(object):
                 raise DatabaseError('NOWAIT is not supported on this database backend.')
             result.append(self.connection.ops.for_update_sql(nowait=nowait))
+        # Finally do cleanup - get rid of the joins we created above.
+        self.query.reset_refcounts(self.refcounts_before)
         return ' '.join(result), tuple(params)
     def as_nested_sql(self):
-…
+               class SQLCompiler(object):
                     col_aliases.add(field.column)
         return result, aliases
+    def get_distinct(self):
+        """
+        Returns a list of quoted "alias.column" strings, one for each name in
+        self.query.distinct_fields, to use in DISTINCT ON part of the query.
+
+        Note that this method can alter the tables in the query (it sets up
+        joins and changes alias reference counts), and thus this must be
+        called before get_from_clause().
+        """
+        qn = self.quote_name_unless_alias
+        qn2 = self.connection.ops.quote_name
+        result = []
+        options = self.query.model._meta
+        for name in self.query.distinct_fields:
+            # We do pretty much the same join creation & promotion & trimming as in
+            # get_ordering. The name is split on LOOKUP_SEP so that lookups
+            # spanning relations are resolved through joins.
+            field, target, opts, joins, last, extra = self.query.setup_joins(
+                name.split(LOOKUP_SEP), options, self.query.get_initial_alias(), False)
+            # Start from the last alias in the join chain and the target column.
+            alias = joins[-1]
+            col = target.column
+            if not field.rel:
+                # To avoid inadvertent trimming of a necessary alias, use the
+                # refcount to show that we are referencing a non-relation field on
+                # the model.
+                self.query.ref_alias(alias)
+            # Must use left outer joins for nullable fields and their relations.
+            # Promotion is forced when the first alias of the chain is already
+            # a LOUTER join.
+            self.query.promote_alias_chain(joins,
+                self.query.alias_map[joins[0]][JOIN_TYPE] == self.query.LOUTER)
+            if alias:
+                # We have to do the same "final join" optimisation as in
+                # add_filter: while the column is the right-hand column of the
+                # join for this alias, drop our reference to the alias and use
+                # the left-hand alias and column of that join instead.
+                while 1:
+                    join = self.query.alias_map[alias]
+                    if col != join[RHS_JOIN_COL]:
+                        break
+                    self.query.unref_alias(alias)
+                    alias = join[LHS_ALIAS]
+                    col = join[LHS_JOIN_COL]
+            result.append("%s.%s" % (qn(alias), qn2(col)))
+        return result
     def get_ordering(self):
         """
         Returns a tuple containing a list representing the SQL elements in the
-…
+               class SQLCompiler(object):
         from-clause via a "select".
         This should only be called after any SQL construction methods that
-        might change the tables we need. This means the select columns and
-        ordering must be done first.
+        might change the tables we need. This means the select columns,
+        ordering and distinct must be done first.
         """
         result = []
         qn = self.quote_name_unless_alias
-…
+               class SQLAggregateCompiler(SQLCompiler):
         """
         if qn is None:
             qn = self.quote_name_unless_alias
         sql = ('SELECT %s FROM (%s) subquery' % (
             ', '.join([
                 aggregate.as_sql(qn, self.connection)

django/db/models/sql/query.py

diff --git a/django/db/models/sql/query.py b/django/db/models/sql/query.py
index 4afe288..362f6fd 100644

                class Query(object):
         self.order_by = []
         self.low_mark, self.high_mark = 0, None  # Used for offset/limit
         self.distinct = False
+        self.distinct_fields = []
         self.select_for_update = False
         self.select_for_update_nowait = False
         self.select_related = False
-…
+               class Query(object):
         obj.order_by = self.order_by[:]
         obj.low_mark, obj.high_mark = self.low_mark, self.high_mark
         obj.distinct = self.distinct
+        obj.distinct_fields = self.distinct_fields[:]
         obj.select_for_update = self.select_for_update
         obj.select_for_update_nowait = self.select_for_update_nowait
         obj.select_related = self.select_related
-…
+               class Query(object):
         else:
             obj.used_aliases = set()
         obj.filter_is_sticky = False
         obj.__dict__.update(kwargs)
         if hasattr(obj, '_setup_query'):
             obj._setup_query()
-…
+               class Query(object):
         Performs a COUNT() query using the current filter constraints.
         """
         obj = self.clone()
-        if len(self.select) > 1 or self.aggregate_select:
+        if len(self.select) > 1 or self.aggregate_select or (self.distinct and self.distinct_fields):
             # If a select clause exists, then the query has already started to
             # specify the columns that are to be returned.
             # In this case, we need to use a subquery to evaluate the count.
-…
+               class Query(object):
                 "Cannot combine queries once a slice has been taken."
         assert self.distinct == rhs.distinct, \
             "Cannot combine a unique query with a non-unique query."
+        assert self.distinct_fields == rhs.distinct_fields, \
+            "Cannot combine queries with different distinct fields."
         self.remove_inherited_models()
         # Work out how to relabel the rhs aliases, if necessary.
-…
+               class Query(object):
         """ Increases the reference count for this alias. """
         self.alias_refcount[alias] += 1
-    def unref_alias(self, alias):
+    def unref_alias(self, alias, amount=1):
         """ Decreases the reference count for this alias. """
-        self.alias_refcount[alias] -= 1
+        self.alias_refcount[alias] -= amount
     def promote_alias(self, alias, unconditional=False):
         """
-…
+               class Query(object):
             if self.promote_alias(alias, must_promote):
                 must_promote = True
+    def reset_refcounts(self, to_counts):
+        """
+        Brings the reference count of every alias back to the value recorded
+        in to_counts. Aliases that are missing from to_counts end up with a
+        count of zero.
+        """
+        # Work from a snapshot, since unref_alias() updates alias_refcount.
+        snapshot = self.alias_refcount.copy()
+        for alias in snapshot:
+            excess = snapshot[alias] - to_counts.get(alias, 0)
+            self.unref_alias(alias, excess)
     def promote_unused_aliases(self, initial_refcounts, used_aliases):
         """
         Given a "before" copy of the alias_refcounts dictionary (as
-…
+               class Query(object):
     def count_active_tables(self):
         """
         Returns the number of tables in this query with a non-zero reference
-        count.
+        count. Note that after execution, the reference counts are zeroed, so
+        tables added in compiler will not be seen by this method.
         """
         return len([1 for count in self.alias_refcount.itervalues() if count])
-…
+               class Query(object):
         self.select = []
         self.select_fields = []
+    def add_distinct_fields(self, *field_names):
+        """
+        Sets the query's "distinct on" fields to the given field names,
+        replacing any that were set before, and marks the query as distinct.
+        Calling this without field names results in a plain DISTINCT.
+
+        The names are stored as given; nothing is resolved here.
+        """
+        # Store a list rather than the varargs tuple: distinct_fields starts
+        # out as a list, and a list never compares equal to a tuple, so
+        # mixing the two would make equality checks between queries fail.
+        self.distinct_fields = list(field_names)
+        self.distinct = True
     def add_fields(self, field_names, allow_m2m=True):
         """
         Adds the given (model) fields to the select set. The field names are

docs/ref/models/querysets.txt

diff --git a/docs/ref/models/querysets.txt b/docs/ref/models/querysets.txt
index 6f2cad3..b7bc647 100644

                remain undefined afterward).
 distinct
 ~~~~~~~~
-.. method:: distinct()
+.. method:: distinct([*fields])
 Returns a new ``QuerySet`` that uses ``SELECT DISTINCT`` in its SQL query. This
 eliminates duplicate rows from the query results.
-…
+               query spans multiple tables, it's possible to get duplicate results when a
     :meth:`values()` together, be careful when ordering by fields not in the
     :meth:`values()` call.
+.. versionadded:: 1.4
+The ability to pass positional arguments (``*fields``) is new in Django 1.4.
+These are the names of the fields to which the ``DISTINCT`` should be limited. This
+translates to a ``SELECT DISTINCT ON`` SQL query. A ``DISTINCT ON`` query eliminates
+duplicate rows by comparing only the given fields, rather than all fields in a
+row.
+.. note::
+    The ability to specify field names is only available in PostgreSQL.
+.. note::
+    When using the ``DISTINCT ON`` functionality, the columns given to
+    :meth:`distinct` must match the first :meth:`order_by` columns. For example,
+    ``SELECT DISTINCT ON (a)`` gives you the first row for each value in column
+    ``a``. If you don't specify an ordering, you'll get an arbitrary row for each
+    value.
+Examples::
+    >>> Author.objects.distinct()
+    [...]
+    >>> Entry.objects.order_by('pub_date').distinct('pub_date')
+    [...]
+    >>> Entry.objects.order_by('blog').distinct('blog')
+    [...]
+    >>> Entry.objects.order_by('author', 'pub_date').distinct('author', 'pub_date')
+    [...]
+    >>> Entry.objects.order_by('blog__name', 'mod_date').distinct('blog__name', 'mod_date')
+    [...]
+    >>> Entry.objects.order_by('author', 'pub_date').distinct('author')
+    [...]
 values
 ~~~~~~

tests/regressiontests/queries/models.py

diff --git a/tests/regressiontests/queries/models.py b/tests/regressiontests/queries/models.py
index e69ce48..6ad9986 100644

                class Celebrity(models.Model):
     name = models.CharField("Name", max_length=20)
     greatest_fan = models.ForeignKey("Fan", null=True, unique=True)
+    def __unicode__(self):
+        return self.name
 class TvChef(Celebrity):
     pass
-…
+               class OneToOneCategory(models.Model):
     def __unicode__(self):
         return "one2one " + self.new_name
+class Staff(models.Model):
+    # Test model for the distinct-on-fields tests.
+    # The primary key is declared explicitly as a plain integer field.
+    id = models.IntegerField(primary_key=True)
+    name = models.CharField(max_length=50)
+    organisation = models.CharField(max_length=100)
+    # Many-to-many relation to Tag that goes through the StaffTag model.
+    tags = models.ManyToManyField(Tag, through='StaffTag')
+    # Self-referential many-to-many relation between staff members.
+    coworkers = models.ManyToManyField('self')
+    def __unicode__(self):
+        return self.name
+class StaffTag(models.Model):
+    # Model linking one Staff row to one Tag row; named as the ``through``
+    # model of Staff.tags.
+    staff = models.ForeignKey(Staff)
+    tag = models.ForeignKey(Tag)
+    def __unicode__(self):
+        # Rendered as "<tag> -> <staff>".
+        return u"%s -> %s" % (self.tag, self.staff)

tests/regressiontests/queries/tests.py

diff --git a/tests/regressiontests/queries/tests.py b/tests/regressiontests/queries/tests.py
index 6a54125..fca99be 100644

                from .models import (Annotation, Article, Author, Celebrity, Child, Cover,
     ManagedModel, Member, NamedCategory, Note, Number, Plaything, PointerA,
     Ranking, Related, Report, ReservedName, Tag, TvChef, Valid, X, Food, Eaten,
     Node, ObjectA, ObjectB, ObjectC, CategoryItem, SimpleCategory,
-    SpecialCategory, OneToOneCategory)
+    SpecialCategory, OneToOneCategory, Staff, StaffTag)
 class BaseQuerysetTest(TestCase):
-…
+               class Queries1Tests(BaseQuerysetTest):
             ['<Item: four>', '<Item: one>']
+        )
-    # FIXME: This is difficult to fix and very much an edge case, so punt for
-    # now.  This is related to the order_by() tests for ticket #2253, but the
-    # old bug exhibited itself here (q2 was pulling too many tables into the
-    # combined query with the new ordering, but only because we have evaluated
-    # q2 already).
-    @unittest.expectedFailure
     def test_order_by_tables(self):
         q1 = Item.objects.order_by('name')
         q2 = Item.objects.filter(id=self.i1.id)
         list(q2)
         self.assertEqual(len((q1 & q2).order_by('name').query.tables), 1)
+    def test_order_by_join_unref(self):
+        """
+        Companion to test_order_by_tables: once a queryset is re-ordered, the
+        JOINs that only the previous ordering needed must be gone from the
+        query.
+        """
+        joined = Celebrity.objects.order_by('greatest_fan__fan_of')
+        self.assertIn('OUTER JOIN', str(joined.query))
+        reordered = joined.order_by('id')
+        self.assertNotIn('OUTER JOIN', str(reordered.query))
     def test_tickets_4088_4306(self):
         self.assertQuerysetEqual(
             Report.objects.filter(creator=1001),
-…
+               class ConditionalTests(BaseQuerysetTest):
         t4 = Tag.objects.create(name='t4', parent=t3)
         t5 = Tag.objects.create(name='t5', parent=t3)
+        p1_o1 = Staff.objects.create(id=1, name="p1", organisation="o1")
+        p2_o1 = Staff.objects.create(id=2, name="p2", organisation="o1")
+        p3_o1 = Staff.objects.create(id=3, name="p3", organisation="o1")
+        p1_o2 = Staff.objects.create(id=4, name="p1", organisation="o2")
+        p1_o1.coworkers.add(p2_o1, p3_o1)
+        StaffTag.objects.create(staff=p1_o1, tag=t1)
+        StaffTag.objects.create(staff=p1_o1, tag=t1)
+        celeb1 = Celebrity.objects.create(name="c1")
+        celeb2 = Celebrity.objects.create(name="c2")
+        self.fan1 = Fan.objects.create(fan_of=celeb1)
+        self.fan2 = Fan.objects.create(fan_of=celeb1)
+        self.fan3 = Fan.objects.create(fan_of=celeb2)
     # In Python 2.6 beta releases, exceptions raised in __len__ are swallowed
     # (Python issue 1242657), so these cases return an empty list, rather than
     # raising an exception. Not a lot we can do about that, unfortunately, due to
-…
+               class ConditionalTests(BaseQuerysetTest):
+        )
+    @skipUnlessDBFeature('can_distinct_on_fields')
+    def test_ticket6422(self):
+        """
+        QuerySet.distinct('field', ...) works.
+
+        Only runs on backends whose features set can_distinct_on_fields.
+        """
+        # (qset, expected) tuples
+        qsets = (
+            # Plain DISTINCT with no fields: all four Staff rows are returned.
+            (
+                Staff.objects.distinct().order_by('name'),
+                ['<Staff: p1>', '<Staff: p1>', '<Staff: p2>', '<Staff: p3>'],
+            ),
+            # Distinct on name: the two rows named p1 collapse into one.
+            (
+                Staff.objects.distinct('name').order_by('name'),
+                ['<Staff: p1>', '<Staff: p2>', '<Staff: p3>'],
+            ),
+            # Distinct on organisation: one row per organisation, the first by
+            # name within each.
+            (
+                Staff.objects.distinct('organisation').order_by('organisation', 'name'),
+                ['<Staff: p1>', '<Staff: p1>'],
+            ),
+            # Distinct on several fields at once.
+            (
+                Staff.objects.distinct('name', 'organisation').order_by('name', 'organisation'),
+                ['<Staff: p1>', '<Staff: p1>', '<Staff: p2>', '<Staff: p3>'],
+            ),
+            # Distinct on fields together with a filter across a relation.
+            (
+                Celebrity.objects.filter(fan__in=[self.fan1, self.fan2, self.fan3]).\
+                    distinct('name').order_by('name'),
+                ['<Celebrity: c1>', '<Celebrity: c2>'],
+            ),
+            # Does combining querysets work?
+            (
+                (Celebrity.objects.filter(fan__in=[self.fan1, self.fan2]).\
+                    distinct('name').order_by('name')
+                |Celebrity.objects.filter(fan__in=[self.fan3]).\
+                    distinct('name').order_by('name')),
+                ['<Celebrity: c1>', '<Celebrity: c2>'],
+            ),
+            # Distinct on two foreign keys; a single row is expected.
+            (
+                StaffTag.objects.distinct('staff','tag'),
+                ['<StaffTag: t1 -> p1>'],
+            ),
+            # Distinct on a foreign key while ordering across that relation.
+            (
+                Tag.objects.order_by('parent__pk', 'pk').distinct('parent'),
+                ['<Tag: t2>', '<Tag: t4>', '<Tag: t1>'],
+            ),
+            # Distinct on a field of a related model, with select_related().
+            (
+                StaffTag.objects.select_related('staff').distinct('staff__name').order_by('staff__name'),
+                ['<StaffTag: t1 -> p1>'],
+            ),
+            # Fetch the alphabetically first coworker for each worker
+            (
+                (Staff.objects.distinct('id').order_by('id', 'coworkers__name').
+                               values_list('id', 'coworkers__name')),
+                ["(1, u'p2')", "(2, u'p1')", "(3, u'p1')", "(4, None)"]
+            ),
+        )
+        for qset, expected in qsets:
+            self.assertQuerysetEqual(qset, expected)
+            # count() must agree with the number of rows actually returned.
+            self.assertEqual(qset.count(), len(expected))
+        # Combining queries with different distinct_fields is not allowed.
+        base_qs = Celebrity.objects.all()
+        self.assertRaisesMessage(
+            AssertionError,
+            "Cannot combine queries with different distinct fields.",
+            lambda: (base_qs.distinct('id') & base_qs.distinct('name'))
+        )
+        # Test join unreffing: replacing the distinct fields with 'pk' must
+        # leave no OUTER JOIN from the earlier distinct() call in the SQL.
+        c1 = Celebrity.objects.distinct('greatest_fan__id', 'greatest_fan__fan_of')
+        self.assertIn('OUTER JOIN', str(c1.query))
+        c2 = c1.distinct('pk')
+        self.assertNotIn('OUTER JOIN', str(c2.query))
 class UnionTests(unittest.TestCase):
     """
     Tests for the union of two querysets. Bug #12252.

tests/regressiontests/select_related_regress/tests.py

diff --git a/tests/regressiontests/select_related_regress/tests.py b/tests/regressiontests/select_related_regress/tests.py
index 4818b95..4cd4f78 100644

                class SelectRelatedRegressTests(TestCase):
         self.assertEqual([(c.id, unicode(c.start), unicode(c.end)) for c in connections],
             [(c1.id, u'router/4', u'switch/7'), (c2.id, u'switch/7', u'server/1')])
-        # This final query should only join seven tables (port, device and building
-        # twice each, plus connection once).
-        self.assertEqual(connections.query.count_active_tables(), 7)
+        # This final query should only have seven tables (port, device and building
+        # twice each, plus connection once). Thus, 6 joins plus the FROM table.
+        self.assertEqual(str(connections.query).count(" JOIN "), 6)
     def test_regression_8106(self):

Download in other formats:

Original Format

Issues

Context Navigation

Ticket #6422: distinct_on.11.diff

AUTHORS

django/db/backends/__init__.py

django/db/backends/postgresql_psycopg2/base.py

django/db/backends/postgresql_psycopg2/operations.py

django/db/models/query.py

django/db/models/sql/compiler.py

django/db/models/sql/query.py

docs/ref/models/querysets.txt

tests/regressiontests/queries/models.py

tests/regressiontests/queries/tests.py

tests/regressiontests/select_related_regress/tests.py

Download in other formats:

Django Links

Learn More

Get Involved

Get Help

Follow Us

Support Us