Ticket #6422: distinct_on.11.diff

File distinct_on.11.diff, 23.5 KB (added by Anssi Kääriäinen, 13 years ago)

Possible approach for using correct aliases

  • AUTHORS

    diff --git a/AUTHORS b/AUTHORS
    index cec5db1..06ba219 100644
    a b answer newbie questions, and generally made Django that much better:  
    203203    Marc Garcia <marc.garcia@accopensys.com>
    204204    Andy Gayton <andy-django@thecablelounge.com>
    205205    geber@datacollect.com
     206    Jeffrey Gelens <jeffrey@gelens.org>
    206207    Baishampayan Ghose
    207208    Joshua Ginsberg <jag@flowtheory.net>
    208209    Dimitris Glezos <dimitris@glezos.com>
  • django/db/backends/__init__.py

    diff --git a/django/db/backends/__init__.py b/django/db/backends/__init__.py
    index f2bde84..126b5fc 100644
    a b class BaseDatabaseFeatures(object):  
    379379    supports_stddev = None
    380380    can_introspect_foreign_keys = None
    381381
     382    # Support for the DISTINCT ON clause
     383    can_distinct_on_fields = False
     384
    382385    def __init__(self, connection):
    383386        self.connection = connection
    384387
    class BaseDatabaseOperations(object):  
    532535        """
    533536        raise NotImplementedError('Full-text search is not implemented for this database backend')
    534537
     538    def distinct(self, fields):
     539        """
     540        Returns an SQL DISTINCT clause which removes duplicate rows from the
     541        result set. If any fields are given, only the given fields are being
     542        checked for duplicates.
     543        """
     544        if fields:
     545            raise NotImplementedError('DISTINCT ON fields is not supported by this database backend')
     546        else:
     547            return 'DISTINCT'
     548
    535549    def last_executed_query(self, cursor, sql, params):
    536550        """
    537551        Returns a string of the query last executed by the given cursor, with
  • django/db/backends/postgresql_psycopg2/base.py

    diff --git a/django/db/backends/postgresql_psycopg2/base.py b/django/db/backends/postgresql_psycopg2/base.py
    index c816237..cff3fa5 100644
    a b class DatabaseFeatures(BaseDatabaseFeatures):  
    8282    has_select_for_update_nowait = True
    8383    has_bulk_insert = True
    8484    supports_tablespaces = True
     85    can_distinct_on_fields = True
    8586
    8687class DatabaseWrapper(BaseDatabaseWrapper):
    8788    vendor = 'postgresql'
  • django/db/backends/postgresql_psycopg2/operations.py

    diff --git a/django/db/backends/postgresql_psycopg2/operations.py b/django/db/backends/postgresql_psycopg2/operations.py
    index acfeeaf..fe01566 100644
    a b class DatabaseOperations(BaseDatabaseOperations):  
    179179
    180180        return 63
    181181
     182    def distinct(self, fields):
     183        if fields:
     184            return 'DISTINCT ON (%s)' % ', '.join(fields)
     185        else:
     186            return 'DISTINCT'
     187
    182188    def last_executed_query(self, cursor, sql, params):
    183189        # http://initd.org/psycopg/docs/cursor.html#cursor.query
    184190        # The query attribute is a Psycopg extension to the DB API 2.0.
  • django/db/models/query.py

    diff --git a/django/db/models/query.py b/django/db/models/query.py
    index c752049..43b3954 100644
    a b class QuerySet(object):  
    751751        obj.query.add_ordering(*field_names)
    752752        return obj
    753753
    754     def distinct(self, true_or_false=True):
     754    def distinct(self, *field_names):
    755755        """
    756756        Returns a new QuerySet instance that will select only distinct results.
    757757        """
     758        assert self.query.can_filter(), \
     759                "Cannot create distinct fields once a slice has been taken."
    758760        obj = self._clone()
    759         obj.query.distinct = true_or_false
     761        obj.query.add_distinct_fields(*field_names)
    760762        return obj
    761763
    762764    def extra(self, select=None, where=None, params=None, tables=None,
    class EmptyQuerySet(QuerySet):  
    11791181        """
    11801182        return self
    11811183
    1182     def distinct(self, true_or_false=True):
     1184    def distinct(self, fields=None):
    11831185        """
    11841186        Always returns EmptyQuerySet.
    11851187        """
  • django/db/models/sql/compiler.py

    diff --git a/django/db/models/sql/compiler.py b/django/db/models/sql/compiler.py
    index cebd77f..62a38ec 100644
    a b class SQLCompiler(object):  
    2323        Does any necessary class setup immediately prior to producing SQL. This
    2424        is for things that can't necessarily be done in __init__ because we
    2525        might not have all the pieces in place at that time.
     26        # TODO: after the query has been executed, the altered state should be
     27        # cleaned. We are not using a clone() of the query here.
    2628        """
    2729        if not self.query.tables:
    2830            self.query.join((None, self.query.model._meta.db_table, None, None))
    class SQLCompiler(object):  
    6062            return '', ()
    6163
    6264        self.pre_sql_setup()
     65        # After executing the query, we must get rid of any joins the query
     66        # setup created. So, take note of alias counts before the query ran.
     67        # However we do not want to get rid of stuff done in pre_sql_setup(),
     68        # as the pre_sql_setup will modify query state in a way that forbids
     69        # another run of it.
     70        self.refcounts_before = self.query.alias_refcount.copy()
    6371        out_cols = self.get_columns(with_col_aliases)
    6472        ordering, ordering_group_by = self.get_ordering()
    6573
    66         # This must come after 'select' and 'ordering' -- see docstring of
    67         # get_from_clause() for details.
     74        distinct_fields = self.get_distinct()
     75
     76        # This must come after 'select', 'ordering' and 'distinct' -- see
     77        # docstring of get_from_clause() for details.
    6878        from_, f_params = self.get_from_clause()
    6979
    7080        qn = self.quote_name_unless_alias
    class SQLCompiler(object):  
    7686            params.extend(val[1])
    7787
    7888        result = ['SELECT']
     89
    7990        if self.query.distinct:
    80             result.append('DISTINCT')
     91            result.append(self.connection.ops.distinct(distinct_fields))
     92
    8193        result.append(', '.join(out_cols + self.query.ordering_aliases))
    8294
    8395        result.append('FROM')
    class SQLCompiler(object):  
    129141                raise DatabaseError('NOWAIT is not supported on this database backend.')
    130142            result.append(self.connection.ops.for_update_sql(nowait=nowait))
    131143
     144        # Finally do cleanup - get rid of the joins we created above.
     145        self.query.reset_refcounts(self.refcounts_before)
     146
    132147        return ' '.join(result), tuple(params)
    133148
    134149    def as_nested_sql(self):
    class SQLCompiler(object):  
    292307                    col_aliases.add(field.column)
    293308        return result, aliases
    294309
     310    def get_distinct(self):
     311        """
     312        Returns a quoted list of fields to use in DISTINCT ON part of the query.
     313
     314        Note that this method can alter the tables in the query, and thus this
     315        must be called before get_from_clause().
     316        """
     317        qn = self.quote_name_unless_alias
     318        qn2 = self.connection.ops.quote_name
     319        result = []
     320        options = self.query.model._meta
     321
     322        for name in self.query.distinct_fields:
     323            # We do pretty much the same join creation & promotion & trimming as in
     324            # get_ordering
     325            field, target, opts, joins, last, extra = self.query.setup_joins(
     326                name.split(LOOKUP_SEP), options, self.query.get_initial_alias(), False)
     327            alias = joins[-1]
     328            col = target.column
     329            if not field.rel:
     330                # To avoid inadvertent trimming of a necessary alias, use the
     331                # refcount to show that we are referencing a non-relation field on
     332                # the model.
     333                self.query.ref_alias(alias)
     334            # Must use left outer joins for nullable fields and their relations.
     335            self.query.promote_alias_chain(joins,
     336                self.query.alias_map[joins[0]][JOIN_TYPE] == self.query.LOUTER)
     337            if alias:
     338                # We have to do the same "final join" optimisation as in
     339                # add_filter, since the final column might not otherwise be part of
     340                # the select set (so we can't order on it).
     341                while 1:
     342                    join = self.query.alias_map[alias]
     343                    if col != join[RHS_JOIN_COL]:
     344                        break
     345                    self.query.unref_alias(alias)
     346                    alias = join[LHS_ALIAS]
     347                    col = join[LHS_JOIN_COL]
     348            result.append("%s.%s" % (qn(alias), qn2(col)))
     349        return result
     350
     351
    295352    def get_ordering(self):
    296353        """
    297354        Returns a tuple containing a list representing the SQL elements in the
    class SQLCompiler(object):  
    438495        from-clause via a "select".
    439496
    440497        This should only be called after any SQL construction methods that
    441         might change the tables we need. This means the select columns and
    442         ordering must be done first.
     498        might change the tables we need. This means the select columns,
     499        ordering and distinct must be done first.
    443500        """
    444501        result = []
    445502        qn = self.quote_name_unless_alias
    class SQLAggregateCompiler(SQLCompiler):  
    9841041        """
    9851042        if qn is None:
    9861043            qn = self.quote_name_unless_alias
     1044
    9871045        sql = ('SELECT %s FROM (%s) subquery' % (
    9881046            ', '.join([
    9891047                aggregate.as_sql(qn, self.connection)
  • django/db/models/sql/query.py

    diff --git a/django/db/models/sql/query.py b/django/db/models/sql/query.py
    index 4afe288..362f6fd 100644
    a b class Query(object):  
    127127        self.order_by = []
    128128        self.low_mark, self.high_mark = 0, None  # Used for offset/limit
    129129        self.distinct = False
     130        self.distinct_fields = []
    130131        self.select_for_update = False
    131132        self.select_for_update_nowait = False
    132133        self.select_related = False
    class Query(object):  
    265266        obj.order_by = self.order_by[:]
    266267        obj.low_mark, obj.high_mark = self.low_mark, self.high_mark
    267268        obj.distinct = self.distinct
     269        obj.distinct_fields = self.distinct_fields[:]
    268270        obj.select_for_update = self.select_for_update
    269271        obj.select_for_update_nowait = self.select_for_update_nowait
    270272        obj.select_related = self.select_related
    class Query(object):  
    298300        else:
    299301            obj.used_aliases = set()
    300302        obj.filter_is_sticky = False
     303
    301304        obj.__dict__.update(kwargs)
    302305        if hasattr(obj, '_setup_query'):
    303306            obj._setup_query()
    class Query(object):  
    393396        Performs a COUNT() query using the current filter constraints.
    394397        """
    395398        obj = self.clone()
    396         if len(self.select) > 1 or self.aggregate_select:
     399        if len(self.select) > 1 or self.aggregate_select or (self.distinct and self.distinct_fields):
    397400            # If a select clause exists, then the query has already started to
    398401            # specify the columns that are to be returned.
    399402            # In this case, we need to use a subquery to evaluate the count.
    class Query(object):  
    452455                "Cannot combine queries once a slice has been taken."
    453456        assert self.distinct == rhs.distinct, \
    454457            "Cannot combine a unique query with a non-unique query."
     458        assert self.distinct_fields == rhs.distinct_fields, \
     459            "Cannot combine queries with different distinct fields."
    455460
    456461        self.remove_inherited_models()
    457462        # Work out how to relabel the rhs aliases, if necessary.
    class Query(object):  
    674679        """ Increases the reference count for this alias. """
    675680        self.alias_refcount[alias] += 1
    676681
    677     def unref_alias(self, alias):
     682    def unref_alias(self, alias, amount=1):
    678683        """ Decreases the reference count for this alias. """
    679         self.alias_refcount[alias] -= 1
     684        self.alias_refcount[alias] -= amount
    680685
    681686    def promote_alias(self, alias, unconditional=False):
    682687        """
    class Query(object):  
    705710            if self.promote_alias(alias, must_promote):
    706711                must_promote = True
    707712
     713    def reset_refcounts(self, to_counts):
     714        """
     715        This method will reset reference counts for aliases so that they match
     716        that given in to_counts.
     717        """
     718        for alias, cur_refcount in self.alias_refcount.copy().items():
     719            unref_amount = cur_refcount - to_counts.get(alias, 0)
     720            self.unref_alias(alias, unref_amount)
     721
    708722    def promote_unused_aliases(self, initial_refcounts, used_aliases):
    709723        """
    710724        Given a "before" copy of the alias_refcounts dictionary (as
    class Query(object):  
    832846    def count_active_tables(self):
    833847        """
    834848        Returns the number of tables in this query with a non-zero reference
    835         count.
     849        count. Note that after execution, the reference counts are zeroed, so
     850        tables added in compiler will not be seen by this method.
    836851        """
    837852        return len([1 for count in self.alias_refcount.itervalues() if count])
    838853
    class Query(object):  
    15961611        self.select = []
    15971612        self.select_fields = []
    15981613
     1614    def add_distinct_fields(self, *field_names):
     1615        """
     1616        Adds and resolves the given fields to the query's "distinct on" clause.
     1617        """
     1618        self.distinct_fields = field_names
     1619        self.distinct = True
     1620
     1621
    15991622    def add_fields(self, field_names, allow_m2m=True):
    16001623        """
    16011624        Adds the given (model) fields to the select set. The field names are
  • docs/ref/models/querysets.txt

    diff --git a/docs/ref/models/querysets.txt b/docs/ref/models/querysets.txt
    index 6f2cad3..b7bc647 100644
    a b remain undefined afterward).  
    345345distinct
    346346~~~~~~~~
    347347
    348 .. method:: distinct()
     348.. method:: distinct([*fields])
    349349
    350350Returns a new ``QuerySet`` that uses ``SELECT DISTINCT`` in its SQL query. This
    351351eliminates duplicate rows from the query results.
    query spans multiple tables, it's possible to get duplicate results when a  
    374374    :meth:`values()` together, be careful when ordering by fields not in the
    375375    :meth:`values()` call.
    376376
     377.. versionadded:: 1.4
     378
     379The possibility to pass positional arguments (``*fields``) is new in Django 1.4.
     380They are names of fields to which the ``DISTINCT`` should be limited. This
     381translates to a ``SELECT DISTINCT ON`` SQL query. A ``DISTINCT ON`` query eliminates
     382duplicate rows not by comparing all fields in a row, but by comparing only the given
     383fields.
     384
     385.. note::
     386    Note that the ability to specify field names is only available in PostgreSQL.
     387
     388.. note::
     389    When using the ``DISTINCT ON`` functionality it is required that the columns given
     390    to :meth:`distinct` match the first :meth:`order_by` columns. For example ``SELECT
     391    DISTINCT ON (a)`` gives you the first row for each value in column ``a``. If you
     392    don't specify an order, then you'll get some arbitrary row.
     393
     394Examples::
     395
     396    >>> Author.objects.distinct()
     397    [...]
     398
     399    >>> Entry.objects.order_by('pub_date').distinct('pub_date')
     400    [...]
     401
     402    >>> Entry.objects.order_by('blog').distinct('blog')
     403    [...]
     404
     405    >>> Entry.objects.order_by('author', 'pub_date').distinct('author', 'pub_date')
     406    [...]
     407
     408    >>> Entry.objects.order_by('blog__name', 'mod_date').distinct('blog__name', 'mod_date')
     409    [...]
     410
     411    >>> Entry.objects.order_by('author', 'pub_date').distinct('author')
     412    [...]
     413
    377414values
    378415~~~~~~
    379416
  • tests/regressiontests/queries/models.py

    diff --git a/tests/regressiontests/queries/models.py b/tests/regressiontests/queries/models.py
    index e69ce48..6ad9986 100644
    a b class Celebrity(models.Model):  
    209209    name = models.CharField("Name", max_length=20)
    210210    greatest_fan = models.ForeignKey("Fan", null=True, unique=True)
    211211
     212    def __unicode__(self):
     213        return self.name
     214
    212215class TvChef(Celebrity):
    213216    pass
    214217
    class OneToOneCategory(models.Model):  
    344347    def __unicode__(self):
    345348        return "one2one " + self.new_name
    346349
     350class Staff(models.Model):
     351    id = models.IntegerField(primary_key=True)
     352    name = models.CharField(max_length=50)
     353    organisation = models.CharField(max_length=100)
     354    tags = models.ManyToManyField(Tag, through='StaffTag')
     355    coworkers = models.ManyToManyField('self')
     356
     357    def __unicode__(self):
     358        return self.name
     359
     360class StaffTag(models.Model):
     361    staff = models.ForeignKey(Staff)
     362    tag = models.ForeignKey(Tag)
     363
     364    def __unicode__(self):
     365        return u"%s -> %s" % (self.tag, self.staff)
  • tests/regressiontests/queries/tests.py

    diff --git a/tests/regressiontests/queries/tests.py b/tests/regressiontests/queries/tests.py
    index 6a54125..fca99be 100644
    a b from .models import (Annotation, Article, Author, Celebrity, Child, Cover,  
    1818    ManagedModel, Member, NamedCategory, Note, Number, Plaything, PointerA,
    1919    Ranking, Related, Report, ReservedName, Tag, TvChef, Valid, X, Food, Eaten,
    2020    Node, ObjectA, ObjectB, ObjectC, CategoryItem, SimpleCategory,
    21     SpecialCategory, OneToOneCategory)
     21    SpecialCategory, OneToOneCategory, Staff, StaffTag)
    2222
    2323
    2424class BaseQuerysetTest(TestCase):
    class Queries1Tests(BaseQuerysetTest):  
    234234            ['<Item: four>', '<Item: one>']
    235235        )
    236236
    237     # FIXME: This is difficult to fix and very much an edge case, so punt for
    238     # now.  This is related to the order_by() tests for ticket #2253, but the
    239     # old bug exhibited itself here (q2 was pulling too many tables into the
    240     # combined query with the new ordering, but only because we have evaluated
    241     # q2 already).
    242     @unittest.expectedFailure
    243237    def test_order_by_tables(self):
    244238        q1 = Item.objects.order_by('name')
    245239        q2 = Item.objects.filter(id=self.i1.id)
    246240        list(q2)
    247241        self.assertEqual(len((q1 & q2).order_by('name').query.tables), 1)
    248242
     243    def test_order_by_join_unref(self):
     244        """
     245        This test is related to the above one, testing that there aren't
     246        old JOINs in the query.
     247        """
     248        qs = Celebrity.objects.order_by('greatest_fan__fan_of')
     249        self.assertIn('OUTER JOIN', str(qs.query))
     250        qs = qs.order_by('id')
     251        self.assertNotIn('OUTER JOIN', str(qs.query))
     252
    249253    def test_tickets_4088_4306(self):
    250254        self.assertQuerysetEqual(
    251255            Report.objects.filter(creator=1001),
    class ConditionalTests(BaseQuerysetTest):  
    17391743        t4 = Tag.objects.create(name='t4', parent=t3)
    17401744        t5 = Tag.objects.create(name='t5', parent=t3)
    17411745
     1746        p1_o1 = Staff.objects.create(id=1, name="p1", organisation="o1")
     1747        p2_o1 = Staff.objects.create(id=2, name="p2", organisation="o1")
     1748        p3_o1 = Staff.objects.create(id=3, name="p3", organisation="o1")
     1749        p1_o2 = Staff.objects.create(id=4, name="p1", organisation="o2")
     1750        p1_o1.coworkers.add(p2_o1, p3_o1)
     1751        StaffTag.objects.create(staff=p1_o1, tag=t1)
     1752        StaffTag.objects.create(staff=p1_o1, tag=t1)
     1753
     1754        celeb1 = Celebrity.objects.create(name="c1")
     1755        celeb2 = Celebrity.objects.create(name="c2")
     1756
     1757        self.fan1 = Fan.objects.create(fan_of=celeb1)
     1758        self.fan2 = Fan.objects.create(fan_of=celeb1)
     1759        self.fan3 = Fan.objects.create(fan_of=celeb2)
     1760
    17421761    # In Python 2.6 beta releases, exceptions raised in __len__ are swallowed
    17431762    # (Python issue 1242657), so these cases return an empty list, rather than
    17441763    # raising an exception. Not a lot we can do about that, unfortunately, due to
    class ConditionalTests(BaseQuerysetTest):  
    18101829            2500
    18111830        )
    18121831
     1832    @skipUnlessDBFeature('can_distinct_on_fields')
     1833    def test_ticket6422(self):
     1834        """QuerySet.distinct('field', ...) works"""
     1835        # (qset, expected) tuples
     1836        qsets = (
     1837            (
     1838                Staff.objects.distinct().order_by('name'),
     1839                ['<Staff: p1>', '<Staff: p1>', '<Staff: p2>', '<Staff: p3>'],
     1840            ),
     1841            (
     1842                Staff.objects.distinct('name').order_by('name'),
     1843                ['<Staff: p1>', '<Staff: p2>', '<Staff: p3>'],
     1844            ),
     1845            (
     1846                Staff.objects.distinct('organisation').order_by('organisation', 'name'),
     1847                ['<Staff: p1>', '<Staff: p1>'],
     1848            ),
     1849            (
     1850                Staff.objects.distinct('name', 'organisation').order_by('name', 'organisation'),
     1851                ['<Staff: p1>', '<Staff: p1>', '<Staff: p2>', '<Staff: p3>'],
     1852            ),
     1853            (
     1854                Celebrity.objects.filter(fan__in=[self.fan1, self.fan2, self.fan3]).\
     1855                    distinct('name').order_by('name'),
     1856                ['<Celebrity: c1>', '<Celebrity: c2>'],
     1857            ),
     1858            # Does combining querysets work?
     1859            (
     1860                (Celebrity.objects.filter(fan__in=[self.fan1, self.fan2]).\
     1861                    distinct('name').order_by('name')
     1862                |Celebrity.objects.filter(fan__in=[self.fan3]).\
     1863                    distinct('name').order_by('name')),
     1864                ['<Celebrity: c1>', '<Celebrity: c2>'],
     1865            ),
     1866            (
     1867                StaffTag.objects.distinct('staff','tag'),
     1868                ['<StaffTag: t1 -> p1>'],
     1869            ),
     1870            (
     1871                Tag.objects.order_by('parent__pk', 'pk').distinct('parent'),
     1872                ['<Tag: t2>', '<Tag: t4>', '<Tag: t1>'],
     1873            ),
     1874            (
     1875                StaffTag.objects.select_related('staff').distinct('staff__name').order_by('staff__name'),
     1876                ['<StaffTag: t1 -> p1>'],
     1877            ),
     1878            # Fetch the alphabetically first coworker for each worker
     1879            (
     1880                (Staff.objects.distinct('id').order_by('id', 'coworkers__name').
     1881                               values_list('id', 'coworkers__name')),
     1882                ["(1, u'p2')", "(2, u'p1')", "(3, u'p1')", "(4, None)"]
     1883            ),
     1884        )
     1885        for qset, expected in qsets:
     1886            self.assertQuerysetEqual(qset, expected)
     1887            self.assertEqual(qset.count(), len(expected))
     1888
     1889        # Combining queries with different distinct_fields is not allowed.
     1890        base_qs = Celebrity.objects.all()
     1891        self.assertRaisesMessage(
     1892            AssertionError,
     1893            "Cannot combine queries with different distinct fields.",
     1894            lambda: (base_qs.distinct('id') & base_qs.distinct('name'))
     1895        )
     1896
     1897        # Test join unreffing
     1898        c1 = Celebrity.objects.distinct('greatest_fan__id', 'greatest_fan__fan_of')
     1899        self.assertIn('OUTER JOIN', str(c1.query))
     1900        c2 = c1.distinct('pk')
     1901        self.assertNotIn('OUTER JOIN', str(c2.query))
     1902
    18131903class UnionTests(unittest.TestCase):
    18141904    """
    18151905    Tests for the union of two querysets. Bug #12252.
  • tests/regressiontests/select_related_regress/tests.py

    diff --git a/tests/regressiontests/select_related_regress/tests.py b/tests/regressiontests/select_related_regress/tests.py
    index 4818b95..4cd4f78 100644
    a b class SelectRelatedRegressTests(TestCase):  
    4040        self.assertEqual([(c.id, unicode(c.start), unicode(c.end)) for c in connections],
    4141            [(c1.id, u'router/4', u'switch/7'), (c2.id, u'switch/7', u'server/1')])
    4242
    43         # This final query should only join seven tables (port, device and building
    44         # twice each, plus connection once).
    45         self.assertEqual(connections.query.count_active_tables(), 7)
     43        # This final query should only have seven tables (port, device and building
     44        # twice each, plus connection once). Thus, 6 joins plus the FROM table.
     45        self.assertEqual(str(connections.query).count(" JOIN "), 6)
    4646
    4747
    4848    def test_regression_8106(self):
Back to Top