简体   繁体   中英

Efficient join using Django REST Framework serializers

I am using the set of serializers below to achieve a join, which works fine on my development setup but performs terribly when there is any distance between the website server and the database server. I got suspicious about the SQL that's running the show and did some logging; it seems that it's doing a new query for every entry and combining the results rather than doing the entire join at once and returning the join like I want. Here are my serializers:

class UserSerializer(serializers.ModelSerializer):
    """Serialize a User, leaving out credential, status and permission fields."""

    class Meta:
        model = User
        # Every model field not named here is exposed.
        exclude = (
            'password',
            'last_login',
            'is_superuser',
            'is_staff',
            'is_active',
            'date_joined',
            'groups',
            'user_permissions',
        )


class DepartmentSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize a Department with its curator nested via UserSerializer."""

    # Nested representation of the department's curator.
    curator = UserSerializer()

    class Meta:
        model = Department
        fields = '__all__'


class CategorySerializer(serializers.HyperlinkedModelSerializer):
    """Serialize every field of a Category."""

    class Meta:
        model = Category
        fields = '__all__'


class DetailedLinkedContentSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize LinkedContent with nested category and department objects."""

    # Nested representations of the two related objects.
    category = CategorySerializer()
    department = DepartmentSerializer()
    # Computed by get_type().
    type = serializers.SerializerMethodField()

    class Meta:
        model = LinkedContent
        fields = '__all__'

    def get_type(self, obj):
        """Return the constant type label used for linked content."""
        return 'link'


class DetailedFileContentSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize FileContent with nested category and department objects."""

    # Nested representations of the two related objects.
    category = CategorySerializer()
    department = DepartmentSerializer()
    # Computed by get_link_url() and get_type() respectively.
    link_url = serializers.SerializerMethodField()
    type = serializers.SerializerMethodField()

    class Meta:
        model = FileContent
        fields = '__all__'

    def get_link_url(self, obj):
        """Return the URL of the file attached to ``obj``."""
        return obj.file.url

    def get_type(self, obj):
        """Return the ``file_type`` stored on ``obj``."""
        return obj.file_type

As you can see, I'm doing my 'join' by including fields in a serializer as serializers of other models, such as category = CategorySerializer(). It looks like that's what DRF recommends, unless I'm misunderstanding something. Here is a small sample of the hundreds and hundreds of queries being run on my development environment:

(0.001) SELECT "content_linkedcontent"."id", "content_linkedcontent"."link_text", "content_linkedcontent"."department_id", "content_linkedcontent"."category_id", "content_linkedcontent"."visibility_rank", "content_linkedcontent"."link_url" FROM "content_linkedcontent"; args=()
(0.001) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 3; args=(3,)
(0.001) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 24; args=(24,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 3; args=(3,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 3; args=(3,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 29; args=(29,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 6; args=(6,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 4; args=(4,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 25; args=(25,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 6; args=(6,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 1; args=(1,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 29; args=(29,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 6; args=(6,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 1; args=(1,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 25; args=(25,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 6; args=(6,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 1; args=(1,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 24; args=(24,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 3; args=(3,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 3; args=(3,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 28; args=(28,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 6; args=(6,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 1; args=(1,)
(0.000) SELECT "content_department"."id", "content_department"."name", "content_department"."description", "content_department"."curator_id", "content_department"."visibility_rank" FROM "content_department" WHERE "content_department"."id" = 28; args=(28,)
(0.000) SELECT "auth_user"."id", "auth_user"."password", "auth_user"."last_login", "auth_user"."is_superuser", "auth_user"."username", "auth_user"."first_name", "auth_user"."last_name", "auth_user"."email", "auth_user"."is_staff", "auth_user"."is_active", "auth_user"."date_joined" FROM "auth_user" WHERE "auth_user"."id" = 6; args=(6,)
(0.000) SELECT "content_category"."id", "content_category"."name", "content_category"."description" FROM "content_category" WHERE "content_category"."id" = 4; args=(4,)

So how can I do a real join with the information that I want using serializers in DRF?

UPDATE:

I have managed to cut the query time in half by following the advice in this blog entry; here are my updated serializers and the view that uses them:

class DetailedLinkedContentSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize LinkedContent with nested category and department objects."""
    category = CategorySerializer()
    # NOTE(review): the next three statements run once, while the class body
    # is executed, and leave `department_query` behind as a class attribute.
    # The queryset is handed to DepartmentSerializer as its first positional
    # argument; presumably DRF treats that as `instance` and does not use it
    # when the serializer acts as a nested field, so it would have no effect
    # on how curators are loaded -- confirm against the DRF documentation.
    department_query = Department.objects.all()
    department_query = DepartmentSerializer.setup_eager_loading(department_query)
    department = DepartmentSerializer(department_query)
    # Previous declaration, kept for comparison:
    # department = DepartmentSerializer()
    type = serializers.SerializerMethodField()

    class Meta:
        fields = '__all__'
        model = LinkedContent

    def get_type(self, obj):
        """Return the constant type label used for linked content."""
        return 'link'

    @staticmethod
    def setup_eager_loading(queryset):
        """Return ``queryset`` with ``category`` and ``department`` joined in."""
        # Only these two relations are joined here; the department's curator
        # is not part of this select_related() call.
        queryset = queryset.select_related('category', 'department')
        return queryset


class DetailedFileContentSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize FileContent with nested category and department objects."""
    category = CategorySerializer()
    # NOTE(review): the next three statements run once, while the class body
    # is executed, and leave `department_query` behind as a class attribute.
    # The queryset is handed to DepartmentSerializer as its first positional
    # argument; presumably DRF treats that as `instance` and does not use it
    # when the serializer acts as a nested field, so it would have no effect
    # on how curators are loaded -- confirm against the DRF documentation.
    department_query = Department.objects.all()
    department_query = DepartmentSerializer.setup_eager_loading(department_query)
    department = DepartmentSerializer(department_query)
    # Previous declaration, kept for comparison:
    # department = DepartmentSerializer()
    link_url = serializers.SerializerMethodField()
    type = serializers.SerializerMethodField()

    class Meta:
        fields = '__all__'
        model = FileContent

    def get_link_url(self, obj):
        """Return the URL of the file attached to ``obj``."""
        return obj.file.url

    def get_type(self, obj):
        """Return the ``file_type`` stored on ``obj``."""
        return obj.file_type

    @staticmethod
    def setup_eager_loading(queryset):
        """Return ``queryset`` with ``category`` and ``department`` joined in."""
        # Only these two relations are joined here; the department's curator
        # is not part of this select_related() call.
        queryset = queryset.select_related('category', 'department')
        return queryset

and my view that makes use of those serializers:

class DetailedContentView(views.APIView):
    """List every linked and file content item, ordered by visibility rank."""

    permission_classes = [IsAuthenticated, ContentCuratorOrReadOnly, IsGroupMember]

    def get(self, request, *args, **kwargs):
        """Return serialized linked and file content as one sorted list.

        Items are ordered by their department's ``visibility_rank`` first and
        by their own ``visibility_rank`` second.
        """
        context = {"request": request}

        # Let each serializer apply its own eager loading before the queryset
        # is evaluated, so the nested serializers read the related objects
        # from the joined rows.
        linked_content = DetailedLinkedContentSerializer.setup_eager_loading(
            LinkedContent.objects.all()
        )
        # The file queryset goes through the *file* serializer's loader, so
        # the two stay correct even if their eager-loading rules diverge.
        file_content = DetailedFileContentSerializer.setup_eager_loading(
            FileContent.objects.all()
        )

        linked_content_serializer = DetailedLinkedContentSerializer(
            linked_content, many=True, context=context
        )
        file_content_serializer = DetailedFileContentSerializer(
            file_content, many=True, context=context
        )

        # Merge both result lists, then order the combined list.
        response = linked_content_serializer.data + file_content_serializer.data
        response = sorted(
            response,
            key=lambda x: (x['department']['visibility_rank'], x['visibility_rank']),
        )

        return Response(response)

However, my solution to prefetch my Users serializer within my Department serializer doesn't seem to be doing the trick. Specifically, after updating my department serializer to:

class DepartmentSerializer(serializers.HyperlinkedModelSerializer):
    """Serialize a Department with its curator nested via UserSerializer."""

    # Nested representation of the department's curator.
    curator = UserSerializer()

    class Meta:
        model = Department
        fields = '__all__'

    @staticmethod
    def setup_eager_loading(queryset):
        """Return ``queryset`` with the ``curator`` relation joined in."""
        return queryset.select_related('curator')

The following lines:

# NOTE(review): the eagerly-loaded queryset is passed to DepartmentSerializer
# as its first positional argument; presumably DRF treats that as `instance`
# and does not use it for a nested field -- confirm against the DRF docs.
department_query = Department.objects.all()
department_query = DepartmentSerializer.setup_eager_loading(department_query)
department = DepartmentSerializer(department_query)

don't seem to be prefetching my curators as I want.

I updated my serializers using a mixin from a comment found in this blog:

class EagerLoadingMixin:
    """Serializer mixin that applies declared eager-loading hints to a queryset.

    Subclasses may define ``_SELECT_RELATED_FIELDS`` and/or
    ``_PREFETCH_RELATED_FIELDS`` as sequences of relation names.
    """

    @classmethod
    def setup_eager_loading(cls, queryset):
        """Return ``queryset`` with the class's eager-loading hints applied.

        A hint that is missing, ``None`` or empty is skipped and the queryset
        is passed through unchanged for that hint.
        """
        select_fields = getattr(cls, "_SELECT_RELATED_FIELDS", None)
        # Skip empty hints: select_related() with no arguments would make
        # Django follow every non-null foreign key instead of none.
        if select_fields:
            queryset = queryset.select_related(*select_fields)

        prefetch_fields = getattr(cls, "_PREFETCH_RELATED_FIELDS", None)
        if prefetch_fields:
            queryset = queryset.prefetch_related(*prefetch_fields)

        return queryset

and added 'department__curator' to my list of select_related fields (_SELECT_RELATED_FIELDS). Now it looks like this:

class DetailedFileContentSerializer(EagerLoadingMixin, serializers.HyperlinkedModelSerializer):
    """Serialize FileContent with nested category and department objects."""

    # Relations read by EagerLoadingMixin.setup_eager_loading().
    _SELECT_RELATED_FIELDS = ['department', 'category', 'department__curator']

    # Nested representations of the two related objects.
    category = CategorySerializer()
    department = DepartmentSerializer()
    # Computed by get_link_url() and get_type() respectively.
    link_url = serializers.SerializerMethodField()
    type = serializers.SerializerMethodField()

    class Meta:
        model = FileContent
        fields = '__all__'

    def get_link_url(self, obj):
        """Return the URL of the file attached to ``obj``."""
        return obj.file.url

    def get_type(self, obj):
        """Return the ``file_type`` stored on ``obj``."""
        return obj.file_type

The query now takes roughly 1/3 of the time and doesn't contain hundreds of SELECTs. It still takes too long, but I believe I can solve that by switching to a different hosting solution for my database.

I'm not sure if my solution also works, but I tried solving this problem using a custom manager, as someone else suggested in a comment on the blog post. (This is more of a Django solution than a DRF solution.) Note that this also fetches related objects even when you're not using a serializer (e.g. SomeModel.objects.all()), which may or may not be what you want. Hopefully this answer will get better after a peer review.

Assumptions:

  • User - Listing: one-to-many

  • Listing - Item: many-to-many

  • Listing - Like: one-to-many

class PreFetchMixin:
    """Manager mixin that applies declared eager-loading hints to every queryset.

    Classes using the mixin may define ``_SELECT_RELATED_FIELDS`` and
    ``_PREFETCH_RELATED_FIELDS`` (sequences of relation names) and
    ``_ANNOTATIONS`` (a mapping of annotation name to expression).
    """

    def get_queryset(self):
        """Return the parent's queryset with the declared hints applied.

        A hint that is missing, ``None`` or empty is skipped.
        """
        queryset = super().get_queryset()

        select_fields = getattr(self, '_SELECT_RELATED_FIELDS', None)
        # Skip empty hints: select_related() with no arguments would make
        # Django follow every non-null foreign key instead of none.
        if select_fields:
            queryset = queryset.select_related(*select_fields)

        prefetch_fields = getattr(self, '_PREFETCH_RELATED_FIELDS', None)
        if prefetch_fields:
            queryset = queryset.prefetch_related(*prefetch_fields)

        annotations = getattr(self, '_ANNOTATIONS', None)
        if annotations:
            queryset = queryset.annotate(**annotations)

        return queryset
from django.db import models
from django.db.models import Count


# PreFetchMixin must be listed before models.Manager so that its
# get_queryset() comes first in the MRO.
class ListingManager(PreFetchMixin, models.Manager):
    """Manager for Listing that declares the hints PreFetchMixin reads."""

    _SELECT_RELATED_FIELDS = ('user',)
    _PREFETCH_RELATED_FIELDS = ('items',)
    # Exposed on each row as ``num_likes``.
    _ANNOTATIONS = {'num_likes': Count('like')}

And add a line to Listing:

class Listing(models.Model):
    ...
    # Use ListingManager as this model's `objects` manager.
    objects = ListingManager()
    ...

If you want to use a DRF serializer, you need to make minor changes:

from rest_framework import serializers
# assumes that ItemSerializer is defined
class ListingSerializer(serializers.ModelSerializer):
    """Serialize a Listing with its nested items and like count."""

    # Read-only nested list of the listing's items.
    items = ItemSerializer(many=True, read_only=True)
    # Read-only integer taken from the ``num_likes`` attribute of the instance.
    num_likes = serializers.IntegerField(read_only=True)

    class Meta:
        model = Listing
        fields = '__all__'

In your view:

def get_queryset(self):
    """Return the parent queryset with related objects loaded eagerly."""
    queryset = super().get_queryset()
    # Relations fetched through a join in the main query.
    queryset = queryset.select_related(relation1, relation2, ...)
    # Relations fetched through separate follow-up queries.
    return queryset.prefetch_related(relation3, relation4, ...)

That's it.

DRF under the hood does:

# Single related object: read straight off the parent instance by attribute name.
nested_instance_for_serialization = getattr(instance, fk_field_name)

# To-many relation: the related manager's .all() result is what gets serialized.
serialize_nested(instance.one_to_many_relation_field_name.all())

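Once a relation has been prefetched, instance.relation.all() is answered from the prefetch cache, which is filled once and then reused by DRF and by any other loop over the same relation.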

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM