I've done some reading about Django's bulk_create() and I'm currently trying to implement it but it's not functioning as I would expect. Originally my code didn't use bulk_create and it took about 33 seconds or so to import 6074 rows of data. Slow but it worked.
Models:
class Building(models.Model):
community = models.ForeignKey('Community', related_name='Building Community Name')
physical_location = models.CharField(max_length=80, null=True, blank=True)
data_source = models.CharField(max_length=50, null=True, blank=True)
facility_name = models.CharField(max_length=120, null=True, blank=True)
facility_type = models.CharField(max_length=80, null=True, blank=True)
size = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
audited = models.NullBooleanField(blank=True)
audit_notes = models.TextField(blank=True)
class RetrofitData(models.Model):
building_id = models.ForeignKey('Building')
retrofits_done = models.NullBooleanField(blank=True)
retrofit_notes = models.TextField(blank=True)
fuel_oil_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
district_heating_oil_usage_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
electricity_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
natural_gas_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
propane_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
biomass_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
fuel_oil_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
district_heating_oil_usage_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
electricity_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
natural_gas_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
propane_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
biomass_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
retrofit_cost = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
biomass_heat = models.NullBooleanField(blank=True)
heat_recovery = models.NullBooleanField(blank=True)
Original code:
class BuildingInventoryImporter(dataimport.DataFileImporter):
def toTrueFalse(self, val):
if val == "Yes":
return True
elif val == "No":
return False
else:
return None
def decCleaner(self, val):
if val == '':
return None
else:
return val2dec(val)
models = [Building, RetrofitData]
@transaction.commit_manually
@rollback_on_exception
def do_import(self):
book = xlrd.open_workbook(self.data_file.file.path,
encoding_override='cp1252')
sheet = book.sheet_by_index(2)
for row_index in range(1,sheet.nrows):
row = sheet.row_values(row_index)
temp_id= row_index
community_name = row[0]
gnis = row[1]
physical_location = row[2]
data_source = row[3]
facility_type = row[5]
if row[4] == '':
if facility_type =="Other" or facility_type == "Office" or facility_type == "Public Assembly" or facility_type == "Public Safety":
facility_name = "Unavailable"
elif facility_type =="Health Care - Hospitals":
facility_name = community_name + " Clinic"
elif facility_type == "Education - K - 12":
facility_name = community_name + " School(s)"
else:
facility_name = row[4]
size = self.decCleaner(row[6])
audited = self.toTrueFalse(row[7])
audit_notes = row[8]
building, created = self.get_or_new(Building, id=temp_id)
try:
community = Community.objects.get(gnis_feature_id=gnis)
except Community.DoesNotExist:
self.warning("The value entered for the Community GNIS: {0} does not exist.".format(row[1]))
try:
community = Community.objects.get(name=community_name)
self.warning("The Community name: {0} is in the db but does not match its associated Community GNIS").format(row[0])
except Community.DoesNotExist:
self.warning("Neither the Community name: {0} nor the Community GNIS: {1} exist.".format(row[0], row[1]))
continue
building.community = community
building.physical_location = physical_location
building.data_source = data_source
building.facility_name = facility_name
building.facility_type = facility_type
building.size = size
building.audited = audited
building.audit_notes = audit_notes
building.save()
retrofit_data, created = self.get_or_new(RetrofitData, building_id=building)
retrofit_data.retrofits_done = self.toTrueFalse(row[9])
retrofit_data.retrofit_notes = row[10]
retrofit_data.fuel_oil_preretrofit = self.decCleaner(row[11])
if row[12] == 999999999: #They decided that a unknown value would be represented as 999999999 in the dataset.
retrofit_data.district_heating_oil_usage_preretrofit = None
else:
retrofit_data.district_heating_oil_usage_preretrofit = self.decCleaner(row[12])
retrofit_data.electricity_preretrofit = self.decCleaner(row[13])
retrofit_data.natural_gas_preretrofit = self.decCleaner(row[14])
retrofit_data.propane_preretrofit = self.decCleaner(row[15])
retrofit_data.biomass_preretrofit = self.decCleaner(row[16])
retrofit_data.fuel_oil_postretrofit = self.decCleaner(row[17])
retrofit_data.district_heating_oil_usage_postretrofit = self.decCleaner(row[18])
retrofit_data.electricity_postretrofit = self.decCleaner(row[19])
retrofit_data.natural_gas_postretrofit = self.decCleaner(row[20])
retrofit_data.propane_postretrofit = self.decCleaner(row[21])
retrofit_data.biomass_postretrofit = self.decCleaner(row[22])
retrofit_data.retrofit_cost = self.decCleaner(row[23])
retrofit_data.biomass_heat = self.toTrueFalse(row[24])
retrofit_data.heat_recovery = self.toTrueFalse(row[25])
retrofit_data.save()
if self.dry_run:
transaction.rollback()
else:
transaction.commit()
dataimport.register(BuildingInventoryImporter)
It would have to hit the database like ~1200 times throughout the course of the data import which was causing the slow import. So to remedy this I looked into using the bulk_create()
Modified Code:
class BuildingInventoryImporterV2(dataimport.DataFileImporter):
models = [Building, RetrofitData]
def do_import(self, dry_run=True):
book = xlrd.open_workbook(self.data_file.file.path,
encoding_override='cp1252')
sheet = book.sheet_by_index(2)
building_bulk_list = []
retrofit_bulk_list = []
for row_index in range(1,sheet.nrows):
row = sheet.row_values(row_index)
temp_id= row_index
community_name = row[0]
gnis = row[1]
facility_type = row[5]
try:
community = Community.objects.get(gnis_feature_id=gnis)
except Community.DoesNotExist:
self.warning("The value entered for the Community GNIS: {0} does not exist.".format(row[1]))
try:
community = Community.objects.get(name=community_name)
self.warning("The Community name: {0} is in the db but does not match its associated Community GNIS").format(row[0])
except Community.DoesNotExist:
self.warning("Neither the Community name: {0} nor the Community GNIS: {1} exist.".format(row[0], row[1]))
continue
if row[4] == '':
if facility_type =="Other" or facility_type == "Office" or facility_type == "Public Assembly" or facility_type == "Public Safety":
facility_name = "Unavailable"
elif facility_type =="Health Care - Hospitals":
facility_name = community_name + " Clinic"
elif facility_type == "Education - K - 12":
facility_name = community_name + " School(s)"
else:
facility_name = row[4]
building_to_add = Building(
community=community,
physical_location=row[2],
data_source=row[3],
facility_name=facility_name,
facility_type=facility_type,
size=self.decCleaner(row[6]),
audited=self.toTrueFalse(row[7]),
audit_notes=row[8]
)
building_bulk_list.append(building_to_add)
if self.dry_run is False:
Building.objects.bulk_create(building_bulk_list)
for row_index in range(1,sheet.nrows):
row = sheet.row_values(row_index)
#They decided that a unknown value would be represented as 999999999 in the dataset.
if row[12] == 999999999:
district_heating_oil_usage_preretrofit = None
else:
district_heating_oil_usage_preretrofit = self.decCleaner(row[12])
retrofit_data_to_add = RetrofitData(
building_id=Building.objects.get(id=temp_id),
retrofits_done=self.toTrueFalse(row[9]),
retrofit_notes=row[10],
fuel_oil_preretrofit=self.decCleaner(row[11]),
district_heating_oil_usage_preretrofit=district_heating_oil_usage_preretrofit,
electricity_preretrofit=self.decCleaner(row[13]),
natural_gas_preretrofit=self.decCleaner(row[14]),
propane_preretrofit=self.decCleaner(row[15]),
biomass_preretrofit=self.decCleaner(row[16]),
fuel_oil_postretrofit=self.decCleaner(row[17]),
district_heating_oil_usage_postretrofit=self.decCleaner(row[18]),
electricity_postretrofit=self.decCleaner(row[19]),
natural_gas_postretrofit=self.decCleaner(row[20]),
propane_postretrofit=self.decCleaner(row[21]),
biomass_postretrofit=self.decCleaner(row[22]),
retrofit_cost=self.decCleaner(row[23]),
biomass_heat=self.toTrueFalse(row[24]),
heat_recovery=self.toTrueFalse(row[25])
)
retrofit_bulk_list.append(retrofit_data_to_add)
if self.dry_run is False:
Building.objects.bulk_create(retrofit_bulk_list)
dataimport.register(BuildingInventoryImporterV2)
Problem arises when I get to the second block of code to bulk import RetroFitData. To my understanding bulk_create() will not assign an AutoField pk when it is called thus you need to place the bulk_create() data in the database before it is assigned AutoField pk. But it seems that that is not accurate as well. After running the import I get the following error:
Traceback:
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/core/handlers/base.py" in get_response
111. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/utils/decorators.py" in _wrapped_view
105. response = view_func(request, *args, **kwargs)
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/views/decorators/cache.py" in _wrapped_view_func
52. response = view_func(request, *args, **kwargs)
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/contrib/admin/sites.py" in inner
206. return view(request, *args, **kwargs)
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/contrib/auth/decorators.py" in _wrapped_view
21. return view_func(request, *args, **kwargs)
File "/home/bhernandez/ISER/aedg/core/adminviews.py" in data_import
465. results = importer.run()
File "/home/bhernandez/ISER/aedg/core/dataimport/__init__.py" in run
114. self.do_import()
File "/home/bhernandez/ISER/aedg/akw/dataimport/etc.py" in do_import
656. building_id=Building.objects.get(id=temp_id),
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/db/models/manager.py" in manager_method
92. return getattr(self.get_queryset(), name)(*args, **kwargs)
File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/db/models/query.py" in get
357. self.model._meta.object_name)
Exception Type: DoesNotExist at /admin/core/datafile/174/import/
Exception Value: Building matching query does not exist.
But when I check my Buildings table it's been populated... Any help or advice is much appreciated.
So when you come to create RetrofitData
you need to know the ID of the Building
objects you've just created.
The chances are you're using a database that has the ID field set to autoincrement, and therefore the objects you create using bulk_create
won't get a PK assigned.
I suppose using building_bulk_list
you could use it's length to get the last set of Building
objects from the database, but then why not create the building_bulk_list
with a more conventional approach for creating objects, calling save()
, allowing you to create a list of IDs?
Then using that list of IDs you can run bulk_create
for RetrofitData
, iterating over that list of IDs to setup the relationship to Building
?
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.