[英]How to store HDF5 (HDF Store) in a Django model field
我目前正在做一个项目,在该项目中我生成 Pandas DataFrames 作为分析结果。 我正在 Django 中进行开发,并希望在“结果”模型中使用“数据”字段来存储 Pandas DataFrame。
看来 HDF5(HDF Store) 是存储我的 Pandas DataFrames 的最有效方式。 但是,我不知道如何在我的模型中创建自定义字段来保存它。 我将在下面展示简化的 views.py 和 models.py 来说明。
models.py
class Result(models.Model):
    """Analysis result that belongs to one ``Scenario``."""

    # on_delete is mandatory from Django 2.0 on; CASCADE was the implicit
    # default in earlier versions, so behaviour is unchanged there.
    scenario = models.ForeignKey(Scenario, on_delete=models.CASCADE)
    # HOW DO I Store HDFStore
    # NOTE: ``HDF5Field`` does not exist in Django; it stands for the custom
    # field this question is asking for.
    data = models.HDF5Field()
views.py
class AnalysisAPI(View):
    """Class-based view that reads and creates analysis results of a scenario."""

    model = Result

    def get(self, request):
        """
        Return the stored analysis results of one scenario as JSON records.

        Expects a ``scenario_id`` query parameter; a missing parameter raises
        ``KeyError``.
        """
        import json

        request_dict = request.GET.dict()
        scenario_id = request_dict['scenario_id']
        scenario = Scenario.objects.get(pk=scenario_id)
        result = self.model.objects.get(scenario=scenario)
        analysis_results_df = result.data['analysis_results_df']
        # DataFrame.to_json() already returns a JSON *string*. Handing that
        # string to JsonResponse raises TypeError (non-dict without
        # safe=False) and would encode it a second time, so decode it first
        # and let JsonResponse serialise the resulting list of records.
        records = json.loads(analysis_results_df.to_json(orient="records"))
        return JsonResponse(records, safe=False)

    def post(self, request):
        """
        Run the analysis for the posted records and store it as a new result.

        Expects ``scenario_id`` and ``record_list`` in the POST data.
        """
        request_dict = request.POST.dict()
        scenario_id = request_dict['scenario_id']
        scenario = Scenario.objects.get(pk=scenario_id)
        record_list = request_dict['record_list']
        analysis_results_df = run_analysis(record_list)
        # NOTE(review): the store handle is never closed here, and every
        # request writes to the same 'store.h5' file; what happens to the
        # handle depends on the (still hypothetical) model field.
        data = HDFStore('store.h5')
        data['analysis_results_df'] = analysis_results_df
        new_result = self.model(scenario=scenario, data=data)
        new_result.save()
        return JsonResponse(
            dict(status="OK", message="Analysis results saved.")
        )
我感谢任何帮助,并且我也对另一种存储方法持开放态度,例如 Pickle,只要我可以将它与 Django 一起使用,它具有类似的性能。
您可以创建一个自定义模型字段,将您的数据保存到存储中的文件并将相对文件路径保存到数据库。
下面演示如何在应用的 fields.py 中继承 models.CharField:
import os

from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
from django.db import models
from django.utils.translation import gettext_lazy as _
from pandas import DataFrame, read_hdf
class DataFrameField(models.CharField):
    """
    custom field to save Pandas DataFrame to the hdf5 file format
    as advised in the official pandas documentation:
    http://pandas.pydata.org/pandas-docs/stable/io.html#io-perf

    The database column is a CharField that holds only the file path; the
    frame itself is written to a file in ``storage``.
    """

    attr_class = DataFrame

    default_error_messages = {
        "invalid": _("Please provide a DataFrame object"),
    }

    def __init__(
        self,
        verbose_name=None,
        name=None,
        upload_to="data",
        storage=None,
        unique_fields=None,
        **kwargs
    ):
        """
        Args:
            verbose_name: forwarded to ``CharField``.
            name: forwarded to ``CharField``.
            upload_to: directory, relative to the storage, for the files.
            storage: storage backend; falls back to ``default_storage``.
            unique_fields: names of model attributes whose values are used
                to build the file name. Defaults to no fields.
            **kwargs: remaining ``CharField`` options; ``max_length``
                defaults to 100.
        """
        self.storage = storage or default_storage
        self.upload_to = upload_to
        # A fresh list per field: a ``[]`` default in the signature would be
        # one shared object mutated across every field instance.
        self.unique_fields = list(unique_fields) if unique_fields is not None else []
        kwargs.setdefault("max_length", 100)
        super().__init__(verbose_name, name, **kwargs)

    def deconstruct(self):
        """
        Return the ``(name, path, args, kwargs)`` tuple used by migrations.

        Options that still have their default value are left out, except
        ``unique_fields`` which is always emitted.
        """
        name, path, args, kwargs = super().deconstruct()
        if kwargs.get("max_length") == 100:
            del kwargs["max_length"]
        if self.upload_to != "data":
            kwargs["upload_to"] = self.upload_to
        if self.storage is not default_storage:
            kwargs["storage"] = self.storage
        kwargs["unique_fields"] = self.unique_fields
        return name, path, args, kwargs
__init__ 和 deconstruct 方法在很大程度上借鉴了 Django 原生的 FileField。此外还有一个额外的 unique_fields 参数,可用于生成可预测的唯一文件名。
def from_db_value(self, value, expression, connection):
    """
    Turn the file path stored in the database into a DataFrame.

    A NULL column stays ``None``; any other value is handed to
    ``retrieve_dataframe``. ``expression`` and ``connection`` are unused.
    """
    return None if value is None else self.retrieve_dataframe(value)
def get_absolute_path(self, value):
    """
    Resolve the path saved in the database through the field's storage.

    Returns whatever ``self.storage.path(value)`` returns.
    """
    backend = self.storage
    return backend.path(value)
def retrieve_dataframe(self, value):
    """
    Load the DataFrame stored at ``value`` and remember where it came from.

    ``value`` is the storage-relative path saved in the database. It is
    attached to the returned frame as ``filepath`` so that the same path can
    be read back when the frame is saved again.
    """
    frame = read_hdf(self.get_absolute_path(value))
    # keep the relative (not the absolute) path on the frame for later use
    frame.filepath = value
    return frame
from_db_value 方法根据保存在数据库中的文件路径,将 DataFrame 从存储加载到内存中。
检索 DataFrame 时,您还将文件路径作为实例属性添加到其中,以便在将 DataFrame 保存回数据库时可以使用该值。
def pre_save(self, model_instance, add):
    """
    Write the field's DataFrame to its hdf5 file before the model is saved.

    Returns the value obtained from the parent ``pre_save``. ``None`` is
    passed through untouched.

    Raises:
        ValidationError: if the value is neither ``None`` nor a DataFrame.
    """
    value = super().pre_save(model_instance, add)
    if value is not None:
        if not isinstance(value, DataFrame):
            raise ValidationError(
                self.error_messages["invalid"], code="invalid",
            )
        self.save_dataframe_to_file(value, model_instance)
    return value
def get_prep_value(self, value):
    """
    Return the value to store in the database: the file path only.

    Args:
        value: ``None``, a path string, or an object (normally the
            DataFrame) carrying the path in its ``filepath`` attribute.

    Returns:
        The path string, or ``None`` when there is no usable path.
    """
    if value is None:
        return value
    # A plain string is already a path (for example when the field is used
    # in a query lookup); it has no ``filepath`` attribute to read.
    if isinstance(value, str):
        return value
    # save only the filepath to the database
    filepath = getattr(value, "filepath", None)
    # Guard the type as well: on a DataFrame with a column called "filepath"
    # the attribute lookup yields that column rather than a path.
    if isinstance(filepath, str) and filepath:
        return filepath
    return None
def save_dataframe_to_file(self, dataframe, model_instance):
    """
    write the Dataframe into an hdf5 file in storage at filepath

    The path remembered on ``dataframe.filepath`` is reused when present;
    otherwise a new one is generated from ``model_instance`` and stored on
    the frame.

    Raises:
        IOError: if the target directory path exists but is not a directory.
    """
    # Read the *attribute* set when the frame was loaded from the database.
    # ``dataframe.get("filepath")`` would look up a column of that name
    # instead, so the remembered path was never found.
    filepath = getattr(dataframe, "filepath", None)
    if not (isinstance(filepath, str) and filepath):
        filepath = self.generate_filepath(model_instance)
        # NOTE(review): on a frame that has a column named "filepath" this
        # assignment targets the column; such frames are not supported.
        dataframe.filepath = filepath
    full_filepath = self.storage.path(filepath)

    # Create any intermediate directories that do not exist.
    directory = os.path.dirname(full_filepath)
    permissions_mode = self.storage.directory_permissions_mode
    try:
        if permissions_mode is not None:
            # os.makedirs applies the global umask, so we reset it,
            # for consistency with file_permissions_mode behavior.
            old_umask = os.umask(0)
            try:
                os.makedirs(directory, permissions_mode, exist_ok=True)
            finally:
                os.umask(old_umask)
        else:
            # exist_ok covers the directory being created concurrently
            os.makedirs(directory, exist_ok=True)
    except FileExistsError:
        # with exist_ok=True this only happens for an existing non-directory
        raise IOError("%s exists and is not a directory." % directory)

    # save to storage
    dataframe.to_hdf(full_filepath, key="df", mode="w", format="fixed")
def generate_filepath(self, instance):
    """
    Build the storage path for the hdf5 file of ``instance``.

    The file name combines the lower-cased model class name, this field's
    name and the concatenated values of ``unique_fields``; a value that has
    an ``id`` attribute (a related model instance) contributes that id.
    The name is placed under ``upload_to`` and passed through the storage's
    ``generate_filename``.
    """
    raw_values = (getattr(instance, attr) for attr in self.unique_fields)
    unique_id = "".join(str(getattr(raw, "id", raw)) for raw in raw_values)

    # filename, for example: route_data_<uuid>.h5
    filename = "{class_name}_{field_name}_{unique_id}.h5".format(
        class_name=type(instance).__name__.lower(),
        field_name=self.name,
        unique_id=unique_id,
    )

    return self.storage.generate_filename(os.path.join(self.upload_to, filename))
DataFrame 在 pre_save 方法中被保存为 HDF5 文件,而文件路径则通过 get_prep_value 保存到数据库中。
就我而言,使用 uuid 模型字段来创建唯一的文件名很有帮助,因为对于新的模型实例,pk 在 pre_save 方法中尚不可用,而 uuid 值已经可用。
然后你可以在你的models.py
使用这个字段:
from .fields import DataFrameField
# track data as a pandas DataFrame
data = DataFrameField(null=True, upload_to="data", unique_fields=["uuid"])
请注意,您不能在 Django admin(管理后台)或模型表单中使用此字段。 这需要额外编写自定义表单 Widget,以便在前端编辑 DataFrame 的内容(例如以表格形式)。
还要注意,对于测试,我必须使用tempfile用临时目录覆盖MEDIA_ROOT
设置,以防止在实际媒体文件夹中创建无用的文件。
它不是 HDF5,但请查看 picklefield:
from picklefield.fields import PickledObjectField
class Result(models.Model):
    """Analysis result that belongs to one ``Scenario``."""

    # on_delete is mandatory from Django 2.0 on; CASCADE was the implicit
    # default in earlier versions, so behaviour is unchanged there.
    scenario = models.ForeignKey(Scenario, on_delete=models.CASCADE)
    # NOTE(review): presumably the value is pickled into the column and
    # unpickled on load; only safe while the database content is trusted.
    data = PickledObjectField(blank=True, null=True)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.