1👍
You can create a custom Model field that saves your data to a file in storage and saves the relative file path to the database.
Here is how you could subclass `models.CharField` in your app's `fields.py`:
import os

from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
from django.db import models
from django.utils.translation import gettext_lazy as _
from pandas import DataFrame, read_hdf
class DataFrameField(models.CharField):
    """
    Custom field to save a pandas DataFrame to the hdf5 file format,
    as advised in the official pandas documentation:
    http://pandas.pydata.org/pandas-docs/stable/io.html#io-perf

    The DataFrame itself is written to a file in storage; the CharField
    column only holds the path of that file.
    """

    attr_class = DataFrame
    default_error_messages = {
        "invalid": _("Please provide a DataFrame object"),
    }

    def __init__(
        self,
        verbose_name=None,
        name=None,
        upload_to="data",
        storage=None,
        unique_fields=None,
        **kwargs
    ):
        """
        Configure where and under which name the DataFrame files are stored.

        upload_to: directory (relative to the storage) the files are put in.
        storage: storage backend to use; falls back to default_storage.
        unique_fields: names of model attributes whose values are combined
            into the file name. None (the default) means no such fields.
        kwargs: forwarded to CharField; max_length defaults to 100.
        """
        self.storage = storage or default_storage
        self.upload_to = upload_to
        # Build a fresh list per field: a `[]` default in the signature would
        # be one single list object shared by every field created without
        # the argument.
        self.unique_fields = list(unique_fields) if unique_fields else []
        kwargs.setdefault("max_length", 100)
        super().__init__(verbose_name, name, **kwargs)

    def deconstruct(self):
        """
        Return the (name, path, args, kwargs) tuple describing this field.

        Arguments that still have their default value (max_length, upload_to,
        storage) are left out of kwargs; unique_fields is always included.
        """
        name, path, args, kwargs = super().deconstruct()
        if kwargs.get("max_length") == 100:
            del kwargs["max_length"]
        if self.upload_to != "data":
            kwargs["upload_to"] = self.upload_to
        if self.storage is not default_storage:
            kwargs["storage"] = self.storage
        kwargs["unique_fields"] = self.unique_fields
        return name, path, args, kwargs
The `__init__` and `deconstruct` methods are closely modeled on Django's own `FileField`. The additional `unique_fields` parameter is useful for creating predictable, unique file names.
def from_db_value(self, value, expression, connection):
    """
    Turn the file path stored in the database into a DataFrame object.

    A NULL column (None) is passed through unchanged; any other value is
    handed to retrieve_dataframe.
    """
    return None if value is None else self.retrieve_dataframe(value)
def get_absolute_path(self, value):
    """
    Resolve the path saved in the database through the field's storage
    and return the result of storage.path().
    """
    storage = self.storage
    return storage.path(value)
def retrieve_dataframe(self, value):
    """
    Read the DataFrame stored at the relative path `value` and return it.

    The relative path is attached to the returned frame as `filepath`
    for later use.
    """
    # read the frame from its absolute location in storage
    frame = read_hdf(self.get_absolute_path(value))
    # remember the relative path the frame was loaded from
    frame.filepath = value
    return frame
The `from_db_value` method loads the DataFrame into memory from storage, based on the file path saved in the database.
When retrieving the DataFrame, the file path is also attached to it as an instance attribute, so that the same path can be reused when the DataFrame is saved back to the database.
def pre_save(self, model_instance, add):
    """
    Write the field's DataFrame to its hdf5 file before the model is saved.

    Returns the value obtained from the parent pre_save. None is returned
    untouched and nothing is written.
    Raises ValidationError (code "invalid") when the value is not a DataFrame.
    """
    frame = super().pre_save(model_instance, add)
    if frame is not None:
        if not isinstance(frame, DataFrame):
            raise ValidationError(self.error_messages["invalid"], code="invalid")
        self.save_dataframe_to_file(frame, model_instance)
    return frame
def get_prep_value(self, value):
    """
    Return the value to store in the database column: the file path.

    value: None, a path string, or an object carrying a `filepath`
        attribute (the DataFrame after pre_save has written it to storage).
    Returns the path string, or None when there is no path to store.
    """
    if value is None:
        return None
    # A plain string is already a path (e.g. the right-hand side of a query
    # lookup); it has no `filepath` attribute to read.
    if isinstance(value, str):
        return value
    # save only the filepath to the database; an object that never received
    # a filepath yields None instead of raising AttributeError
    filepath = getattr(value, "filepath", None)
    if filepath:
        return filepath
    return None
def save_dataframe_to_file(self, dataframe, model_instance):
    """
    Write the DataFrame into an hdf5 file in storage.

    dataframe: the frame to write. Its `filepath` attribute is reused when
        present; otherwise a new path is generated from model_instance and
        stored on the frame as `filepath`.
    model_instance: the model instance the frame belongs to.
    Raises IOError when the target directory path exists but is not a
    directory.
    """
    # Reuse the filepath set when loading from the database. This has to be
    # an attribute lookup: DataFrame.get("filepath") would look for a
    # *column* named "filepath" and so never see the stored path.
    if not getattr(dataframe, "filepath", None):
        dataframe.filepath = self.generate_filepath(model_instance)
    full_filepath = self.storage.path(dataframe.filepath)

    # Create any intermediate directories that do not exist.
    directory = os.path.dirname(full_filepath)
    permissions_mode = getattr(self.storage, "directory_permissions_mode", None)
    try:
        if permissions_mode is not None:
            # os.makedirs applies the global umask, so we reset it,
            # for consistency with file_permissions_mode behavior.
            old_umask = os.umask(0)
            try:
                os.makedirs(directory, permissions_mode, exist_ok=True)
            finally:
                os.umask(old_umask)
        else:
            os.makedirs(directory, exist_ok=True)
    except FileExistsError:
        # exist_ok=True tolerates a directory created concurrently; this is
        # only reached when the path exists and is something else.
        raise IOError("%s exists and is not a directory." % directory)

    # save to storage; `key` is passed by keyword because pandas deprecated
    # passing it positionally
    dataframe.to_hdf(full_filepath, key="df", mode="w", format="fixed")
def generate_filepath(self, instance):
    """
    Build the file path for `instance` from the model's class name, the
    field name and the values of the configured unique fields.
    """
    # a related model instance contributes its id, any other value itself
    field_values = [getattr(instance, field) for field in self.unique_fields]
    id_parts = [str(getattr(item, "id", item)) for item in field_values]

    # filename, for example: route_data_<uuid>.h5
    filename = "{class_name}_{field_name}_{unique_id}.h5".format(
        class_name=instance.__class__.__name__.lower(),
        field_name=self.name,
        unique_id="".join(id_parts),
    )

    # place the file in the upload directory and let the storage build the name
    return self.storage.generate_filename(os.path.join(self.upload_to, filename))
The `pre_save` method saves the DataFrame to an hdf5 file, and `get_prep_value` saves the file path to the database.
In my case it helped to use a `uuid` model field to create the unique file name, because for new model instances the `pk` was not yet available in the `pre_save` method, but the `uuid` value was.
You can then use this field in your `models.py`:
from .fields import DataFrameField
# Track data as a pandas DataFrame. Files are written under the "data"
# upload directory, and the model's `uuid` attribute is used to build the
# file name.
data = DataFrameField(null=True, upload_to="data", unique_fields=["uuid"])
Please note that you cannot use this field in the Django admin or in a Model form. That would require additional work on a custom form Widget to edit the DataFrame content in the front-end, probably as a table.
Also beware that for tests, I had to override the `MEDIA_ROOT` setting with a temporary directory (created with `tempfile`) to avoid creating useless files in the actual media folder.
-1👍
It’s not HDF5, but check out picklefield:
from picklefield.fields import PickledObjectField
class Result(models.Model):
    """Result of a scenario, with its data kept in a pickled-object column."""

    # on_delete is a required argument since Django 2.0; CASCADE was the
    # implicit default before that.
    scenario = models.ForeignKey(Scenario, on_delete=models.CASCADE)
    # NOTE(review): presumably the stored value is unpickled when loaded --
    # only keep trusted data in this column.
    data = PickledObjectField(blank=True, null=True)