[Django]-How to store HDF5 (HDF Store) in a Django model field

1👍

You can create a custom Model field that saves your data to a file in storage and saves the relative file path to the database.

Here is how you could subclass models.CharField in your app’s fields.py:

import os

from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
from django.db import models
from django.utils.translation import gettext_lazy as _
from pandas import DataFrame, read_hdf

class DataFrameField(models.CharField):
    """
    Custom model field that saves a pandas DataFrame to the hdf5 file format,
    as advised in the official pandas documentation:
    http://pandas.pydata.org/pandas-docs/stable/io.html#io-perf

    The DataFrame itself is written to a file in storage; the underlying
    CharField column only holds the file path returned for that file.
    """

    # the type of object this field is meant to hold
    attr_class = DataFrame

    # message used for the "invalid" error code
    default_error_messages = {
        "invalid": _("Please provide a DataFrame object"),
    }

    def __init__(
        self,
        verbose_name=None,
        name=None,
        upload_to="data",
        storage=None,
        unique_fields=None,
        **kwargs
    ):
        """
        set up the field.

        upload_to: directory, relative to the storage root, that receives
            the hdf5 files.
        storage: storage backend to use; falls back to default_storage.
        unique_fields: names of model fields whose values are combined
            into the generated file name. Defaults to no fields.
        kwargs: passed on to CharField; max_length defaults to 100.
        """

        self.storage = storage or default_storage
        self.upload_to = upload_to

        # never share one list between field instances: a mutable default
        # argument would be the same object for every field, so always
        # store a private copy
        if unique_fields is None:
            unique_fields = []
        self.unique_fields = list(unique_fields)

        kwargs.setdefault("max_length", 100)
        super().__init__(verbose_name, name, **kwargs)

    def deconstruct(self):
        """
        return the (name, path, args, kwargs) description of this field,
        leaving out the keyword arguments that still have their defaults
        """
        field_name, import_path, positional, keywords = super().deconstruct()

        # max_length of 100 is the default applied in __init__
        if keywords.get("max_length") == 100:
            keywords.pop("max_length")

        has_custom_upload_to = self.upload_to != "data"
        has_custom_storage = self.storage is not default_storage

        if has_custom_upload_to:
            keywords["upload_to"] = self.upload_to
        if has_custom_storage:
            keywords["storage"] = self.storage

        # unique_fields is always part of the description
        keywords["unique_fields"] = self.unique_fields

        return field_name, import_path, positional, keywords

The __init__ and deconstruct methods are closely modeled on Django's own FileField. The additional unique_fields parameter lists the model fields whose values are used to build predictable, unique file names.

    def from_db_value(self, value, expression, connection):
        """
        turn the filepath stored in the DB into a DataFrame object;
        a NULL column value stays None
        """
        if value is not None:
            return self.retrieve_dataframe(value)

        return None

    def get_absolute_path(self, value):
        """
        resolve the path saved in the Database through the field's storage
        and return the result.
        """
        storage = self.storage
        return storage.path(value)

    def retrieve_dataframe(self, value):
        """
        read the hdf5 file referenced by value and return its DataFrame,
        tagged with the filepath it was loaded from
        """

        # load the file from its location in storage
        loaded = read_hdf(self.get_absolute_path(value))

        # remember the relative filepath on the object so that it can be
        # reused when the DataFrame is saved again
        loaded.filepath = value

        return loaded

The from_db_value method loads the DataFrame into memory from storage, using the file path saved in the database.

When retrieving the DataFrame, you also attach the file path to it as an instance attribute, so that you can reuse that value when saving the DataFrame back to the database.

    def pre_save(self, model_instance, add):
        """
        write the field's DataFrame to an hdf5 file before the model is saved.

        A value of None is passed through untouched; any value that is not
        a DataFrame raises a ValidationError with the "invalid" code.
        """
        value = super().pre_save(model_instance, add)

        if value is not None:
            if isinstance(value, DataFrame):
                self.save_dataframe_to_file(value, model_instance)
            else:
                raise ValidationError(
                    self.error_messages["invalid"], code="invalid",
                )

        return value

    def get_prep_value(self, value):
        """
        return the value to store in the database column: the filepath
        attached to the DataFrame (set in pre_save or when loading).

        None is passed through. A plain string is treated as an already
        prepared filepath (for example a value used in a query lookup) and
        returned unchanged. An object without a usable filepath yields None.
        """
        if value is None:
            return value

        # a string has no filepath attribute; it already is the path
        if isinstance(value, str):
            return value

        # save only the filepath to the database; a missing attribute is
        # handled like an empty one instead of raising AttributeError
        filepath = getattr(value, "filepath", None)
        if filepath:
            return filepath

        return None

    def save_dataframe_to_file(self, dataframe, model_instance):
        """
        write the Dataframe into an hdf5 file in storage at filepath.

        The filepath attached to the DataFrame is reused when present;
        otherwise a new one is generated from model_instance and attached
        to the DataFrame. Missing intermediate directories are created.

        Raises IOError if the target directory path exists but is not a
        directory.
        """
        # try to retrieve the filepath set when loading from the database.
        # It is stored as a plain attribute, so it must be read with
        # getattr: DataFrame.get() would look for a *column* called
        # "filepath" and never find the attribute.
        if not getattr(dataframe, "filepath", None):
            dataframe.filepath = self.generate_filepath(model_instance)

        full_filepath = self.storage.path(dataframe.filepath)

        # Create any intermediate directories that do not exist.
        # shamelessly copied from Django's original Storage class
        directory = os.path.dirname(full_filepath)
        if not os.path.exists(directory):
            try:
                if self.storage.directory_permissions_mode is not None:
                    # os.makedirs applies the global umask, so we reset it,
                    # for consistency with file_permissions_mode behavior.
                    old_umask = os.umask(0)
                    try:
                        os.makedirs(directory, self.storage.directory_permissions_mode)
                    finally:
                        os.umask(old_umask)
                else:
                    os.makedirs(directory)
            except FileExistsError:
                # There's a race between os.path.exists() and os.makedirs().
                # If os.makedirs() fails with FileExistsError, the directory
                # was created concurrently.
                pass
        if not os.path.isdir(directory):
            raise IOError("%s exists and is not a directory." % directory)

        # save to storage; key is passed by name because recent pandas
        # versions deprecate positional arguments after the path
        dataframe.to_hdf(full_filepath, key="df", mode="w", format="fixed")

    def generate_filepath(self, instance):
        """
        build a filepath from the lowercased model class name, the name of
        this field and the values of the configured unique fields
        """

        def as_identifier(field_value):
            # a related model instance contributes its id, any other value
            # contributes itself
            return str(getattr(field_value, "id", field_value))

        model_name = instance.__class__.__name__

        # one identifier per unique field, in declaration order
        id_parts = [
            as_identifier(getattr(instance, field_name))
            for field_name in self.unique_fields
        ]

        # filename, for example: route_data_<uuid>.h5
        filename = "{class_name}_{field_name}_{unique_id}.h5".format(
            class_name=model_name.lower(),
            field_name=self.name,
            unique_id="".join(id_parts),
        )

        # place the file in the upload directory and let the storage
        # produce the final name
        relative_path = os.path.join(self.upload_to, filename)
        return self.storage.generate_filename(relative_path)

The pre_save method writes the DataFrame to an hdf5 file, and get_prep_value then saves the file path to the database.

In my case it helped to use a UUID model field to create the unique file name: for new model instances the pk is not yet available in the pre_save method, but the uuid value is.

You can then use this field in your models.py:

from .fields import DataFrameField

# track data as a pandas DataFrame; the hdf5 file goes to the "data"
# upload directory and its name is built from the instance's uuid field
data = DataFrameField(null=True, upload_to="data", unique_fields=["uuid"])

Please note that you cannot use this field in the Django admin or in a Model form. That would require additional work on a custom form Widget to edit the DataFrame content in the front-end, probably as a table.

Also beware that for tests, I had to override the MEDIA_ROOT setting with a temporary directory using tempfile to prevent creating useless files in the actual media folder.

-1👍

It’s not HDF5, but check out picklefield:

from picklefield.fields import PickledObjectField

class Result(models.Model):
    """Result of a scenario, with its data stored as a pickled object."""

    # on_delete is a required argument since Django 2.0; CASCADE matches
    # the implicit default of earlier versions
    scenario = models.ForeignKey(Scenario, on_delete=models.CASCADE)

    # NOTE(review): the value is stored pickled, so only load rows whose
    # content you trust; unpickling untrusted data can execute code
    data = PickledObjectField(blank=True, null=True)

https://pypi.python.org/pypi/django-picklefield

👤Brian

Leave a comment