h5py性能测评

2019/05/04 技术

h5py性能测评

代码

import pickle
import sys
import time
import unittest

import h5py
import numpy as np
import os


class TestH5(unittest.TestCase):
    """Compare pickle and h5py when only a few rows of a big matrix are needed.

    The test writes the same random matrix to a pickle file and to an HDF5
    file, then repeatedly reads a fixed set of rows back from each file and
    prints the file sizes and the average / max / min load time.
    """

    # Shape of the random matrix written to both files.
    data_shape = (100000, 1024)
    # Rows read back on every timed load. Every index must be smaller than
    # data_shape[0]; the list is kept in increasing order because it is also
    # used to index the h5py dataset directly.
    target_index = [1, 5, 10, 50, 100, 500, 1000, 5000, 9000, 9001, 9003]
    # Number of timed loads per file format.
    load_repeat = 10

    def setUp(self):
        """Choose the paths of the two benchmark files (created later)."""
        self.pickle_file = "./data.pkl"
        self.h5_file = "./data.h5"

    def tearDown(self):
        """Delete the benchmark files, tolerating files that were never made."""
        for path in (self.pickle_file, self.h5_file):
            try:
                os.remove(path)
            except FileNotFoundError:
                # The test failed before this file was written; raising here
                # would only hide the original failure.
                pass

    @staticmethod
    def get_file_size(file_path):
        """Return the size of ``file_path`` as a string such as ``"1.5MB"``."""
        file_size = os.path.getsize(file_path) / float(1024 * 1024)
        return "{}MB".format(round(file_size, 2))

    @staticmethod
    def get_size(obj):
        """Return ``sys.getsizeof(obj)``, the size in bytes Python reports."""
        return sys.getsizeof(obj)

    def create_file(self):
        """Write the random matrix to both files and print the sizes.

        Returns:
            A ``(target_index, target_result)`` tuple: the row indices to
            read back and the rows of the in-memory matrix at those indices,
            used as the expected value by the load methods.
        """
        data = np.random.random(size=self.data_shape)
        print("size of data is {}".format(self.get_size(data)))
        target_index = list(self.target_index)
        target_result = data[target_index]
        print("size of target_result is {}".format(self.get_size(target_result)))

        # pickle
        with open(self.pickle_file, "wb") as fw:
            pickle.dump(data, fw)
        print("pickle file size is {}".format(self.get_file_size(self.pickle_file)))

        # h5py
        with h5py.File(self.h5_file, 'w') as hf:
            hf.create_dataset('data', data=data)
        print("h5 file size is {}".format(self.get_file_size(self.h5_file)))

        return target_index, target_result

    def pickle_load(self, target_index, target_result):
        """Unpickle the whole matrix, check the target rows, return the time.

        The returned duration (seconds) covers opening the file, loading the
        full object and the equality check.
        """
        # perf_counter is a monotonic, high-resolution clock, which suits
        # interval timing better than the wall clock.
        time_start = time.perf_counter()
        with open(self.pickle_file, "rb") as fr:
            all_data = pickle.load(fr)
            self.assertTrue((target_result == all_data[target_index]).all())
        return time.perf_counter() - time_start

    def h5py_load(self, target_index, target_result):
        """Read the target rows from the HDF5 dataset, return the time.

        The returned duration (seconds) covers opening the file, indexing the
        dataset with ``target_index`` and the equality check.
        """
        time_start = time.perf_counter()
        with h5py.File(self.h5_file, 'r') as hf:
            all_data = hf["data"]
            self.assertTrue((target_result == all_data[target_index]).all())
        return time.perf_counter() - time_start

    def _time_loads(self, label, load, target_index, target_result):
        """Call ``load`` ``load_repeat`` times and print the timing summary."""
        time_list = [
            load(target_index=target_index, target_result=target_result)
            for _ in range(self.load_repeat)
        ]
        print("{} load {} times: {}s per step, max time is {}s, min time is {}s!".format(
            label, len(time_list),
            sum(time_list) / len(time_list), max(time_list), min(time_list)))

    def testFileLoad(self):
        """Benchmark loading the target rows from the pickle and HDF5 files."""
        target_index, target_result = self.create_file()

        self._time_loads("pickle", self.pickle_load, target_index, target_result)
        self._time_loads("h5", self.h5py_load, target_index, target_result)

测试结果

文件加载测试结果如下:

Launching unittests with arguments python -m unittest hdf5_benchmark.TestH5 in /mnt/e/frkhit/wsl/tmp/pycharm_benchmark
size of data is 819200112
size of target_result is 90224
pickle file size is 781.25MB
h5 file size is 781.25MB
pickle load 10 times: 2.1771466970443725s per step, max time is 2.5986461639404297s, min time is 2.0592007637023926s!
h5 load 10 times: 0.002041530609130859s per step, max time is 0.004301786422729492s, min time is 0.0013699531555175781s!

结论:

  • h5py不一定能节省空间: 本测试写入h5文件时未启用压缩(create_dataset 未设置 compression), 因此h5文件大小与pickle文件一样(均为781.25MB)
  • h5py在只读取部分数据时更省时间: 它只从硬盘中加载需要的行(平均约0.002s), 而pickle每次都必须先加载整个文件(平均约2.18s)

Search

    Table of Contents