From: W. Trevor King Date: Thu, 4 Nov 2010 12:34:34 +0000 (-0400) Subject: Add HDF5 post. X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=910421924ff2399966d59eb67b25bdfd6a9ece98;p=mw2txt.git Add HDF5 post. --- diff --git a/posts/HDF5.mdwn b/posts/HDF5.mdwn new file mode 100644 index 0000000..068394a --- /dev/null +++ b/posts/HDF5.mdwn @@ -0,0 +1,401 @@ +[[!meta title="HDF5 and h5py"]] + +[h5py][] is a [[Python interface]] to the [Hierarchical Data +Format][HDF] library, version 5. It provides a mature, stable, open +way to store data. The HDF5 [tutorial][] provides an excellent +introduction to the basic concepts of HDF5. + +Useful utilities included with the HDF5 library: + +* `h5dump` (command line HDF5 extraction) +* `h5stat` (command line HDF5 database statistics) + +I'll walk through the HDF5 tutorial with `h5py` to give you a feel for +how things work. It may help to keep in mind the following HDF5 to +filesystem concept map: + + + + + + +
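<table>
<tr><th>HDF5</th><th>filesystem</th></tr>
<tr><td>dataset</td><td>file</td></tr>
<tr><td>attribute</td><td>metadata/header</td></tr>
<tr><td>group</td><td>directory</td></tr>
</table>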
+ +[h5py]: http://code.google.com/p/h5py/ +[HDF]: http://www.hdfgroup.org/HDF5/ +[tutorial]: http://www.hdfgroup.org/HDF5/Tutor/ + + +Creating an HDF5 file +--------------------- + + >>> import h5py + >>> f = h5py.File('file.h5', 'w') + >>> f.close() + +Which creates + + $ h5dump file.h5 + HDF5 "file.h5" { + GROUP "/" { + } + } + +Creating a dataset +------------------ + + >>> import h5py + >>> import numpy + >>> f = h5py.File('dset.h5', 'w') + >>> f['dset'] = numpy.zeros((6,4), dtype=numpy.int32) + >>> f.close() + +Which creates + + $ h5dump dset.h5 + HDF5 "dset.h5" { + GROUP "/" { + DATASET "dset" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 6, 4 ) / ( 6, 4 ) } + DATA { + (0,0): 0, 0, 0, 0, + (1,0): 0, 0, 0, 0, + (2,0): 0, 0, 0, 0, + (3,0): 0, 0, 0, 0, + (4,0): 0, 0, 0, 0, + (5,0): 0, 0, 0, 0 + } + } + } + } + +Reading from and writing to a dataset +------------------------------------- + + >>> import h5py + >>> import numpy + >>> f = h5py.File('dset.h5', 'w') + >>> f['dset'] = numpy.arange(24, dtype=numpy.int32).reshape((4, 6)) + >>> dset = f['dset'] + >>> dset + <HDF5 dataset "dset": shape (4, 6), type "<i4"> + >>> dset.value + array([[ 0, 1, 2, 3, 4, 5], + [ 6, 7, 8, 9, 10, 11], + [12, 13, 14, 15, 16, 17], + [18, 19, 20, 21, 22, 23]]) + >>> f.close() + +Which creates + + $ h5dump dset.h5 + HDF5 "dset.h5" { + GROUP "/" { + DATASET "dset" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 4, 6 ) / ( 4, 6 ) } + DATA { + (0,0): 0, 1, 2, 3, 4, 5, + (1,0): 6, 7, 8, 9, 10, 11, + (2,0): 12, 13, 14, 15, 16, 17, + (3,0): 18, 19, 20, 21, 22, 23 + } + } + } + } + +Creating an attribute +--------------------- + +Using our file from the previous example: + + >>> import h5py + >>> import numpy + >>> f = h5py.File('dset.h5', 'a') + >>> dset = f['dset'] + >>> dset.attrs['Units'] = [100, 200] + >>> f.close() + +Which creates + + $ h5dump dset.h5 + HDF5 "dset.h5" { + GROUP "/" { + DATASET "dset" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 4, 6 ) / ( 4, 6 ) } + DATA { + (0,0): 0, 1, 2, 3, 4, 5, + (1,0): 6, 7, 8, 9, 10, 11, + (2,0): 
12, 13, 14, 15, 16, 17, + (3,0): 18, 19, 20, 21, 22, 23 + } + ATTRIBUTE "Units" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 2 ) / ( 2 ) } + DATA { + (0): 100, 200 + } + } + } + } + } + +Creating a group +---------------- + + >>> import h5py + >>> f = h5py.File('group.h5', 'w') + >>> g = f.create_group('/MyGroup') + >>> g + <HDF5 group "/MyGroup" (0 members)> + >>> f.close() + +Which creates + + $ h5dump group.h5 + HDF5 "group.h5" { + GROUP "/" { + GROUP "MyGroup" { + } + } + } + +Creating groups using absolute and relative names +------------------------------------------------- + + >>> import h5py + >>> f = h5py.File('groups.h5', 'w') + >>> g1 = f.create_group('/MyGroup') + >>> g2 = f.create_group('/MyGroup/Group_A') + >>> g3 = g1.create_group('Group_B') + >>> f.keys() + ['MyGroup'] + >>> f['MyGroup'].keys() + ['Group_A', 'Group_B'] + >>> f.close() + +Which creates + + $ h5dump groups.h5 + HDF5 "groups.h5" { + GROUP "/" { + GROUP "MyGroup" { + GROUP "Group_A" { + } + GROUP "Group_B" { + } + } + } + } + +Creating datasets in groups +--------------------------- + +Using our file from the previous example: + + >>> import h5py + >>> f = h5py.File('groups.h5', 'a') + >>> f['/MyGroup/dset1'] = [3, 3] + >>> g = f['/MyGroup/Group_A'] + >>> g['dset2'] = [2, 10] + >>> f.close() + +Which creates + + $ h5dump groups.h5 + HDF5 "groups.h5" { + GROUP "/" { + GROUP "MyGroup" { + GROUP "Group_A" { + DATASET "dset2" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 2 ) / ( 2 ) } + DATA { + (0): 2, 10 + } + } + } + GROUP "Group_B" { + } + DATASET "dset1" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 2 ) / ( 2 ) } + DATA { + (0): 3, 3 + } + } + } + } + } + +Reading from or writing to a subset of a dataset +------------------------------------------------ + +Just use the [Numpy slice indexing][slice] you're used to. 
+ + >>> import h5py + >>> import numpy + >>> f = h5py.File('hype.h5', 'w') + >>> f['IntArray'] = numpy.ones((8, 10)) + >>> dset = f['IntArray'] + >>> dset.value + array([[ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]) + >>> f['IntArray'][:,5:] = 2 + >>> dset.value + array([[ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.]]) + >>> dset[1:4,2:6] = 5 + >>> f['IntArray'].value + array([[ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 5., 5., 5., 5., 2., 2., 2., 2.], + [ 1., 1., 5., 5., 5., 5., 2., 2., 2., 2.], + [ 1., 1., 5., 5., 5., 5., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.], + [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.]]) + >>> f.close() + +[slice]: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html + +Datatypes +--------- + +Your array's `numpy.dtype` will be preserved. 
+ + >>> import h5py + >>> f = h5py.File('dtype.h5', 'w') + >>> f['complex'] = 2 + 3j + >>> f['complex'].dtype + dtype('complex128') + >>> type(f['complex'].value) + <type 'numpy.complex128'> + >>> f['complex array'] = [1 + 2j, 3 + 4j] + >>> f['complex array'].dtype + dtype('complex128') + >>> type(f['complex array'].value) + <type 'numpy.ndarray'> + >>> f.close() + +Which creates + + $ h5dump dtype.h5 + HDF5 "dtype.h5" { + GROUP "/" { + DATASET "complex" { + DATATYPE H5T_COMPOUND { + H5T_IEEE_F64LE "r"; + H5T_IEEE_F64LE "i"; + } + DATASPACE SCALAR + DATA { + (0): { + 2, + 3 + } + } + } + DATASET "complex array" { + DATATYPE H5T_COMPOUND { + H5T_IEEE_F64LE "r"; + H5T_IEEE_F64LE "i"; + } + DATASPACE SIMPLE { ( 2 ) / ( 2 ) } + DATA { + (0): { + 1, + 2 + }, + (1): { + 3, + 4 + } + } + } + } + } + +Properties +---------- + +No examples here... + +Chunking and extendible datasets +-------------------------------- + +Extendible datasets must be chunked. + + >>> import h5py + >>> import numpy + >>> f = h5py.File('ext.h5', 'w') + >>> f['simple'] = [1, 2, 3] # not chunked + >>> s = f['simple'] + >>> s.chunks == None + True + >>> s.resize((6,)) + Traceback (most recent call last): + ... + TypeError: Only chunked datasets can be resized + >>> c = f.create_dataset('chunked', (3,), numpy.int32, chunks=(2,)) + >>> c.chunks + (2,) + >>> c[:] = [1, 2, 3] + >>> c.resize((6,)) + >>> c.value + array([1, 2, 3, 0, 0, 0]) + >>> c.resize((6,2)) + Traceback (most recent call last): + ... + TypeError: New shape length (2) must match dataset rank (1) + >>> f.close() + +The "chunkiness" of data is not listed by `h5dump`, + + $ h5dump ext.h5 + HDF5 "ext.h5" { + GROUP "/" { + DATASET "chunked" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 6 ) / ( 6 ) } + DATA { + (0): 1, 2, 3, 0, 0, 0 + } + } + DATASET "simple" { + DATATYPE H5T_STD_I32LE + DATASPACE SIMPLE { ( 3 ) / ( 3 ) } + DATA { + (0): 1, 2, 3 + } + } + } + } + +but it is preserved. 
+ + >>> f = h5py.File('ext.h5', 'a') + >>> f['chunked'].chunks + (2,) + >>> f['simple'].chunks == None + True + +[[!tag tags/python]] +[[!tag tags/programming]] +[[!tag tags/tools]]