--- /dev/null
+[[!meta title="HDF5 and h5py"]]
+
+[h5py][] is a [[Python interface]] to the [Hierarchical Data
+Format][HDF] library, version 5. It provides a mature, stable, open
+way to store data. The HDF5 [tutorial][] provides an excellent
+introduction to the basic concepts of HDF5.
+
+Useful utilities included with the HDF5 library:
+
+* `h5dump` (command line HDF5 extraction)
+* `h5stat` (command line HDF5 database statistics)
+
+I'll walk through the HDF5 tutorial with `h5py` to give you a feel for
+how things work. It may help to keep in mind the following HDF5 to
+filesystem concept map:
+
+<table>
+ <tr><th>HDF5</th><th>filesystem</th></tr>
+ <tr><td>dataset</td><td>file</td></tr>
+ <tr><td>attribute</td><td>metadata/header</td></tr>
+ <tr><td>group</td><td>directory</td></tr>
+</table>
+
+[h5py]: http://code.google.com/p/h5py/
+[HDF]: http://www.hdfgroup.org/HDF5/
+[tutorial]: http://www.hdfgroup.org/HDF5/Tutor/
+
+
+Creating an HDF5 file
+---------------------
+
+ >>> import h5py
+ >>> f = h5py.File('file.h5', 'w')
+ >>> f.close()
+
+Which creates
+
+ $ h5dump file.h5
+ HDF5 "file.h5" {
+ GROUP "/" {
+ }
+ }
+
+Creating a dataset
+------------------
+
+ >>> import h5py
+ >>> import numpy
+ >>> f = h5py.File('dset.h5', 'w')
+ >>> f['dset'] = numpy.zeros((6,4), dtype=numpy.int32)
+ >>> f.close()
+
+Which creates
+
+ $ h5dump dset.h5
+ HDF5 "dset.h5" {
+ GROUP "/" {
+ DATASET "dset" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 6, 4 ) / ( 6, 4 ) }
+ DATA {
+ (0,0): 0, 0, 0, 0,
+ (1,0): 0, 0, 0, 0,
+ (2,0): 0, 0, 0, 0,
+ (3,0): 0, 0, 0, 0,
+ (4,0): 0, 0, 0, 0,
+ (5,0): 0, 0, 0, 0
+ }
+ }
+ }
+ }
+
+Reading from and writing to a dataset
+-------------------------------------
+
+ >>> import h5py
+ >>> import numpy
+ >>> f = h5py.File('dset.h5', 'w')
+ >>> f['dset'] = numpy.arange(24, dtype=numpy.int32).reshape((4, 6))
+ >>> dset = f['dset']
+ >>> dset
+ <HDF5 dataset "dset": shape (4, 6), type "<i4">
+ >>> dset.value
+ array([[ 0, 1, 2, 3, 4, 5],
+ [ 6, 7, 8, 9, 10, 11],
+ [12, 13, 14, 15, 16, 17],
+ [18, 19, 20, 21, 22, 23]])
+ >>> f.close()
+
+Which creates
+
+ $ h5dump dset.h5
+ HDF5 "dset.h5" {
+ GROUP "/" {
+ DATASET "dset" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 4, 6 ) / ( 4, 6 ) }
+ DATA {
+ (0,0): 0, 1, 2, 3, 4, 5,
+ (1,0): 6, 7, 8, 9, 10, 11,
+ (2,0): 12, 13, 14, 15, 16, 17,
+ (3,0): 18, 19, 20, 21, 22, 23
+ }
+ }
+ }
+ }
+
+Creating an attribute
+---------------------
+
+Using our file from the previous example:
+
+ >>> import h5py
+ >>> import numpy
+ >>> f = h5py.File('dset.h5', 'a')
+ >>> dset = f['dset']
+ >>> dset.attrs['Units'] = [100, 200]
+ >>> f.close()
+
+Which creates
+
+ $ h5dump dset.h5
+ HDF5 "dset.h5" {
+ GROUP "/" {
+ DATASET "dset" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 4, 6 ) / ( 4, 6 ) }
+ DATA {
+ (0,0): 0, 1, 2, 3, 4, 5,
+ (1,0): 6, 7, 8, 9, 10, 11,
+ (2,0): 12, 13, 14, 15, 16, 17,
+ (3,0): 18, 19, 20, 21, 22, 23
+ }
+ ATTRIBUTE "Units" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
+ DATA {
+ (0): 100, 200
+ }
+ }
+ }
+ }
+ }
+
+Creating a group
+----------------
+
+ >>> import h5py
+ >>> f = h5py.File('group.h5', 'w')
+ >>> g = f.create_group('/MyGroup')
+ >>> g
+ <HDF5 group "/MyGroup" (0 members)>
+ >>> f.close()
+
+Which creates
+
+ $ h5dump group.h5
+ HDF5 "group.h5" {
+ GROUP "/" {
+ GROUP "MyGroup" {
+ }
+ }
+ }
+
+Creating groups using absolute and relative names
+-------------------------------------------------
+
+ >>> import h5py
+ >>> f = h5py.File('groups.h5', 'w')
+ >>> g1 = f.create_group('/MyGroup')
+ >>> g2 = f.create_group('/MyGroup/Group_A')
+ >>> g3 = g1.create_group('Group_B')
+ >>> f.keys()
+ ['MyGroup']
+ >>> f['MyGroup'].keys()
+ ['Group_A', 'Group_B']
+ >>> f.close()
+
+Which creates
+
+ $ h5dump groups.h5
+ HDF5 "groups.h5" {
+ GROUP "/" {
+ GROUP "MyGroup" {
+ GROUP "Group_A" {
+ }
+ GROUP "Group_B" {
+ }
+ }
+ }
+ }
+
+Creating datasets in groups
+---------------------------
+
+Using our file from the previous example:
+
+ >>> import h5py
+ >>> f = h5py.File('groups.h5', 'a')
+ >>> f['/MyGroup/dset1'] = [3, 3]
+ >>> g = f['/MyGroup/Group_A']
+ >>> g['dset2'] = [2, 10]
+ >>> f.close()
+
+Which creates
+
+ $ h5dump groups.h5
+ HDF5 "groups.h5" {
+ GROUP "/" {
+ GROUP "MyGroup" {
+ GROUP "Group_A" {
+ DATASET "dset2" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
+ DATA {
+ (0): 2, 10
+ }
+ }
+ }
+ GROUP "Group_B" {
+ }
+ DATASET "dset1" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
+ DATA {
+ (0): 3, 3
+ }
+ }
+ }
+ }
+ }
+
+Reading from or writing to a subset of a dataset
+------------------------------------------------
+
+Just use the [Numpy slice indexing][slice] you're used to.
+
+ >>> import h5py
+ >>> import numpy
+ >>> f = h5py.File('hype.h5', 'w')
+ >>> f['IntArray'] = numpy.ones((8, 10))
+ >>> dset = f['IntArray']
+ >>> dset.value
+ array([[ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
+ [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])
+ >>> f['IntArray'][:,5:] = 2
+ >>> dset.value
+ array([[ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.]])
+ >>> dset[1:4,2:6] = 5
+ >>> f['IntArray'].value
+ array([[ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 5., 5., 5., 5., 2., 2., 2., 2.],
+ [ 1., 1., 5., 5., 5., 5., 2., 2., 2., 2.],
+ [ 1., 1., 5., 5., 5., 5., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.],
+ [ 1., 1., 1., 1., 1., 2., 2., 2., 2., 2.]])
+ >>> f.close()
+
+[slice]: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+
+Datatypes
+---------
+
+Your array's `numpy.dtype` will be preserved.
+
+ >>> import h5py
+ >>> f = h5py.File('dtype.h5', 'w')
+ >>> f['complex'] = 2 + 3j
+ >>> f['complex'].dtype
+ dtype('complex128')
+ >>> type(f['complex'].value)
+ <type 'complex'>
+ >>> f['complex array'] = [1 + 2j, 3 + 4j]
+ >>> f['complex array'].dtype
+ dtype('complex128')
+ >>> type(f['complex array'].value)
+ <type 'numpy.ndarray'>
+ >>> f.close()
+
+Which creates
+
+ $ h5dump dtype.h5
+ HDF5 "dtype.h5" {
+ GROUP "/" {
+ DATASET "complex" {
+ DATATYPE H5T_COMPOUND {
+ H5T_IEEE_F64LE "r";
+ H5T_IEEE_F64LE "i";
+ }
+ DATASPACE SCALAR
+ DATA {
+ (0): {
+ 2,
+ 3
+ }
+ }
+ }
+ DATASET "complex array" {
+ DATATYPE H5T_COMPOUND {
+ H5T_IEEE_F64LE "r";
+ H5T_IEEE_F64LE "i";
+ }
+ DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
+ DATA {
+ (0): {
+ 1,
+ 2
+ },
+ (1): {
+ 3,
+ 4
+ }
+ }
+ }
+ }
+ }
+
+Properties
+----------
+
+No examples here...
+
+Chunking and extendible datasets
+--------------------------------
+
+Extendible datasets must be chunked.
+
+ >>> import h5py
+ >>> import numpy
+ >>> f = h5py.File('ext.h5', 'w')
+ >>> f['simple'] = [1, 2, 3] # not chunked
+ >>> s = f['simple']
+ >>> s.chunks == None
+ True
+ >>> s.resize((6,))
+ Traceback (most recent call last):
+ ...
+ TypeError: Only chunked datasets can be resized
+ >>> c = f.create_dataset('chunked', (3,), numpy.int32, chunks=(2,))
+ >>> c.chunks
+ (2,)
+ >>> c[:] = [1, 2, 3]
+ >>> c.resize((6,))
+ >>> c.value
+ array([1, 2, 3, 0, 0, 0])
+ >>> c.resize((6,2))
+ Traceback (most recent call last):
+ ...
+ TypeError: New shape length (2) must match dataset rank (1)
+ >>> f.close()
+
+The "chunkiness" of data is not listed by `h5dump`,
+
+ $ h5dump ext.h5
+ HDF5 "ext.h5" {
+ GROUP "/" {
+ DATASET "chunked" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 6 ) / ( 6 ) }
+ DATA {
+ (0): 1, 2, 3, 0, 0, 0
+ }
+ }
+ DATASET "simple" {
+ DATATYPE H5T_STD_I32LE
+ DATASPACE SIMPLE { ( 3 ) / ( 3 ) }
+ DATA {
+ (0): 1, 2, 3
+ }
+ }
+ }
+ }
+
+but it is preserved.
+
+ >>> f = h5py.File('ext.h5', 'a')
+ >>> f['chunked'].chunks
+ (2,)
+ >>> f['simple'].chunks == None
+ True
+
+[[!tag tags/python]]
+[[!tag tags/programming]]
+[[!tag tags/tools]]