posts/HDF5.mdwn

   1 [[!meta  title="HDF5 and h5py"]]
   2
   3 [h5py][] is a [[Python]] interface to the [Hierarchical Data
   4 Format][HDF] library, version 5.  It provides a mature, stable, open
   5 way to store data.  The HDF5 [tutorial][] provides an excellent
   6 introduction to the basic concepts of HDF5.
   7
   8 Useful utilities included with the HDF5 library:
   9
  10 * `h5dump` (command line HDF5 extraction)
  11 * `h5stat` (command line HDF5 database statistics)
  12
  13 There's also [[HDFView]], which provides a nice graphical interface.
  14
  15 I'll walk through the HDF5 tutorial with `h5py` to give you a feel for
  16 how things work.  It may help to keep in mind the following HDF5 to
  17 filesystem concept map:
  18
  19 <table>
  20   <tr><th>HDF5</th><th>filesystem</th></tr>
  21   <tr><td>dataset</td><td>file</td></tr>
  22   <tr><td>attribute</td><td>metadata/header</td></tr>
  23   <tr><td>group</td><td>directory</td></tr>
  24 </table>
  25
  26 [h5py]: http://code.google.com/p/h5py/
  27 [HDF]: http://www.hdfgroup.org/HDF5/
  28 [tutorial]: http://www.hdfgroup.org/HDF5/Tutor/
  29
  30
  31 Creating an HDF5 file
  32 ---------------------
  33
  34     >>> import h5py
  35     >>> f = h5py.File('file.h5', 'w')
  36     >>> f.close()
  37
  38 Which creates
  39
  40     $ h5dump file.h5
  41     HDF5 "file.h5" {
  42     GROUP "/" {
  43     }
  44     }
  45
  46 Creating a dataset
  47 ------------------
  48
  49     >>> import h5py
  50     >>> import numpy
  51     >>> f = h5py.File('dset.h5', 'w')
  52     >>> f['dset'] = numpy.zeros((6,4), dtype=numpy.int32)
  53     >>> f.close()
  54
  55 Which creates
  56
  57     $ h5dump dset.h5
  58     HDF5 "dset.h5" {
  59     GROUP "/" {
  60        DATASET "dset" {
  61           DATATYPE  H5T_STD_I32LE
  62           DATASPACE  SIMPLE { ( 6, 4 ) / ( 6, 4 ) }
  63           DATA {
  64           (0,0): 0, 0, 0, 0,
  65           (1,0): 0, 0, 0, 0,
  66           (2,0): 0, 0, 0, 0,
  67           (3,0): 0, 0, 0, 0,
  68           (4,0): 0, 0, 0, 0,
  69           (5,0): 0, 0, 0, 0
  70           }
  71        }
  72     }
  73     }
  74
  75 Reading from and writing to a dataset
  76 -------------------------------------
  77
  78     >>> import h5py
  79     >>> import numpy
  80     >>> f = h5py.File('dset.h5', 'w')
  81     >>> f['dset'] = numpy.arange(24, dtype=numpy.int32).reshape((4, 6))
  82     >>> dset = f['dset']
  83     >>> dset
  84     <HDF5 dataset "dset": shape (4, 6), type "<i4">
  85     >>> dset[...]
  86     array([[ 0,  1,  2,  3,  4,  5],
  87            [ 6,  7,  8,  9, 10, 11],
  88            [12, 13, 14, 15, 16, 17],
  89            [18, 19, 20, 21, 22, 23]])
  90     >>> f.close()
  91
  92 Which creates
  93
  94     $ h5dump dset.h5
  95     HDF5 "dset.h5" {
  96     GROUP "/" {
  97        DATASET "dset" {
  98           DATATYPE  H5T_STD_I32LE
  99           DATASPACE  SIMPLE { ( 4, 6 ) / ( 4, 6 ) }
 100           DATA {
 101           (0,0): 0, 1, 2, 3, 4, 5,
 102           (1,0): 6, 7, 8, 9, 10, 11,
 103           (2,0): 12, 13, 14, 15, 16, 17,
 104           (3,0): 18, 19, 20, 21, 22, 23
 105           }
 106        }
 107     }
 108     }
 109
 110 Creating an attribute
 111 ---------------------
 112
 113 Using our file from the previous example:
 114
 115     >>> import h5py
 116     >>> import numpy
 117     >>> f = h5py.File('dset.h5', 'a')
 118     >>> dset = f['dset']
 119     >>> dset.attrs['Units'] = [100, 200]
 120     >>> f.close()
 121
 122 Which creates
 123
 124     $ h5dump dset.h5
 125     HDF5 "dset.h5" {
 126     GROUP "/" {
 127        DATASET "dset" {
 128           DATATYPE  H5T_STD_I32LE
 129           DATASPACE  SIMPLE { ( 4, 6 ) / ( 4, 6 ) }
 130           DATA {
 131           (0,0): 0, 1, 2, 3, 4, 5,
 132           (1,0): 6, 7, 8, 9, 10, 11,
 133           (2,0): 12, 13, 14, 15, 16, 17,
 134           (3,0): 18, 19, 20, 21, 22, 23
 137           }
 138           ATTRIBUTE "Units" {
 139              DATATYPE  H5T_STD_I32LE
 140              DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
 141              DATA {
 142              (0): 100, 200
 143              }
 144           }
 145        }
 146     }
 147     }
 148
 149 Creating a group
 150 ----------------
 151
 152     >>> import h5py
 153     >>> f = h5py.File('group.h5', 'w')
 154     >>> g = f.create_group('/MyGroup')
 155     >>> g
 156     <HDF5 group "/MyGroup" (0 members)>
 157     >>> f.close()
 158
 159 Which creates
 160
 161     $ h5dump group.h5
 162     HDF5 "group.h5" {
 163     GROUP "/" {
 164        GROUP "MyGroup" {
 165        }
 166     }
 167     }
 168
 169 Creating groups using absolute and relative names
 170 -------------------------------------------------
 171
 172     >>> import h5py
 173     >>> f = h5py.File('groups.h5', 'w')
 174     >>> g1 = f.create_group('/MyGroup')
 175     >>> g2 = f.create_group('/MyGroup/Group_A')
 176     >>> g3 = g1.create_group('Group_B')
 177     >>> f.keys()
 178     ['MyGroup']
 179     >>> f['MyGroup'].keys()
 180     ['Group_A', 'Group_B']
 181     >>> f.close()
 182
 183 Which creates
 184
 185     $ h5dump groups.h5
 186     HDF5 "groups.h5" {
 187     GROUP "/" {
 188        GROUP "MyGroup" {
 189           GROUP "Group_A" {
 190           }
 191           GROUP "Group_B" {
 192           }
 193        }
 194     }
 195     }
 196
 197 Creating datasets in groups
 198 ---------------------------
 199
 200 Using our file from the previous example:
 201
 202     >>> import h5py
 203     >>> f = h5py.File('groups.h5', 'a')
 204     >>> f['/MyGroup/dset1'] = [3, 3]
 205     >>> g = f['/MyGroup/Group_A']
 206     >>> g['dset2'] = [2, 10]
 207     >>> f.close()
 208
 209 Which creates
 210
 211     $ h5dump groups.h5
 212     HDF5 "groups.h5" {
 213     GROUP "/" {
 214        GROUP "MyGroup" {
 215           GROUP "Group_A" {
 216              DATASET "dset2" {
 217                 DATATYPE  H5T_STD_I32LE
 218                 DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
 219                 DATA {
 220                 (0): 2, 10
 221                 }
 222              }
 223           }
 224           GROUP "Group_B" {
 225           }
 226           DATASET "dset1" {
 227              DATATYPE  H5T_STD_I32LE
 228              DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
 229              DATA {
 230              (0): 3, 3
 231              }
 232           }
 233        }
 234     }
 235     }
 236
 237 Reading from or writing to a subset of a dataset
 238 ------------------------------------------------
 239
 240 Just use the [Numpy slice indexing][slice] you're used to.
 241
 242     >>> import h5py
 243     >>> import numpy
 244     >>> f = h5py.File('slice.h5', 'w')
 245     >>> f['IntArray'] = numpy.ones((8, 10))
 246     >>> dset = f['IntArray']
 247     >>> dset[...]
 248     array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 249            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 250            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 251            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 252            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 253            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 254            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
 255            [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])
 256     >>> f['IntArray'][:,5:] = 2
 257     >>> dset[...]
 258     array([[ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 259            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 260            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 261            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 262            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 263            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 264            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 265            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.]])
 266     >>> dset[1:4,2:6] = 5
 267     >>> f['IntArray'][...]
 268     array([[ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 269            [ 1.,  1.,  5.,  5.,  5.,  5.,  2.,  2.,  2.,  2.],
 270            [ 1.,  1.,  5.,  5.,  5.,  5.,  2.,  2.,  2.,  2.],
 271            [ 1.,  1.,  5.,  5.,  5.,  5.,  2.,  2.,  2.,  2.],
 272            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 273            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 274            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.],
 275            [ 1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,  2.]])
 276     >>> f.close()
 277
 278 Here's an example of altering a scalar value:
 279
 280     >>> import h5py
 281     >>> import numpy
 282     >>> f = h5py.File('scalar.h5', 'w')
 283     >>> f['int'] = 1
 284     >>> dset = f['int']
 285     >>> f['int'][...]
 286     1
 287     >>> f['int'][...] = 2
 288     >>> f['int'][...]
 289     2
 290     >>> f.pop('int')
 291     >>> f.close()
 292
 293 I haven't been able to track down official documentation for the
 294 `dataset[...]` syntax, but it is mentioned in [the 1.3 release
 295 announcement][message] that Andrew sent to the `scipy-user` list.
 296
 297 [slice]: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
 298 [message]: http://mail.scipy.org/pipermail/scipy-user/2010-February/024364.html
 299
 300 Datatypes
 301 ---------
 302
 303 Your array's `numpy.dtype` will be preserved.
 304
 305     >>> import h5py
 306     >>> f = h5py.File('dtype.h5', 'w')
 307     >>> f['complex'] = 2 + 3j
 308     >>> f['complex'].dtype
 309     dtype('complex128')
 310     >>> type(f['complex'][...])
 311     <type 'complex'>
 312     >>> f['complex array'] = [1 + 2j, 3 + 4j]
 313     >>> f['complex array'].dtype
 314     dtype('complex128')
 315     >>> type(f['complex array'][...])
 316     <type 'numpy.ndarray'>
 317     >>> f.close()
 318
 319 Which creates
 320
 321     $ h5dump dtype.h5
 322     HDF5 "dtype.h5" {
 323     GROUP "/" {
 324        DATASET "complex" {
 325           DATATYPE  H5T_COMPOUND {
 326              H5T_IEEE_F64LE "r";
 327              H5T_IEEE_F64LE "i";
 328           }
 329           DATASPACE  SCALAR
 330           DATA {
 331           (0): {
 332                 2,
 333                 3
 334              }
 335           }
 336        }
 337        DATASET "complex array" {
 338           DATATYPE  H5T_COMPOUND {
 339              H5T_IEEE_F64LE "r";
 340              H5T_IEEE_F64LE "i";
 341           }
 342           DATASPACE  SIMPLE { ( 2 ) / ( 2 ) }
 343           DATA {
 344           (0): {
 345                 1,
 346                 2
 347              },
 348           (1): {
 349                 3,
 350                 4
 351              }
 352           }
 353        }
 354     }
 355     }
 356
 357 Properties
 358 ----------
 359
 360 No examples here...
 361
 362 Chunking and extendible datasets
 363 --------------------------------
 364
 365 Extendible datasets must be chunked.
 366
 367     >>> import h5py
 368     >>> import numpy
 369     >>> f = h5py.File('ext.h5', 'w')
 370     >>> f['simple'] = [1, 2, 3]  # not chunked
 371     >>> s = f['simple']
 372     >>> s.chunks == None
 373     True
 374     >>> s.resize((6,))
 375     Traceback (most recent call last):
 376       ...
 377     TypeError: Only chunked datasets can be resized
 378     >>> c = f.create_dataset('chunked', (3,), numpy.int32, chunks=(2,))
 379     >>> c.chunks
 380     (2,)
 381     >>> c[:] = [1, 2, 3]
 382     >>> c.resize((6,))
 383     >>> c[...]
 384     array([1, 2, 3, 0, 0, 0])
 385     >>> c.resize((6,2))
 386     Traceback (most recent call last):
 387       ...
 388     TypeError: New shape length (2) must match dataset rank (1)
 389     >>> f.close()
 390
 391 The "chunkiness" of data is not listed by `h5dump`,
 392
 393     $ h5dump ext.h5
 394     HDF5 "ext.h5" {
 395     GROUP "/" {
 396        DATASET "chunked" {
 397           DATATYPE  H5T_STD_I32LE
 398           DATASPACE  SIMPLE { ( 6 ) / ( 6 ) }
 399           DATA {
 400           (0): 1, 2, 3, 0, 0, 0
 401           }
 402        }
 403        DATASET "simple" {
 404           DATATYPE  H5T_STD_I32LE
 405           DATASPACE  SIMPLE { ( 3 ) / ( 3 ) }
 406           DATA {
 407           (0): 1, 2, 3
 408           }
 409        }
 410     }
 411     }
 412
 413 but it is preserved.
 414
 415     >>> f = h5py.File('ext.h5', 'a')
 416     >>> f['chunked'].chunks
 417     (2,)
 418     >>> f['simple'].chunks == None
 419     True
 420
 421 [[!tag tags/python]]
 422 [[!tag tags/programming]]
 423 [[!tag tags/tools]]