Hdf5 table

From oldwiki.scinet.utoronto.ca
Jump to navigation Jump to search

Storing tables in HDF5

The HDF5 Table interface condenses the steps needed to create tables in HDF5. The datatype of the dataset that gets created is of type H5T_COMPOUND. The members of the table can have different datatypes.


Writting a table using Python (PyTables)

PyTables is a package for managing hierarchical datasets and designed to efficiently and easily cope with extremely large amounts of data. PyTables is built on top of the HDF5 library, using the Python language and the NumPy package. The following example shows how to store a table of 10 records with 8 members :

name ADCcount grid_i grid_j pressure energy idnumber pressure2
16-character String Unsigned short integer 32-bit integer 32-bit integer float (single-precision) double (double-precision) Signed 64-bit integer 2-dim table of float (2*3)

The script has been run on gpc with the following modules :

module load gcc/4.8.1  intel/14.0.0  python/2.7.2  hdf5/1811-v18-serial-gcc

PyTable 3.0.0 has been compiled in my scratch directory.

from tables import *

class Particle(IsDescription):
    name      = StringCol(16)   # 16-character String                                                                                                         
    ADCcount  = UInt16Col()     # Unsigned short integer                                                                                                      
    grid_i    = Int32Col()      # 32-bit integer                                                                                                              
    grid_j    = Int32Col()      # 32-bit integer                                                                                                              
    pressure  = Float32Col()    # float  (single-precision)                                                                                                   
    energy    = Float64Col()    # double (double-precision)                                                                                                   
    idnumber  = Int64Col()      # Signed 64-bit integer                                                                                                       
    pressure2    = Float32Col(shape=(2,3)) # array of floats (single-precision)

h5file = open_file("tutorial1.h5", mode = "w", title = "Test file")
group = h5file.create_group("/", 'detector', 'Detector information')
table = h5file.create_table(group, 'readout', Particle, "Readout example")
particle = table.row
for i in xrange(10):
    particle['name']  = 'Particle: %6d' % (i)
    particle['ADCcount'] = (i * 256) % (1 << 16)
    particle['grid_i'] = i
    particle['grid_j'] = 10 - i
    particle['pressure'] = float(i*i)
    particle['energy'] = float(particle['pressure'] ** 4)
    particle['idnumber'] = i * (2 ** 34)
    particle['pressure2'] = [
        [0.5+float(i),1.5+float(i),2.5+float(i)],
        [-1.5+float(i),-2.5+float(i),-3.5+float(i)]]
    # Insert a new particle record                                                                                                                            
    particle.append()

h5file.close()

Reading the table with a C++ code with MPI for parallel programming

The following example shows how to read the table in a MPI process (each MPI process will read one individual record). The code has been compiled and tested on BlueGene with the following modules :

module load vacpp/12.1  xlf/14.1  mpich2/xl hdf5/189-v18-mpich2-xlc
mpixlcxx -I$SCINET_HDF5_INC -L$SCINET_ZLIB_LIB -L$SCINET_SZIP_LIB -L$SCINET_HDF5_LIB Test.cpp -o Test -lhdf5_hl -lhdf5 -lsz -lz

Test.cpp :

#include "hdf5.h"
#include "hdf5_hl.h"
#include <stdlib.h>
#include <iostream>
#include <stdint.h>
#include <mpi.h>
#define NFIELDS  (hsize_t)  7
#define H5FILE_NAME     "tutorial1.h5"

int main(int argc, char *argv[])
{
  // DEF OF SIZE OF VARIABLES TO READ                                                                                                                         
  typedef struct Particle
  {
    unsigned short int    ADCcount;
    double energy;
    int grid_i;
    int grid_j;
    long  idnumber;
    char   name[16];
    float pressure;
  } Particle;

  /* Calculate the size and the offsets of our struct members in memory */
  size_t dst_size =  sizeof( Particle );

  /* Calculate the size and the offsets of our struct members in memory */
  size_t dst_size =  sizeof( Particle );

  size_t dst_offset[NFIELDS] = {
    HOFFSET( Particle, ADCcount ),
    HOFFSET( Particle, energy ),
    HOFFSET( Particle, grid_i ),
    HOFFSET( Particle, grid_j ),
    HOFFSET( Particle, idnumber ),
    HOFFSET( Particle, name ),
    HOFFSET( Particle, pressure ),
  };

  //////////////////////////////////////////////////////////////////////////////////////////////////////////////                                              
  //MPI                                                                                                                                                       

  //HDF5 APIs definitions                                                                                                                                     
  hid_t       file_id;         /* file and dataset identifiers */
  hid_t plist_id;        /* property list identifier( access template) */
  herr_t status;

  // MPI variables                                                                                                                                            
  int mpi_size, mpi_rank;
  MPI_Comm comm  = MPI_COMM_WORLD;
  MPI_Info info  = MPI_INFO_NULL;

  //Initialize MPI                                                                                                                                            
  MPI_Init(&argc, &argv);
  MPI_Comm_size(comm, &mpi_size);
  MPI_Comm_rank(comm, &mpi_rank);

  // Set up file access property list with parallel I/O access                                                                                                
  plist_id = H5Pcreate(H5P_FILE_ACCESS);//creates a new property list as an instance of some property list class                                              
  H5Pset_fapl_mpio(plist_id, comm, info);

  // Read file collectively.                                                                                                                                  
  file_id = H5Fopen(H5FILE_NAME, H5F_ACC_RDONLY, plist_id);//H5F_ACC_RDONLY : read-only mode                                                                  

  Particle  dst_buf[1];
  size_t dst_sizes[NFIELDS] = {
    sizeof( dst_buf[0].ADCcount),
    sizeof( dst_buf[0].energy),
    sizeof( dst_buf[0].grid_i),
    sizeof( dst_buf[0].grid_j),
    sizeof( dst_buf[0].idnumber),
    sizeof( dst_buf[0].name),
    sizeof( dst_buf[0].pressure),
  };

  //READ FRACTION OF TABLE : example reading one record per MPI process                                                                                       
  hsize_t start=mpi_rank;//read Record number mpi_rank                                                                                                        
  hsize_t nrecords=1;//read 1 record                                                                                                                          
  status=H5TBread_records(file_id,"/detector/readout",start,nrecords,dst_size,dst_offset,dst_sizes,dst_buf);

  std::cout<<"Rank = "<<mpi_rank
           <<" ,ADCcount = "<<dst_buf[0].ADCcount
           <<" ,idnumber = "<<dst_buf[0].idnumber
           <<" ,grid_i = "<<dst_buf[0].grid_i
           <<" ,grid_j = "<<dst_buf[0].grid_j
           <<" ,pressure = "<<dst_buf[0].pressure
           <<" ,name = "<<dst_buf[0].name
           <<" ,energy = "<<dst_buf[0].energy
           <<std::endl;


  //Close property list.                                                                                                                                      
  H5Pclose(plist_id);

  // Close the file.                                                                                                                                          
  H5Fclose(file_id);

  MPI_Finalize();

  return 1;
}

The job has been launched on BlueGene :

runjob --np 10 --ranks-per-node=1 --envs OMP_NUM_THREADS=1 : /PATH_TO_THE_TEST_DIRECTORY/Test

The output of the job :

Rank = 0 ,ADCcount = 0 ,idnumber = 0 ,grid_i = 0 ,grid_j = 10 ,pressure = 0 ,name = Particle:      0 ,energy = 0
Rank = 1 ,ADCcount = 256 ,idnumber = 17179869184 ,grid_i = 1 ,grid_j = 9 ,pressure = 1 ,name = Particle:      1 ,energy = 1
Rank = 7 ,ADCcount = 1792 ,idnumber = 120259084288 ,grid_i = 7 ,grid_j = 3 ,pressure = 49 ,name = Particle:      7 ,energy = 5.7648e+06
Rank = 5 ,ADCcount = 1280 ,idnumber = 85899345920 ,grid_i = 5 ,grid_j = 5 ,pressure = 25 ,name = Particle:      5 ,energy = 390625
Rank = 6 ,ADCcount = 1536 ,idnumber = 103079215104 ,grid_i = 6 ,grid_j = 4 ,pressure = 36 ,name = Particle:      6 ,energy = 1.67962e+06
Rank = 2 ,ADCcount = 512 ,idnumber = 34359738368 ,grid_i = 2 ,grid_j = 8 ,pressure = 4 ,name = Particle:      2 ,energy = 256
Rank = 4 ,ADCcount = 1024 ,idnumber = 68719476736 ,grid_i = 4 ,grid_j = 6 ,pressure = 16 ,name = Particle:      4 ,energy = 65536
Rank = 9 ,ADCcount = 2304 ,idnumber = 154618822656 ,grid_i = 9 ,grid_j = 1 ,pressure = 81 ,name = Particle:      9 ,energy = 4.30467e+07
Rank = 3 ,ADCcount = 768 ,idnumber = 51539607552 ,grid_i = 3 ,grid_j = 7 ,pressure = 9 ,name = Particle:      3 ,energy = 6561
Rank = 8 ,ADCcount = 2048 ,idnumber = 137438953472 ,grid_i = 8 ,grid_j = 2 ,pressure = 64 ,name = Particle:      8 ,energy = 1.67772e+07