# Source code for toupy.io.h5chunk_shape_3D

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Standard library imports
from functools import reduce
import math
import operator

# Names exported by ``from <module> import *``: the four functions defined below.
__all__ = ["binlist", "numVals", "perturbShape", "chunk_shape_3D"]

# NOTE(review): the string literal below is empty and, because it follows the
# imports and ``__all__``, it is not the module docstring.
"""
"""


def binlist(n, width=0):
    """
    Return list of bits that represent a non-negative integer.

    The bits are ordered most significant first and are left-padded with
    zeros up to ``width`` entries. If ``n`` needs more than ``width`` bits,
    no bits are dropped and the returned list is longer than ``width``.

    Parameters
    ----------
    n : int
        non-negative integer
    width : int
        number of bits in returned zero-filled list (default 0)

    Returns
    -------
    list of int
        The binary digits of ``n`` (each 0 or 1), most significant first.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be a non-negative integer, got %r" % (n,))
    # bin() yields e.g. '0b101'; drop the '0b' prefix before zero-padding.
    digits = bin(n)[2:].zfill(width)
    return [int(digit) for digit in digits]


def numVals(shape):
    """
    Return number of values in chunk of specified shape, given by a list of dimension lengths.

    Parameters
    ----------
    shape : iterable of ints
        list of variable dimension sizes; may be empty

    Returns
    -------
    int
        Product of all dimension lengths, or 1 for an empty shape.
    """
    # Starting the fold at 1 covers the empty shape and also lets callers
    # pass iterables that do not support len().
    return reduce(operator.mul, shape, 1)


def perturbShape(shape, onbits):
    """
    Return shape perturbed by adding 1 to elements corresponding to 1 bits in onbits

    ``onbits`` is read as a ``len(shape)``-bit number whose most significant
    bit corresponds to ``shape[0]`` and whose least significant bit
    corresponds to ``shape[-1]``.

    Parameters
    ----------
    shape : sequence of ints
        list of variable dimension sizes
    onbits : int
        non-negative integer less than 2**len(shape)

    Returns
    -------
    list
        New list with the same length as ``shape``; ``shape`` itself is not
        modified.

    Raises
    ------
    ValueError
        If ``onbits`` is negative or has more bits than ``shape`` has
        dimensions.
    """
    rank = len(shape)
    onbits = operator.index(onbits)  # accept any integer-like value as a plain int
    if not 0 <= onbits < 2 ** rank:
        # Without this check the surplus bits would silently shift which
        # dimensions get incremented.
        raise ValueError(
            "onbits must satisfy 0 <= onbits < 2**len(shape) = %d, got %r"
            % (2 ** rank, onbits)
        )
    return [
        dim + ((onbits >> (rank - 1 - pos)) & 1) for pos, dim in enumerate(shape)
    ]


def chunk_shape_3D(varShape, valSize=4, chunkSize=4096):
    """
    Return a 'good shape' for a 3D variable, assuming balanced 1D/(n-1)D access [#site]_

    Parameters
    ----------
    varShape : sequence of ints
        length 3 list of variable dimension sizes
    valSize : int, optional
        size of each data value, in bytes (default 4)
    chunkSize : int, optional
        maximum chunksize desired, in bytes (default 4096)

    Returns
    -------
    tuple
        Returns integer chunk lengths of a chunk shape that provides
        balanced access of 1D subsets and 2D subsets of a netCDF or HDF5
        variable var with shape (T, X, Y), where the 1D subsets are of the
        form var[:,x,y] and the 2D slices are of the form var[t,:,:],
        typically 1D time series and 2D spatial slices.

    Raises
    ------
    ValueError
        If one of the three dimension sizes, ``valSize`` or ``chunkSize`` is
        not strictly positive.

    Notes
    -----
    'Good shape' for chunks means that the number of chunks accessed to read either
    kind of 1D or 2D subset is approximately equal, and the size of
    each chunk (uncompressed) is no more than chunkSize, which is
    often a disk block size. Code fetched from [#code1]_ and [#code2]_.

    If none of the candidate shapes fits within ``chunkSize``, the first
    (floor) estimate is returned unchanged even though it is larger than
    ``chunkSize``. The result is not clamped to ``varShape``: for a variable
    small enough to fit in a single chunk, the returned lengths can exceed the
    variable's own dimensions.

    References
    ----------
    .. [#site] https://www.unidata.ucar.edu/blogs/developer/en/entry/chunking_data_choosing_shapes
    .. [#code1] https://www.unidata.ucar.edu/blog_content/data/2013/chunk_shape_3D.py
    .. [#code2] https://github.com/HDFGroup/datacontainer/blob/master/lib/chunking.py
    """

    rank = 3

    # Fail early with a clear message: zero or negative inputs would otherwise
    # surface as divisions by zero or invalid roots further down.
    if any(varShape[ii] <= 0 for ii in range(rank)):
        raise ValueError(
            "varShape dimensions must be positive, got %r" % (varShape,)
        )
    if valSize <= 0 or chunkSize <= 0:
        raise ValueError(
            "valSize and chunkSize must be positive, got valSize=%r, chunkSize=%r"
            % (valSize, chunkSize)
        )

    chunkVals = chunkSize / float(valSize)  # ideal number of values in a chunk
    numChunks = (
        varShape[0] * varShape[1] * varShape[2] / chunkVals
    )  # ideal number of chunks
    axisChunks = numChunks ** 0.25  # ideal number of chunks along each 2D axis

    # First estimate of a good chunk shape. Conceptually it is
    #   [varShape[0] // axisChunks**2,
    #    varShape[1] // axisChunks,
    #    varShape[2] // axisChunks]
    # except that each chunk shape dimension must be at least 1.
    cFloor = []
    if varShape[0] / axisChunks ** 2 < 1.0:
        chunkDim = 1.0
        # The first axis cannot be split as finely as the ideal, so rescale
        # the number of chunks used along the two remaining axes.
        axisChunks = axisChunks / math.sqrt(varShape[0] / axisChunks ** 2)
    else:
        chunkDim = varShape[0] // axisChunks ** 2
    cFloor.append(chunkDim)

    prod = 1.0  # factor to increase other dims if some must be increased to 1.0
    for ii in range(1, rank):
        if varShape[ii] / axisChunks < 1.0:
            prod *= axisChunks / varShape[ii]
    for ii in range(1, rank):
        if varShape[ii] / axisChunks < 1.0:
            chunkDim = 1.0
        else:
            chunkDim = (prod * varShape[ii]) // axisChunks
        cFloor.append(chunkDim)

    # cFloor is typically too small, (numVals(cFloor) < chunkSize)
    # Adding 1 to each shape dim results in chunks that are too large,
    # (numVals(cCeil) > chunkSize).  Want to just add 1 to some of the
    # axes to get as close as possible to chunkSize without exceeding
    # it.  Here we use brute force, compute numVals(cCand) for all
    # 2**rank candidates and return the one closest to chunkSize
    # without exceeding it.
    bestChunkSize = 0
    cBest = cFloor  # fallback when no candidate fits within chunkSize
    for ii in range(2 ** rank):
        cCand = perturbShape(cFloor, ii)
        thisChunkSize = valSize * numVals(cCand)
        # Strict '<' keeps the earliest candidate when several tie.
        if bestChunkSize < thisChunkSize <= chunkSize:
            bestChunkSize = thisChunkSize
            cBest = list(cCand)  # make a copy of best candidate so far
    return tuple(map(int, cBest))