Module fd4_part_1d_mod


Uses:
    module fd4_globaldef_mod
    module fd4_mpi_mod
    module timing_mod
Subroutines and functions:
    private function fd4_part_1d_scan (nblocks, blkweights_ps, pstart, sumload, guess) result (pend)
    public subroutine fd4_part_1d_binsrch (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, partition, maxload, reqsteps)
    public subroutine fd4_part_1d_parallel (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, mpi, partition, maxload, reqsteps, mpi_time, err)
    public subroutine fd4_part_1d_simple_h1 (nblocks, nprocs, blkweights_ps, partition, maxload)
    public subroutine fd4_part_1d_simple_h2 (nblocks, nprocs, blkweights_ps, partition, maxload)

One-dimensional partitioning of prefix-summed weight arrays in partitions with (almost) equal load. Used for space-filling curve partitioning.

Author: Matthias Lieber


Description of Subroutines and Functions

fd4_part_1d_scan

private function fd4_part_1d_scan (nblocks, blkweights_ps, pstart, sumload, guess) result (pend)
    integer (kind=i_k), intent(in) :: nblocks
    real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps
    integer (kind=i_k), intent(in) :: pstart
    real (kind=r8k), intent(in) :: sumload
    integer (kind=i_k), intent(inout) :: guess
    integer (kind=i_k) :: pend
end function fd4_part_1d_scan
Parameters:
nblocks number of blocks
blkweights_ps prefix sum of ordered block weights, blkweights_ps(0) must be 0
pstart start index for search
sumload upper bound for search in blkweights_ps values
guess guess for result value for clever algorithm
Search for the largest value that is not larger than sumload in the array of prefix-summed block weights blkweights_ps, starting at pstart, and return its index in blkweights_ps. Returns a value >=pstart and <=nblocks.

If compiled with CLEVER_SEARCH, use a more sophisticated algorithm which starts searching at index guess. Returns a new value for guess for subsequent searches.

Internal function, called by fd4_part_1d_binsrch and fd4_part_1d_parallel.


fd4_part_1d_binsrch

public subroutine fd4_part_1d_binsrch (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, partition, maxload, reqsteps)
    integer (kind=i_k), intent(in) :: nblocks
    integer (kind=i_k), intent(in) :: nprocs
    real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps
    real (kind=r8k), intent(in) :: maxweight
    real (kind=r8k), intent(in) :: eps
    integer (kind=i_k), intent(in) :: maxsteps
    integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition
    real (kind=r8k), intent(out) :: maxload
    integer (kind=i_k), intent(out) :: reqsteps
end subroutine fd4_part_1d_binsrch
Parameters:
nblocks number of blocks
nprocs number of processes
blkweights_ps prefix sum of ordered block weights, blkweights_ps(0) must be 0
maxweight max weight in weight array
eps max. difference of the resulting bottleneck value from the minimal (optimal) bottleneck value
maxsteps max. number of binary search steps, may stop earlier depending on eps
partition output partition vector, contains start indices of partitions
maxload estimation of load of max loaded process, real value is a bit smaller
reqsteps required number of search steps to reach the accuracy specified by eps
Calculate a well-balanced partition for given block weights. Note: weights have to be given as prefix sums!

The algorithm is exact for eps=0.0 and large maxsteps.

Algorithm:

Caution: Be sure that maxweight is really the max difference of subsequent values in blkweights_ps and not the maximum weight in the block weights before prefix sum. Small rounding errors due to limited computer precision can lead to different values. With a too small maxweight the algorithm misses the exit condition and thus runs maxsteps steps. If the block weights do not utilize the full real(8) precision, this problem should not appear.

The algorithm corresponds mostly to the method EBS (exact bisection) from:

Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.


fd4_part_1d_parallel

public subroutine fd4_part_1d_parallel (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, mpi, partition, maxload, reqsteps, mpi_time, err)
    integer (kind=i_k), intent(in) :: nblocks
    integer (kind=i_k), intent(in) :: nprocs
    real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps
    real (kind=r8k), intent(in) :: maxweight
    real (kind=r8k), intent(in) :: eps
    integer (kind=i_k), intent(in) :: maxsteps
    type (fd4_mpi), intent(in) :: mpi
    integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition
    real (kind=r8k), intent(out) :: maxload
    integer (kind=i_k), intent(out) :: reqsteps
    integer (kind=i8k), intent(out) :: mpi_time
    integer (kind=i_k), intent(out) :: err
end subroutine fd4_part_1d_parallel
Parameters:
nblocks number of blocks
nprocs number of processes
blkweights_ps prefix sum of ordered block weights, blkweights_ps(0) must be 0
maxweight max weight in weight array
eps max. difference of the resulting bottleneck value from the minimal (optimal) bottleneck value
maxsteps max. number of binary search steps, may stop earlier depending on eps
mpi domain's MPI context
partition output partition vector, contains start indices of partitions
maxload estimation of load of max loaded process, real value is a bit smaller
reqsteps required number of search steps to reach the accuracy specified by eps
mpi_time the time this rank spent in MPI_Allreduce, in microseconds (us)
err error status: 0...ok
Calculate a well-balanced partition for given block weights. Note: weights have to be given as prefix sums!

The algorithm is exact for eps=0.0 and large maxsteps.

Algorithm:

Caution: Be sure that maxweight is really the max difference of subsequent values in blkweights_ps and not the maximum weight in the block weights before prefix sum. Small rounding errors due to limited computer precision can lead to different values. With a too small maxweight the algorithm misses the exit condition and thus runs maxsteps steps. If the block weights do not utilize the full real(8) precision, this problem should not appear.

The bisection algorithm is based on the method EBS (exact bisection) from:

Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.


fd4_part_1d_simple_h1

public subroutine fd4_part_1d_simple_h1 (nblocks, nprocs, blkweights_ps, partition, maxload)
    integer (kind=i_k), intent(in) :: nblocks
    integer (kind=i_k), intent(in) :: nprocs
    real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps
    integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition
    real (kind=r8k), intent(out) :: maxload
end subroutine fd4_part_1d_simple_h1
Parameters:
nblocks number of blocks
nprocs number of processes
blkweights_ps prefix sum of ordered block weights, blkweights_ps(0) must be 0
partition output partition vector, contains start indices of partitions
maxload estimation of load of max loaded process, real value is a bit smaller
Calculate a partition for given block weights. Note: weights have to be given as prefix sums!

Algorithm:

This is simply the method described as Heuristic 1 in:

Miguet, S. and J.-M. Pierson: Heuristics for 1D rectilinear partitioning as a low cost and high quality answer to dynamic load balancing. In: High-Performance Computing and Networking, LNCS, vol. 1225, pages 550-564. Springer, 1997.


fd4_part_1d_simple_h2

public subroutine fd4_part_1d_simple_h2 (nblocks, nprocs, blkweights_ps, partition, maxload)
    integer (kind=i_k), intent(in) :: nblocks
    integer (kind=i_k), intent(in) :: nprocs
    real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps
    integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition
    real (kind=r8k), intent(out) :: maxload
end subroutine fd4_part_1d_simple_h2
Parameters:
nblocks number of blocks
nprocs number of processes
blkweights_ps prefix sum of ordered block weights, blkweights_ps(0) must be 0
partition output partition vector, contains start indices of partitions
maxload estimation of load of max loaded process, real value is a bit smaller
Calculate a partition for given block weights. Note: weights have to be given as prefix sums!

Algorithm:

This is simply the method described as Heuristic 2 in:

Miguet, S. and J.-M. Pierson: Heuristics for 1D rectilinear partitioning as a low cost and high quality answer to dynamic load balancing. In: High-Performance Computing and Networking, LNCS, vol. 1225, pages 550-564. Springer, 1997.