module fd4_globaldef_mod module fd4_mpi_mod module timing_mod
Subroutines and functions:
private function fd4_part_1d_scan_han (nblocks, blkweights_ps, pstart, sumload, avgblk, step) result (pend) public function fd4_part_1d_scan (nblocks, blkweights_ps, pstart, sumload, guess) result (pend) public subroutine fd4_part_1d_binsrch (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, partition, maxload, reqsteps) public subroutine fd4_part_1d_parallel (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, mpi, partition, maxload, reqsteps, mpi_time, err) public subroutine fd4_part_1d_simple_h1 (nblocks, nprocs, blkweights_ps, partition, maxload) public subroutine fd4_part_1d_simple_h2 (nblocks, nprocs, blkweights_ps, partition, maxload)
Preprocessor options:
Author: Matthias Lieber
private function fd4_part_1d_scan_han (nblocks, blkweights_ps, pstart, sumload, avgblk, step) result (pend) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(in) :: pstart real (kind=r8k), intent(in) :: sumload integer (kind=i_k), intent(in) :: avgblk integer (kind=i_k), intent(inout) :: step integer (kind=i_k) :: pend end function fd4_part_1d_scan_han
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
pstart | start index for search |
sumload | upper bound for search in blkweights_ps values |
avgblk | average number of blocks per partition (nblocks/nprocs) |
step | start of search interval of length avgblk, use for subsequent calls |
Improved algorithm with log(nblocks/nprocs) complexity, based on ideas proposed by Han et al. 1992.
In the first call step should be avgblk.
public function fd4_part_1d_scan (nblocks, blkweights_ps, pstart, sumload, guess) result (pend) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(in) :: pstart real (kind=r8k), intent(in) :: sumload integer (kind=i_k), intent(inout) :: guess integer (kind=i_k) :: pend end function fd4_part_1d_scan
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
pstart | start index for search |
sumload | upper bound for search in blkweights_ps values |
guess | guess for result value for clever algorithm |
If compiled with CLEVER_SEARCH, use a more sophisticated algorithm which starts searching at index guess. Returns a new value for guess for subsequent searches.
Called by fd4_part_1d_binsrch, fd4_part_1d_parallel, fd4_part_1d_simple_h1, and fd4_part_1d_simple_h2.
public subroutine fd4_part_1d_binsrch (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, partition, maxload, reqsteps) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: maxweight real (kind=r8k), intent(in) :: eps integer (kind=i_k), intent(in) :: maxsteps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload integer (kind=i_k), intent(out) :: reqsteps end subroutine fd4_part_1d_binsrch
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
maxweight | max weight in weight array |
eps | max. difference of the resulting bottleneck value from the minimal bottleneck value |
maxsteps | max. number of binary search steps, may stop earlier depending on eps |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
reqsteps | required number of search steps to reach the accuracy specified by eps |
The algorithm is exact for eps=0.0 and large maxsteps.
Algorithm:
The algorithm corresponds mostly to the method EBS (exact bisection) from:
Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.
public subroutine fd4_part_1d_parallel (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, mpi, partition, maxload, reqsteps, mpi_time, err) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: maxweight real (kind=r8k), intent(in) :: eps integer (kind=i_k), intent(in) :: maxsteps type (fd4_mpi), intent(in) :: mpi integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload integer (kind=i_k), intent(out) :: reqsteps integer (kind=i8k), intent(out) :: mpi_time integer (kind=i_k), intent(out) :: err end subroutine fd4_part_1d_parallel
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
maxweight | max weight in weight array |
eps | max. difference of the resulting bottleneck value from the minimal bottleneck value |
maxsteps | max. number of binary search steps, may stop earlier depending on eps |
mpi | domain's MPI context |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
reqsteps | required number of search steps to reach the accuracy specified by eps |
mpi_time | time spent by this rank in MPI_Allreduce, in microseconds (us) |
err | error status: 0...ok |
The algorithm is exact for eps=0.0 and large maxsteps.
Algorithm:
The bisection algorithm is based on the method EBS (exact bisection) from:
Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.
public subroutine fd4_part_1d_simple_h1 (nblocks, nprocs, blkweights_ps, partition, maxload) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload end subroutine fd4_part_1d_simple_h1
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
Algorithm:
Miguet, S. and J.-M. Pierson: Heuristics for 1D rectilinear partitioning as a low cost and high quality answer to dynamic load balancing. In: High-Performance Computing and Networking, LNCS, vol. 1225, pages 550-564. Springer, 1997.
public subroutine fd4_part_1d_simple_h2 (nblocks, nprocs, blkweights_ps, partition, maxload) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload end subroutine fd4_part_1d_simple_h2
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
Algorithm:
Miguet, S. and J.-M. Pierson: Heuristics for 1D rectilinear partitioning as a low cost and high quality answer to dynamic load balancing. In: High-Performance Computing and Networking, LNCS, vol. 1225, pages 550-564. Springer, 1997.