module fd4_globaldef_mod module fd4_mpi_mod module timing_mod
Subroutines and functions:
private function fd4_part_1d_scan_han (nblocks, blkweights_ps, sumload, avgblk, step) result (res) private function fd4_part_1d_scan_han_reverse (nblocks, blkweights_ps, sumload, avgblk, step) result (res) public function fd4_part_1d_scan (nblocks, blkweights_ps, pstart, sumload, guess) result (res) public function fd4_part_1d_scan_bin (nblocks, blkweights_ps, pstart, pend, sumload) result (res) private function fd4_part_1d_scan_bin_reverse (nblocks, blkweights_ps, pstart, pend, sumload) result (res) public subroutine fd4_part_1d_binsrch (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, partition, maxload, reqsteps) public subroutine fd4_part_1d_parallel (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, mpi, partition, maxload, reqsteps, mpi_time, err) public subroutine fd4_part_1d_simple_h1 (nblocks, nprocs, blkweights_ps, partition, maxload) public subroutine fd4_part_1d_simple_h2 (nblocks, nprocs, blkweights_ps, partition, maxload) public subroutine fd4_part_1d_simple_h1_reverse (nblocks, nprocs, blkweights_ps, partition, maxload) public subroutine fd4_part_1d_simple_rb (nblocks, nprocs, blkweights_ps, partition, maxload) public subroutine fd4_part_1d_binsrch_ebs (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, cwsb, partition, maxload, reqsteps)
Preprocessor options:
Author: Matthias Lieber
private function fd4_part_1d_scan_han (nblocks, blkweights_ps, sumload, avgblk, step) result (res) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: sumload integer (kind=i_k), intent(in) :: avgblk integer (kind=i_k), intent(inout) :: step integer (kind=i_k) :: res end function fd4_part_1d_scan_han
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
sumload | upper bound for search in blkweights_ps values |
avgblk | average number of blocks per partition (nblocks/nprocs) |
step | start of search interval of length avgblk, use for subsequent calls |
Improved algorithm with O(log(nblocks/nprocs)) complexity, based on ideas proposed by Han et al. (1992).
In the first call step should be avgblk.
private function fd4_part_1d_scan_han_reverse (nblocks, blkweights_ps, sumload, avgblk, step) result (res) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: sumload integer (kind=i_k), intent(in) :: avgblk integer (kind=i_k), intent(inout) :: step integer (kind=i_k) :: res end function fd4_part_1d_scan_han_reverse
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
sumload | upper bound for search in blkweights_ps values |
avgblk | average number of blocks per partition (nblocks/nprocs) |
step | start of search interval of length avgblk, use for subsequent calls |
This is the right-to-left version of fd4_part_1d_scan_han as proposed by Pinar and Aykanat.
In the first call, step should be nblocks - avgblk.
public function fd4_part_1d_scan (nblocks, blkweights_ps, pstart, sumload, guess) result (res) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(in) :: pstart real (kind=r8k), intent(in) :: sumload integer (kind=i_k), intent(inout) :: guess integer (kind=i_k) :: res end function fd4_part_1d_scan
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
pstart | start index for search |
sumload | upper bound for search in blkweights_ps values |
guess | guess for result value for clever algorithm |
Uses a more sophisticated algorithm which starts searching at index guess. Typically faster than fd4_part_1d_scan_han. Returns a new value for guess for subsequent searches.
Called by fd4_part_1d_binsrch, fd4_part_1d_parallel, fd4_part_1d_simple_h1, and fd4_part_1d_simple_h2.
public function fd4_part_1d_scan_bin (nblocks, blkweights_ps, pstart, pend, sumload) result (res) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(in) :: pstart integer (kind=i_k), intent(in) :: pend real (kind=r8k), intent(in) :: sumload integer (kind=i_k) :: res end function fd4_part_1d_scan_bin
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
pstart | start index for search |
pend | end index for search |
sumload | upper bound for search in blkweights_ps values |
The search is performed between pstart and pend. Use pstart=0 and pend=nblocks for search over all weights.
Uses binary search in the blkweights_ps array.
private function fd4_part_1d_scan_bin_reverse (nblocks, blkweights_ps, pstart, pend, sumload) result (res) integer (kind=i_k), intent(in) :: nblocks real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(in) :: pstart integer (kind=i_k), intent(in) :: pend real (kind=r8k), intent(in) :: sumload integer (kind=i_k) :: res end function fd4_part_1d_scan_bin_reverse
Parameters:
nblocks | number of blocks |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
pstart | start index for search |
pend | end index for search |
sumload | upper bound for search in blkweights_ps values |
The search is performed between pstart and pend. Use pstart=0 and pend=nblocks for search over all weights.
Uses binary search in the blkweights_ps array.
public subroutine fd4_part_1d_binsrch (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, partition, maxload, reqsteps) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: maxweight real (kind=r8k), intent(in) :: eps integer (kind=i_k), intent(in) :: maxsteps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload integer (kind=i_k), intent(out) :: reqsteps end subroutine fd4_part_1d_binsrch
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
maxweight | max weight in weight array |
eps | max. difference of the resulting bottleneck value from the minimal (optimal) bottleneck value |
maxsteps | max. number of binary search steps, may stop earlier depending on eps |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
reqsteps | required number of search steps to reach the accuracy specified by eps |
The algorithm is exact for eps=0.0 and large maxsteps.
Algorithm:
The algorithm is similar to the method EBS (exact bisection) from:
Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.
public subroutine fd4_part_1d_parallel (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, mpi, partition, maxload, reqsteps, mpi_time, err) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: maxweight real (kind=r8k), intent(in) :: eps integer (kind=i_k), intent(in) :: maxsteps type (fd4_mpi), intent(in) :: mpi integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload integer (kind=i_k), intent(out) :: reqsteps integer (kind=i8k), intent(out) :: mpi_time integer (kind=i_k), intent(out) :: err end subroutine fd4_part_1d_parallel
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
maxweight | max weight in weight array |
eps | max. difference of the resulting bottleneck value from the minimal (optimal) bottleneck value |
maxsteps | max. number of binary search steps, may stop earlier depending on eps |
mpi | domain's MPI context |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
reqsteps | required number of search steps to reach the accuracy specified by eps |
mpi_time | time this rank spent in MPI_Allreduce, in microseconds (us) |
err | error status: 0...ok |
The algorithm is exact for eps=0.0 and large maxsteps.
Algorithm:
The bisection algorithm is based on the method EBS (exact bisection) from:
Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.
public subroutine fd4_part_1d_simple_h1 (nblocks, nprocs, blkweights_ps, partition, maxload) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload end subroutine fd4_part_1d_simple_h1
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
Algorithm:
Miguet, S. and J.-M. Pierson: Heuristics for 1D rectilinear partitioning as a low cost and high quality answer to dynamic load balancing. In: High-Performance Computing and Networking, LNCS, vol. 1225, pages 550-564. Springer, 1997.
public subroutine fd4_part_1d_simple_h2 (nblocks, nprocs, blkweights_ps, partition, maxload) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload end subroutine fd4_part_1d_simple_h2
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
Algorithm:
Miguet, S. and J.-M. Pierson: Heuristics for 1D rectilinear partitioning as a low cost and high quality answer to dynamic load balancing. In: High-Performance Computing and Networking, LNCS, vol. 1225, pages 550-564. Springer, 1997.
public subroutine fd4_part_1d_simple_h1_reverse (nblocks, nprocs, blkweights_ps, partition, maxload) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload end subroutine fd4_part_1d_simple_h1_reverse
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
Algorithm:
public subroutine fd4_part_1d_simple_rb (nblocks, nprocs, blkweights_ps, partition, maxload) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), optional, intent(out) :: maxload end subroutine fd4_part_1d_simple_rb
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
Algorithm:
public subroutine fd4_part_1d_binsrch_ebs (nblocks, nprocs, blkweights_ps, maxweight, eps, maxsteps, cwsb, partition, maxload, reqsteps) integer (kind=i_k), intent(in) :: nblocks integer (kind=i_k), intent(in) :: nprocs real (kind=r8k), intent(in), dimension (0:nblocks) :: blkweights_ps real (kind=r8k), intent(in) :: maxweight real (kind=r8k), intent(in) :: eps integer (kind=i_k), intent(in) :: maxsteps logical, intent(in) :: cwsb integer (kind=i_k), intent(out), dimension (0:nprocs) :: partition real (kind=r8k), intent(out) :: maxload integer (kind=i_k), intent(out) :: reqsteps end subroutine fd4_part_1d_binsrch_ebs
Parameters:
nblocks | number of blocks |
nprocs | number of processes |
blkweights_ps | prefix sum of ordered block weights, blkweights_ps(0) must be 0 |
maxweight | max weight in weight array |
eps | max. difference of the resulting bottleneck value from the minimal (optimal) bottleneck value |
maxsteps | max. number of binary search steps, may stop earlier depending on eps |
cwsb | construct final partition with separator bounds (T) or Han's method (F)? |
partition | output partition vector, contains start indices of partitions |
maxload | estimation of load of max loaded process, real value is a bit smaller |
reqsteps | required number of search steps to reach the accuracy specified by eps |
This is the method EBS (exact bisection) from:
Pinar, A. and C. Aykanat: Fast optimal load balancing algorithms for 1D partitioning. Journal of Parallel and Distributed Computing, 64(8):974-996, 2004.