Hi there,
in the course of modernizing some of my code I have moved to putting many "call by refernce" routines which were previously "stand-alone" behind types. In addition arrays are now passed using type bound pointers, usually with the "contiguous" attribute. This has the big advantage of avoiding to pass array boundaries explicitly (because many of my arrays start at index value zero). However, I have noticed a speed difference to an extent that the type bound routines need up to twice as much time for processing large arrays than the direct routines.
The below program mimics the above structure. It implements the frist part of an implicit multiplication of a 4.5 Mio x 100 matrix with an 4.8 Mio x 4.8 Mio structured sparse co-variance matrix (the latter matrix can be stored in form of four vectors and is held by the type). This routine needs about 4.8 seconds when called directly, and about 6.6 seconds when called through the type. This is n ot a big difference in absolute numbers, but sums up when performed several thousand times. Given that the type transports the array into the routine via a pointer with attribute "contiguous" this difference in speed should not appear. however, may be I haven't understood the standard incorrectly. The speed was measured on a Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz with 56 processors. The compiler was ifort 17.4. The data set can be supplied on request.
Any Ideas?
Thanks a lot
Module Mod_Direct
contains
Subroutine SubDrop(ISUBound1,ISUBound2,RMInOut,ivi,ivs&&,ivd,rvp,ISNThreads)
!$ use omp_lib
Implicit None
Integer*8, Intent(In) :: ISUbound1, ISUBound2
Real*8, Intent(InOut) :: RMInOut(0:ISUbound1,1:ISUBound2)
Integer*8, Intent(In) :: ivs(:), ivd(:), ivi(:)
Real*8, Intent(In) :: RVp(:)
Integer*4, intent(in), optional :: ISNThreads
Integer*8 :: c1, c2, ss, dd, ii
outer:block
RMInOut(0,:)=0.0D00
!$ if(present(ISNThreads)) Then
!$ if(ISNThreads>size(RMInOUt,2)) Then
!$ call omp_set_num_threads(size(RMInOut,2))
!$ else
!$ call omp_set_num_threads(ISNThreads)
!$ End if
!$ else
!$ c1=omp_get_max_threads()
!$ if(c1>size(RMInout,2)) Then
!$ call omp_set_num_threads(size(RMInout,2))
!$ else
!$ call omp_set_num_threads(int(c1,4))
!$ End if
!$ end if
!$OMP PARALLEL DO PRIVATE(ss,dd,c2,c1)
Do c1=1,size(RMInOut,2)
Do c2=1,size(IVI,1)
ss=ivs(c2)
dd=ivd(c2)
ii=ivi(c2)
RMInOut(ii,c1)=RMInOut(ii,c1)+rvp(c2)*(RMInOut(ss,c1)&&+RMInOut(dd,c1))
End Do
End Do
!$OMP END PARALLEL DO
End block outer
End Subroutine SubDrop
end Module Mod_Direct
Module Mod_Type
Type :: Testtype
Integer*8, Allocatable :: ivi(:), ivs(:), ivd(:)
Integer*8 :: isn
Integer*4 :: ISSubStat
Real*8, Allocatable :: rvp(:)
Real*8, Pointer, contiguous :: RMInout(:,:)
Character(:), allocatable :: csmsg
contains
procedure, pass, public :: drop=>subdrop
End type Testtype
Interface
Module Subroutine SubDrop(this,ISNThreads)
Class(TestType) :: this
Integer*4, optional :: ISNThreads
end Subroutine
End Interface
Private :: SubDrop
end Module Mod_Type
SubModule(Mod_Type) Drop
contains
Module Procedure SubDrop
!$ use omp_lib
Implicit None
Integer*8 :: c1, c2, ss, dd, ii
outer:block
if(.not.associated(this%RMInOut)) Then
this%CSMSG="ERROR"
this%ISSubStat=1;exit outer
end if
if(lbound(this%RMInOut,1)/=0) Then
this%CSMSG="ERROR"
this%ISSubStat=1;exit outer
End if
if(ubound(this%RMInOut,1)/=this%isn) Then
this%CSMSG="ERROR"
this%ISSubStat=1;exit outer
End if
this%RMInOut(0,:)=0.0D0
!$ if(present(ISNThreads)) Then
!$ if(ISNThreads>size(this%RMInOUt,2)) Then
!$ call omp_set_num_threads(size(this%RMInOut,2))
!$ else
!$ call omp_set_num_threads(ISNThreads)
!$ End if
!$ else
!$ c1=omp_get_max_threads()
!$ if(c1>size(this%RMInout,2)) Then
!$ call omp_set_num_threads(size(this%RMInout,2))
!$ else
!$ call omp_set_num_threads(int(c1,4))
!$ End if
!$ end if
!$OMP PARALLEL DO PRIVATE(ss,dd,c2,c1)
Do c1=1,size(this%RMInOut,2)
Do c2=1,size(this%ivi,1)
ss=this%ivs(c2)
dd=this%ivd(c2)
ii=this%Ivi(c2)
this%RMInOut(ii,c1)=this%RMInOut(ii,c1)+this%RVP(c2)&&*(this%RMInOut(ss,c1)+this%RMInOut(dd,c1))
End Do
End Do
!$OMP END PARALLEL DO
End block outer
end Procedure
End SubModule Drop
Program Test
use Mod_Type
use Mod_Direct
Implicit none
Type(TestType) :: TST
integer :: dim=4876565, dim3=100, c1
real*8, target, allocatable :: rmtmp(:,:)
real*8 :: t0, t1
!$ call omp_set_nested(.TRUE.)
Allocate(TST%ivi(dim),TST%ivs(dim),TST%ivd(dim),TST&&%rvp(dim))
open(55,file="input.txt",action="read")
Do c1=1,dim
read(55,*) TST%ivi(c1),tst%ivs(c1),tst%ivd(c1),tst%rvp(c1)
end Do
tst%isn=maxval(tst%ivi)
Allocate(rmtmp(0:tst%isn,dim3),source=0.0D0)
!!@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
TST%RMInOut=>rmtmp
call TST%drop()
!!@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
!!call SubDrop(ISUBound1=Int(tst%isn,8),ISUBound2=Int(dim3,8),RMInout&
!! &=rmtmp,ivi=tst%ivi,ivs=tst%ivs,ivd=tst%ivd,rvp=tst%rvp)
End Program Test