Watching this conversation between Mark and Hernan, I realized today that it has been approximately
10 years (no kidding!) since I eliminated all C$OMP PARALLEL DO directives from my code. Back then
I was motivated by making the code more explicit (and, frankly, by beauty as well), but it appears that
it helped to avoid problems later (although initially it brought some problems of its own).
Maybe you should do the same in Rutgers ROMS as well.
Obviously, the old semantics with the double-nested loops,
Code:
!$OMP PARALLEL DO PRIVATE(thread,chunk_size,Lstr,Lend) &
!$OMP& SHARED(numthreads,Nfloats)
do thread=...
do tile=...
is the product of evolution from the SGI C$DOACROSS directive -- back then it was the only way to
create parallel threads. OpenMP is actually more in line with the older Cray directives (also known as
"workshop" directives), which have two levels: C$DIR PARALLEL (outer) and C$DIR DO (inner).
Interestingly enough, BARRIER was acceptable inside SGI C$DOACROSS (which is most similar to
C$OMP PARALLEL DO), but it is no longer acceptable in the OpenMP standard. The same applies to
SINGLE ... END SINGLE.
As of today, there is fundamentally only one C$OMP PARALLEL ... C$OMP END PARALLEL region in my
code (there is another one inside the mpi_setup routine, executed by MPI rank=0 -- it is needed only to
find out how many parallel threads are to be used on rank=0, in order to broadcast that number to all other MPI nodes).
The elementary structure is
Code:
....
do tile=my_first_tile,my_last_tile,+1
call set_HUV (tile)
enddo
C$OMP BARRIER
do tile=my_last_tile,my_first_tile,-1
call omega (tile)
call rho_eos (tile)
enddo
C$OMP BARRIER
.....
.....
encapsulated into a C$OMP PARALLEL ... C$OMP END PARALLEL region one subroutine level above.
The encapsulation completely eliminates the need for SHARED and PRIVATE lists: the code relies on
default rules -- what is in common blocks and Fortran 90 modules is shared by default (unless explicitly
specified as threadprivate); all internal variables inside subroutines are private.
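For illustration, here is a minimal, hypothetical compilation unit (all names here are made up; it is not taken from the actual code) showing how these default rules play out for a routine called from inside the parallel region:
Code:
      subroutine demo_defaults (tile)    ! Hypothetical routine, shown
      implicit none                      ! only to illustrate defaults.
      integer tile, i                    ! Locals tile, i, cff: private
      real cff                           ! inside the parallel region.
      real h(100)                        ! Common-block data: shared by
      common /grid_data/ h               ! default across all threads...
      real time                          ! ...unless the block is made
      integer iic                        ! threadprivate, in which case
      common /priv_scalars/ time, iic    ! each thread has its own copy.
C$OMP THREADPRIVATE(/priv_scalars/)
      cff=0.
      do i=1,100
        cff=cff+h(i)                     ! reads shared data
      enddo
      iic=iic+1                          ! updates this thread's private
      time=cff*float(iic)                ! copies of iic and time
      return
      end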
The range my_tile_range = [my_first_tile,my_last_tile] is determined from the total number of tiles
and the number of the particular thread. In the actual code the zig-zag order of tile processing is
not inserted manually: all tile loops are literally written in the same direction, and "mpc" (which
post-processes the code after CPP) reverses the successive loops. This is done to keep the zig-zag
"always straight", regardless of whether some pieces of the code, including some of the tile loops,
are taken out by CPP.
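To make the tile-range arithmetic concrete, here is a small standalone sketch (not part of ROMS) that reuses the same expressions that appear in roms_init and roms_step below, assuming 4x4=16 tiles distributed among 4 threads:
Code:
      program tile_ranges                ! standalone sketch: partition
      implicit none                      ! NSUB_X*NSUB_E tiles among the
      integer, parameter :: NSUB_X=4, NSUB_E=4, numthreads=4
      integer trd, range, my_first, my_last
      range=(NSUB_X*NSUB_E+numthreads-1)/numthreads
      do trd=0,numthreads-1              ! threads; thread numbers and
        my_first=trd*range               ! tile numbers are zero-based,
        my_last=min(my_first+range-1, NSUB_X*NSUB_E-1)
        write(*,'(A,I2,A,I3,A,I3)') ' thread', trd,
     &              ' handles tiles', my_first, ' through', my_last
      enddo                              ! so this prints ranges 0..3,
      end                                ! 4..7, 8..11, and 12..15
Since roms_init refuses to run unless NSUB_X*NSUB_E is divisible by numthreads, in practice every thread gets an equal share of tiles; the min() above is only a safeguard.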
The entire "main.F" looks like this:
Code:
#include "cppdefs.h"
!! program main ! Open MP version of ROMS driver
implicit none ! with single parallel region using
integer ierr ! explicit barrier synchronization.
#include "param.h"
#ifdef MPI
real*8 tstart, tend
C$ integer level,req_lev
# include "mpif.h"
c** call system('uname -nmr')
ierr=1
C$ req_lev=MPI_THREAD_MULTIPLE
C$ call MPI_Init_thread (req_lev, level, ierr)
C$ mpi_master_only write(*,*) 'MPI thread support levels =',
C$ & req_lev,level
C$ ierr=0
if (ierr.eq.1) call MPI_Init (ierr)
call mpi_setup (ierr)
tstart=MPI_Wtime()
if (ierr.eq.0) then
#endif
call init_scalars (ierr) ! Initialize global scalars,
if (ierr.eq.0) then ! model tunable parameters,
C$ call omp_set_dynamic(.false.)
C$OMP PARALLEL ! fast-time averaging weights
call roms_thread ! for barotropic mode, and
C$OMP END PARALLEL ! launch the model in OpenMP
endif ! parallel regime.
#ifdef MPI
endif
call MPI_Barrier(ocean_grid_comm, ierr)
tend=MPI_Wtime()
mpi_master_only write(*,*) 'MPI_run_time =', tend-tstart
call MPI_Finalize (ierr)
#endif
stop
end
subroutine roms_thread
implicit none
#include "param.h"
#include "scalars.h"
! Note: Because there is
call start_timers () ! a possibility of I/O error
call roms_init ! occurring on some MPI nodes,
if (may_day_flag.ne.0) goto 99 ! but not simultaneously on
do iic=ntstart,ntstart+ntimes ! all, exiting is deferred
call roms_step ! until "may_day_flag" is
if (may_day_flag.ne.0 .and. ! summarized among the nodes
& diag_sync_flag) goto 99 ! and broadcasted by "diag"
enddo ! (c.f. the control logic
99 call stop_timers() ! here and in "diag").
C$OMP BARRIER
C$OMP MASTER
call closecdf
C$OMP END MASTER
return
end
subroutine roms_init
implicit none
integer trd, tile, my_first, my_last, range
C$ integer omp_get_thread_num, omp_get_num_threads
#include "param.h"
#include "scalars.h"
# include "ncvars.h"
#ifdef FLOATS
! grid.h is needed so that lonr and latr are readily available
# include "grid.h"
# include "floats.h"
# include "ncvars_floats.h"
#endif
numthreads=1
C$ numthreads=omp_get_num_threads()
trd=0
C$ trd=omp_get_thread_num()
proc(2)=trd
if (mod(NSUB_X*NSUB_E,numthreads).ne.0) then
C$OMP MASTER
mpi_master_only write(*,'(/3(1x,A,I3),A/)')
& '### ERROR: wrong choice of numthreads =', numthreads,
& 'while NSUB_X =', NSUB_X, 'NSUB_E =', NSUB_E,'.'
may_day_flag=8
C$OMP END MASTER
C$OMP BARRIER
goto 99 !--> EXIT
endif
iic=0 ! WARNING: This code is written
kstp=1 ! under assumption that the scalars
knew=1 ! on the left -- numthreads, iic,
#ifdef SOLVE3D
iif=1 ! kstp, knew, iif, nstp, nnew --
nstp=1 ! belong to a THREADPRIVATE common
nrhs=1 ! block, so there is no false
nnew=1 ! sharing here.
#endif
synchro_flag=.true.
diag_sync_flag=.false.
priv_count=0
range=(NSUB_X*NSUB_E+numthreads-1)/numthreads
my_first=trd*range
my_last=min(my_first + range-1, NSUB_X*NSUB_E-1)
#define my_tile_range my_first,my_last
do tile=my_tile_range ! Initialize (FIRST-TOUCH) model
call init_arrays (tile) ! global arrays (most of them are
enddo ! just set to zero).
C$OMP BARRIER
c--#define CR
CR write(*,*) '-11' MYID
#ifdef ANA_GRID
do tile=my_tile_range ! Set horizontal curvilinear
call ana_grid (tile) ! grid and model bathymetry
enddo ! (either analytically or read
C$OMP BARRIER ! from grid netCDF file).
#else
C$OMP MASTER
call get_grid
C$OMP END MASTER
C$OMP BARRIER
if (may_day_flag.ne.0) goto 99 !--> EXIT
#endif
do tile=my_tile_range ! Compute various metric terms
call setup_grid1 (tile) ! and their combinations.
enddo
C$OMP BARRIER
CR write(*,*) '-10' MYID
do tile=my_tile_range
call setup_grid2 (tile)
enddo
C$OMP BARRIER
CR write(*,*) '-9' MYID
#ifdef SOLVE3D
C$OMP MASTER ! Setup vertical grid-stretching
call set_scoord ! functions for S-coordinate
C$OMP END MASTER ! system
C$OMP BARRIER
if (may_day_flag.ne.0) goto 99
#endif
CR write(*,*) ' -8' MYID
#if (defined UV_VIS2 && defined VIS_GRID) ||\
(defined TS_DIF2 && defined DIF_GRID)
do tile=my_tile_range ! Rescale horizontal mixing
call visc_rescale (tile) ! coefficients according to
enddo ! local grid size.
C$OMP BARRIER
CR write(*,*) ' -7' MYID
#endif
#ifdef SOLVE3D
do tile=my_tile_range ! Create three-dimensional
call set_depth (tile) ! S-coordinate system, which
#ifdef LMD_KPP
call swr_frac (tile) ! may be needed by ana_init.
#endif
enddo
C$OMP BARRIER ! Here it is assumed that free
do tile=my_tile_range ! surface zeta is at rest state
call grid_stiffness (tile) ! (zeta=0). Also find and report
enddo ! extremal values of topographic
C$OMP BARRIER ! slope parameters "rx0", "rx1".
CR write(*,*) ' -6' MYID
#endif
! Set initial conditions for
#ifdef ANA_INITIAL
do tile=my_tile_range ! model prognostic variables,
call ana_init (tile) ! either analytically or read
enddo ! from netCDF file. Note that
C$OMP BARRIER
if (nrrec.gt.0) then ! because ana_init may also
#endif
#ifdef EXACT_RESTART
C$OMP MASTER ! setup environmental variables
call get_init (nrrec-1,2) ! (e.g. analytical boundary
C$OMP END MASTER ! forcing), call it first, even
C$OMP BARRIER ! in the case of restart run.
# ifdef SOLVE3D
do tile=my_tile_range
call set_depth (tile) !<-- needed to initialize Hz_bak
enddo
C$OMP BARRIER
# endif
#endif
C$OMP MASTER
call get_init (nrrec, 1)
C$OMP END MASTER
#ifdef ANA_INITIAL
endif !<-- nrrec.gt.0
#endif
C$OMP BARRIER
if (may_day_flag.ne.0) goto 99 !--> ERROR
CR write(*,*) ' -5' MYID
! Set initial model clock: at this
time=start_time ! moment "start_time" (global scalar)
tdays=time*sec2day ! is set by get_init or analytically
! copy it into threadprivate "time"
#ifdef SOLVE3D
do tile=my_tile_range ! Re-compute three-dimensional S-
call set_depth (tile) ! coordinate system: at this moment
enddo ! free surface has non-zero status
C$OMP BARRIER
CR write(*,*) ' -4' MYID
do tile=my_tile_range
call set_HUV (tile)
enddo
C$OMP BARRIER
CR write(*,*) ' -3' MYID
do tile=my_tile_range
call omega (tile)
call rho_eos (tile)
enddo
C$OMP BARRIER
CR write(*,*) ' -2' MYID
#endif
! Set up climatological environment: Set nudging coefficient for
!==== == ============== ============ sea-surface height and tracer
! climatology; create analytical tracer and sea-surface height
! climatology fields (if applicable); set bottom sediment grain
! size [m] and density [kg/m^3] used for bottom boundary layer
! formulation;
#if defined SPONGE || defined TCLIMATOLOGY \
|| (defined SG_BBL96 && defined ANA_BSEDIM)\
|| (defined TCLIMATOLOGY && defined ANA_TCLIMA)\
|| defined ANA_SSH
do tile=my_tile_range
# if defined SPONGE || defined TCLIMATOLOGY
call set_nudgcof (tile)
# endif
# if defined TCLIMATOLOGY && defined ANA_TCLIMA && defined SOLVE3D
call ana_tclima (tile)
# endif
# ifdef ANA_SSH
call ana_ssh (tile)
# endif
# if defined SG_BBL96 && defined ANA_BSEDIM
call ana_bsedim (tile)
# endif
enddo
C$OMP BARRIER
#endif
CR write(*,*) ' -1' MYID
! Read initial input data for forcing fields; tracer and sea surface
! climatology; bottom sediment grain size and density (if applicable)
! from input netCDF files. Recall that CPP-logic here is mutually
! exclusive with respect to calls to ana_tclima, ana_ssh, and ana_bsedim
! just above.
C$OMP MASTER
#ifdef ANA_GRID
call wrt_ana_grid
#endif
if (ldefhis .and. wrthis(indxTime)) call wrt_his
#ifdef FLOATS
! Initialization for Lagrangian floats
!-------------------------------------------------------
nrecflt=0 ! initialization done here and not in
ncidflt=-1 ! init_scalars since it must be done only
! once (whether child levels exist or not)
spval=1.E15 ! spval is the nodata flag for float variables
deltac2p=2.3 ! distance from the boundary at which a float
! is transferred from child to parent
deltap2c=2.5 ! same for transfer from parent to child
call init_arrays_floats
call init_floats
# ifdef SPHERICAL
call interp_r2d_type_ini (lonr(START_2D_ARRAY), iflon)
call interp_r2d_type_ini (latr(START_2D_ARRAY), iflat)
# else
call interp_r2d_type_ini ( xr(START_2D_ARRAY), iflon)
call interp_r2d_type_ini ( yr(START_2D_ARRAY), iflat)
# endif
# ifdef SOLVE3D
call fill_ini ! fills in trackaux for ixgrd,iygrd,izgrd
! and ifld (either izgrd or ifld is meaningful)
# endif
if (ldefflt) call wrt_floats
#endif /* FLOATS */
C$OMP END MASTER
C$OMP BARRIER
CR write(*,*) ' 0' MYID
if (may_day_flag.ne.0) goto 99 !--> EXIT
C$OMP MASTER
mpi_master_only write(*,'(/1x,A/)')
& 'main :: initialization complete, started time-stepping.'
C$OMP END MASTER
99 return
end
! ***** ********* ****** ******* *********
! *** *** * *** * * *** *** *** * *** *
! *** *** ** *** *** *** ***
! ***** *** *** *** *** ** ***
! *** *** ********* ****** ***
! *** *** *** *** *** *** ** ***
! ***** *** *** *** *** *** ***
subroutine roms_step
implicit none
integer trd, tile, my_first, my_last, range
#include "param.h"
#include "scalars.h"
#include "ncvars.h"
#ifdef FLOATS
# include "ncvars_floats.h"
# include "floats.h"
integer chunk_size_flt, Lstr,Lend, flt_str
common /floats_step/ flt_str
#endif
#ifdef GRID_LEVEL
integer iter
#endif
trd=proc(2)
range=(NSUB_X*NSUB_E+numthreads-1)/numthreads
my_first=trd*range
my_last=min(my_first + range-1, NSUB_X*NSUB_E-1)
! Increment time-step index and set model clock. Note that "time" set
! below corresponds to step "n" (denoted here as "nstp"), while counter
! "iic" corresponds to "n+1", so normally, assuming that time is
! counted from zero, the following relation holds: time=dt*(iic-1).
! Also note that the output history/restart/averages routines write
! time and all the fields at step "n" (not n+1), while the first
! element of structure "time_index" written into the files is actually
! iic-1, hence normally time=time_index*dt there. Same rule applies
! to the diagnostic routine "diag" which prints time and time step
! (actually iic-1) on the screen.
time=start_time+dt*float(iic-ntstart) !<-- corresp. to "nstp"
tdays=time*sec2day
#ifdef SOLVE3D
nstp=1+mod(iic-ntstart,2)
nrhs=nstp
nnew=3
#endif
#ifdef FLOATS
nfp1=mod(nfp1+1,NFT+1) ! rotate time
nf =mod(nf +1,NFT+1) ! indices for
nfm1=mod(nfm1+1,NFT+1) ! floats
nfm2=mod(nfm2+1,NFT+1)
nfm3=mod(nfm3+1,NFT+1)
C$OMP MASTER
flt_str=0
C$OMP END MASTER
#endif
! Read forcing and climatology
if (synchro_flag) then ! data. Note: this operation may
synchro_flag=.false. ! raise "may_day_flag" in the
C$OMP MASTER ! case of IO errors, however the
call get_forces ! error condition may occur on
C$OMP END MASTER ! some nodes, but not on all at
C$OMP BARRIER ! the same time, so the exit is
endif ! deferred until after broadcast
! of "may_day_flag" inside diag.
#ifdef SOLVE3D
do tile=my_tile_range ! Interpolate forcing data
call set_forces (tile) ! to model time and compute
# if defined SSH_TIDES || defined UV_TIDES
call set_tides (tile) ! surface fluxes.
# endif
call rho_eos (tile)
call set_HUV (tile)
call diag (tile)
#ifdef BIOLOGY
call bio_diag (tile)
#endif
enddo
C$OMP BARRIER
do tile=my_tile_range
call omega (tile)
# if defined ANA_VMIX
call ana_vmix (tile)
# elif defined LMD_MIXING
call lmd_vmix (tile)
c call lmd_kmix (tile)
# elif defined BVF_MIXING
call bvf_mix (tile)
# endif
enddo
C$OMP BARRIER
do tile=my_tile_range
call prsgrd (tile)
call rhs3d (tile)
call pre_step3d (tile)
# ifdef PRED_COUPLED_MODE
# ifdef UV_VIS2
call visc3d (tile)
# endif
# endif
# ifdef AVERAGES
call set_avg (tile)
# endif
enddo
C$OMP BARRIER
# ifdef CORR_COUPLED_MODE
do tile=my_tile_range
call set_HUV1 (tile)
enddo
C$OMP BARRIER
nrhs=3
nnew=3-nstp !!! WARNING
do tile=my_tile_range
call omega (tile)
call rho_eos (tile)
enddo
C$OMP BARRIER
do tile=my_tile_range
call prsgrd (tile)
call rhs3d (tile)
call step3d_uv1 (tile)
# ifdef UV_VIS2
call visc3d (tile)
# endif
enddo
C$OMP BARRIER
# endif
#endif /* SOLVE3D */
! Output block: write restart/history files.
!======= ====== ===== =============== ======
if ( iic.gt.ntstart .and. ( mod(iic-ntstart,nrst).eq.0
#ifdef EXACT_RESTART
& .or. mod(iic-ntstart+1,nrst).eq.0
#endif
& .or. (mod(iic-ntstart,nwrt).eq.0 .and. wrthis(indxTime))
#ifdef AVERAGES
& .or. (mod(iic-ntsavg,navg).eq.0 .and. wrtavg(indxTime))
#endif
#ifdef STATIONS
& .or. (mod(iic-ntstart,nsta).eq.0 .and. nstation.gt.0)
#endif
#ifdef FLOATS
& .or. (mod(iic-ntstart,nflt).eq.0 .and. nfloats.gt.0)
#endif
& )) then
C$OMP MASTER
if (mod(iic-ntstart,nrst).eq.0
#ifdef EXACT_RESTART
& .or. mod(iic-ntstart+1,nrst).eq.0
#endif
& ) nrecrst=nrecrst+1
if (mod(iic-ntstart,nwrt).eq.0) nrechis=nrechis+1
#ifdef AVERAGES
if (mod(iic-ntsavg,navg) .eq.0) nrecavg=nrecavg+1
#endif
#ifdef STATIONS
if (mod(iic-ntstart,nsta).eq.0) nrecstn=nrecstn+1
#endif
#ifdef FLOATS
if (mod(iic-ntstart,nflt).eq.0) nrecflt=nrecflt+1
#endif
if (mod(iic-ntstart,nrst).eq.0
#ifdef EXACT_RESTART
& .or. mod(iic-ntstart+1,nrst).eq.0
#endif
& ) call wrt_rst
if (mod(iic-ntstart,nwrt).eq.0 .and. wrthis(indxTime)) then
call wrt_his
c if (iic.gt.60) nwrt=1 !<-- useful for debugging
endif
#ifdef AVERAGES
if (mod(iic-ntsavg,navg) .eq.0 .and. wrtavg(indxTime))
& call wrt_avg
#endif
#ifdef STATIONS
if (mod(iic-ntstart,nsta).eq.0 .and. nstation.gt.0)
& call wrt_statn
#endif
#ifdef FLOATS
if (mod(iic-ntstart,nflt).eq.0 .and. nfloats.gt.0)
& call wrt_floats
diagfloats=.false.
#endif
C$OMP END MASTER
C$OMP BARRIER
if (iic-ntstart .gt. ntimes) goto 99 !--> DONE
endif
#ifdef FLOATS
! flag for diagnostic computation (for writing at next time step)
if (mod(iic-ntstart,nflt).eq.0) then
diagfloats=.true.
endif
#endif
! Solve the 2D equations for the barotropic mode.
!------ --- -- --------- --- --- ---------- -----
#ifdef SOLVE3D
do iif=1,nfast
#endif
#define FORW_BAK
#ifdef FORW_BAK
kstp=knew ! This might look a bit silly,
knew=kstp+1 ! since both branches of this
if (knew.gt.4) knew=1 ! if statement are identical.
if (mod(knew,2).eq.0) then ! Nevertheless, it makes sense,
do tile=my_tile_range ! since mpc will reverse one of
# ifndef SOLVE3D
call set_forces (tile) ! these loops to make zig-zag
# endif
call step2d (tile) ! tile-processing sequence.
enddo
C$OMP BARRIER
else
do tile=my_tile_range
# ifndef SOLVE3D
call set_forces (tile)
# endif
call step2d (tile)
enddo
C$OMP BARRIER
endif
#else
kstp=knew
knew=3
do tile=my_tile_range
# ifndef SOLVE3D
call set_forces (tile)
# endif
call step2d (tile)
enddo
C$OMP BARRIER
knew=3-kstp
do tile=my_tile_range
call step2d (tile)
enddo
C$OMP BARRIER
#endif
#ifdef SOLVE3D
enddo ! <-- iif
# ifdef PRED_COUPLED_MODE
do tile=my_tile_range ! This code segment is for
call set_HUV1 (tile) ! predictor-coupled version.
enddo
C$OMP BARRIER
nrhs=3
nnew=3-nstp
do tile=my_tile_range
call omega (tile)
call rho_eos (tile)
enddo
C$OMP BARRIER
do tile=my_tile_range
call prsgrd (tile)
call rhs3d (tile)
call step3d_uv1 (tile)
enddo
C$OMP BARRIER
# endif
do tile=my_tile_range ! Continue solution of the
call step3d_uv2 (tile) ! three-dimensional equations:
enddo ! finalize time step for momenta
C$OMP BARRIER ! and tracers
do tile=my_tile_range
call omega (tile)
call step3d_t (tile)
# if defined TS_DIF2 || defined TS_DIF4
call t3dmix (tile)
# endif
enddo
C$OMP BARRIER
#endif /* SOLVE3D */
#ifdef FLOATS
chunk_size_flt=32
do while (flt_str.lt.nfloats)
C$OMP CRITICAL
Lstr=flt_str+1
flt_str=Lstr+chunk_size_flt-1
C$OMP END CRITICAL
Lend=min(Lstr+chunk_size_flt-1,nfloats)
call step_floats (Lstr,Lend)
enddo
c** call step_floats (1,nfloats) ! serial version for debugging
#endif
99 return
end
Notes:
1. In the code above all time variables and time indices are THREADPRIVATE, that is, every thread has its own
clock and advances it; therefore "scalars.h" contains, among other things,
Code:
real*4 cpu_time(4)
real WallClock, time, tdays
integer proc(2), numthreads, iic, kstp, knew
#ifdef SOLVE3D
& , iif, nstp, nnew, nrhs
#endif
#ifdef FLOATS
& , nfp1, nf, nfm1, nfm2, nfm3
#endif
& , priv_count(16)
logical synchro_flag, diag_sync_flag
common /priv_scalars/ WallClock, cpu_time, proc,
& time, tdays, numthreads, iic, kstp, knew
#ifdef SOLVE3D
& , iif, nstp, nnew, nrhs
#endif
#ifdef FLOATS
& ,nfp1, nf, nfm1, nfm2, nfm3
#endif
& , priv_count, synchro_flag, diag_sync_flag
C$OMP THREADPRIVATE(/priv_scalars/)
2. As said above, "mpi_setup.F" contains among other things
Code:
#include "cppdefs.h"
#ifdef MPI
C$ subroutine master_num_threads (nthrds)
C$ implicit none
C$ integer nthrds, trd, omp_get_num_threads, omp_get_thread_num
C$ trd=omp_get_thread_num()
C$ if (trd.eq.0) nthrds=omp_get_num_threads()
C$ return
C$ end
subroutine mpi_setup (ierr)
implicit none
integer ierr, nsize, i_W, i_E, j_S, j_N, off_XI, off_ETA
# include "param.h"
# include "hidden_mpi_vars.h"
# include "ncvars.h"
# include "mpif.h"
C$ integer nthrds
ocean_grid_comm=MPI_COMM_WORLD
call MPI_Comm_size (ocean_grid_comm, nsize, ierr)
call MPI_Comm_rank (ocean_grid_comm, mynode, ierr)
#ifdef VERBOSE
mpi_master_only call system('echo Starting mpi_setup at `date`')
write(*,*) 'Starting mpi_setup, node ', mynode,' out of', nsize
#endif
exc_call_count=0
! Determine number of threads
C$ if (mynode.eq.0) then ! on MPI node rank=0, then
C$OMP PARALLEL SHARED(nthrds) ! broadcast it, so all others
C$ call master_num_threads (nthrds) ! can set the same number of
C$OMP END PARALLEL ! threads as the rank=0 node.
C$ endif
C$ call MPI_Bcast(nthrds, 1, MPI_INTEGER, 0, ocean_grid_comm, ierr)
C$ if (mynode.gt.0) then
C$ call omp_set_num_threads (nthrds)
C$ endif
....
return
end
#else
subroutine MPI_Setup_empty
end
#endif /* MPI */