ROMS runtime error on Cray XT3

Report or discuss software problems and other woes

Moderators: arango, robertson

Post Reply
Message
Author
jasont
Posts: 1
Joined: Fri Jun 01, 2007 4:36 am
Location: University of Western Australia

ROMS runtime error on Cray XT3

#1 Unread post by jasont »

Hi, I've got a runtime error. Any ideas? It's a Cray XT3 using CNL. I've run it with strace to see the error messages:


execve("./oceanM", ["oceanM", "ROMS/External/ocean_upwelling.in"...], [/* 129 vars */]execve("./oceanM", ["oceanM", "ROMS/External/ocean_upwelling.in"...], [/* 129 vars */]) = 0
uname() = 0
{sys="Linux", node="nid00177", ...}) = 0
brk(0) = 0xb3e000
brk(0xb3ef30) = 0xb3ef30
arch_prctl(ARCH_SET_FS, 0xb3e860) = 0
set_tid_address(0xb3e8f0) = 4535
rt_sigaction(SIGRTMIN, {0x5ae900, [], SA_RESTORER|SA_SIGINFO, 0x5ae5f0}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x5ae850, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x5ae5f0}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM_INFINITY}) = 0
_sysctl({{CTL_KERN, KERN_VERSION}, 2, 0x7fffffffc9f0, 35, (nil), 0}) = 0
uname(brk(0xb5ff30) = 0xb5ff30
brk(0xb60000) = 0xb60000
{sys="Linux", node="nid00176", ...}) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM_INFINITY}) = 0
brk(0fcntl(108, F_SETLKW, {type=F_WRLCK, whence=SEEK_SET, start=0, len=0}) = 0
write(108, "\3\0\0\0\267\21\0\0\0\0\0\0\0\0\0\0", 16) = 16
read(110, "\0\0\0\0\377\177\0\0\10\0\0\0\0\0\0\0", 16) = 16
read(110, "}\22\0\0\0\0\0\0", 8) = 8
fcntl(108, F_SETLK, {type=F_UNLCK, whence=SEEK_SET, start=0, len=0}) = 0
) = 0xb3e000
open("/var/spool/alps/places4733", O_RDONLY) = 0
lseek(0, 0, SEEK_END) = 160
lseek(0, 0, SEEK_SET) = 0
mmap(NULL, 160, PROT_READ, MAP_SHARED, 0, 0) = 0x2aaaaaaab000
munmap(0x2aaaaaaab000, 160) = 0
close(0) = 0
read(100, "\2\0\0\0\1\0\0\0", 8) = 8
open("/proc/cray_xt/nid", O_RDONLY) = 0
read(0, "177\n", 64) = 4
close(0) = 0
mmap(NULL, 16704, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x2aaaaaaac000
brk(0xb3ef30mmap(NULL, 1589247, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x2aaaaaab1000
pipe([0, 4]) = 0
pipe([5, 6]) = 0
clone() = 0xb3ef30
arch_prctl(ARCH_SET_FS, 0xb3e860) = 0
child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0xb3e8f0) = 4536
close(4) = 0
close(5) = 0
set_tid_address(0xb3e8f0rt_sigaction(SIGCHLD, {0x5a5fe4, [], SA_RESTORER|SA_NOCLDSTOP, 0x5ae5f0}, NULL, 8) = 0
rt_sigaction(SIGHUP, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGINT, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGTERM, {SIG_IGN}, NULL, 8) = 0
) = 4219
rt_sigaction(SIGABRT, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGUSR1, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGUSR2, {SIG_IGN}, NULL, 8) = 0
read(0, "\0\0\0\0\1\0\0\0\0\0\30\0", 12) = 12
write(6, "\0\0\0\0\1\0\0\0\0\0\30\0", 12) = 12
fcntl(108, F_SETLKW, {type=F_WRLCK, whence=SEEK_SET, start=0, len=0}) = 0
write(108, "\5\0\0\0\267\21\0\0\0\0\0\0\0\0\0\0", 16) = 16
read(110, rt_sigaction(SIGRTMIN, {0x5ae900, [], SA_RESTORER|SA_SIGINFO, 0x5ae5f0}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0x5ae850, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0x5ae5f0}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM_INFINITY}) = 0
_sysctl({{CTL_KERN, KERN_VERSION}, 2, 0x7fffffffc9f0, 35, (nil), 0}) = 0
brk(0xb5ff30) = 0xb5ff30
brk(0xb60000) = 0xb60000
getrlimit(RLIMIT_STACK, {rlim_cur=8192*1024, rlim_max=RLIM_INFINITY}) = 0
fcntl(108, F_SETLKW, {type=F_WRLCK, whence=SEEK_SET, start=0, len=0}) = 0
write(108, "\3\0\0\0{\20\0\0\0\0\0\0\0\0\0\0", 16) = 16
read(110, "\0\0\0\0\377\177\0\0\10\0\0\0\0\0\0\0", 16) = 16
read(110, "}\22\0\0\0\0\0\0", 8) = 8
fcntl(108, F_SETLK, {type=F_UNLCK, whence=SEEK_SET, start=0, len=0}) = 0
open("/var/spool/alps/places4733", O_RDONLY) = 4
lseek(4, 0, SEEK_END) = 176
lseek(4, 0, SEEK_SET) = 0
mmap(NULL, 176, PROT_READ, MAP_SHARED, 4, 0) = 0x2aaaaaaab000
munmap(0x2aaaaaaab000, 176) = 0
close(4) = 0
read(100, "\2\0\0\0\0\0\0\0", 8) = 8
open("/proc/cray_xt/nid", O_RDONLY) = 4
read(4, "176\n", 64) = 4
close(4) = 0
mmap(NULL, 16704, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x2aaaaaaac000
mmap(NULL, 1589247, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0) = 0x2aaaaaab1000
pipe([4, 5]) = 0
pipe([6, 7]) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0xb3e8f0) = 4220
close(5) = 0
close(6) = 0
rt_sigaction(SIGCHLD, {0x5a5fe4, [], SA_RESTORER|SA_NOCLDSTOP, 0x5ae5f0}, NULL, 8) = 0
rt_sigaction(SIGHUP, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGINT, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGTERM, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGABRT, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGUSR1, {SIG_IGN}, NULL, 8) = 0
rt_sigaction(SIGUSR2, {SIG_IGN}, NULL, 8) = 0
read(4, "\0\0\0\0\1\0\0\0%eZ\0", 12) = 12
write(7, "\0\0\0\0\1\0\0\0%eZ\0", 12) = 12
fcntl(108, F_SETLKW, {type=F_WRLCK, whence=SEEK_SET, start=0, len=0}) = 0
write(108, "\5\0\0\0{\20\0\0\0\0\0\0\0\0\0\0", 16) = 16
read(110, "\0\0\0\0 \0\0\0\0\0\0\0\0\0\0\0", 16) = 16
"\0\0\0\0 \0\0\0\0\0\0\0\0\0\0\0", 16) = 16
fcntl(108, F_SETLK, {type=F_UNLCK, whence=SEEK_SET, start=0, len=0}) = 0
read(4, "\0\0\0\0\1\0\0\0%eZ\0", 12) = 12
write(7, "\0\0\0\0\1\0\0\0%eZ\0", 12) = 12
rt_sigsuspend([]fcntl(108, F_SETLK, {type=F_UNLCK, whence=SEEK_SET, start=0, len=0}) = 0
read(0, "\0\0\0\0\1\0\0\0\0\0\30\0", 12) = 12
write(6, "\0\0\0\0\1\0\0\0\0\0\30\0", 12) = 12
rt_sigsuspend([] Process Information:

Node # 0 (pid= 4220) is active.
Node # 1 (pid= 4536) is active.

Model Input Parameters: ROMS/TOMS version 3.0
Wednesday - December 3, 2008 - 1:00:45 PM
-----------------------------------------------------------------------------

Wind-Driven Upwelling/Downwelling over a Periodic Channel

Operating system : Linux
CPU/hardware : x86_64
Compiler system : ftn
Compiler command : /opt/cray/xt-asyncpe/1.2/bin/ftn
Compiler flags :

Input Script : ROMS/External/ocean_upwelling.in

SVN Root URL : https://www.myroms.org/svn/src/trunk
SVN Revision : 275M

Local Root : /lus/nid00036/jasont/ROMS-new
Header Dir : ./ROMS/Include
Header file : upwelling.h
Analytical Dir: /lus/nid00036/jasont/ROMS-new/ROMS/Functionals

Resolution, Grid 01: 0041x0080x016, Parallel Nodes: 2, Tiling: 001x001

ROMS/TOMS: Wrong choice of domain 01 partition or number of parallel threads.
NtileI * NtileJ must be equal to the number of parallel nodes.
Change -np value to mpirun or
change domain partition in input script.

Elapsed CPU time (seconds):

Node # 0 CPU: 0.017
Node # 1 CPU: 0.017
*** glibc detected *** oceanM: double free or corruption (!prev): 0x0000000000b4b8d0 ***
*** glibc detected *** oceanM: double free or corruption (!prev): 0x0000000000b4b3e0 ***
======= Backtrace: =========
[0x611941]
[0x612e9a]
[0x47febd]
======= Backtrace: =========
======= Memory map: ========
[0x611941]
00400000-006b8000 r-xp 00000000 def:4d662 100319710 /lus/nid00036/jasont/ROMS-new/oceanM
007b8000-0084e000 rwxp 002b8000 def:4d662 100319710 /lus/nid00036/jasont/ROMS-new/oceanM
0084e000-00b60000 rwxp 0084e000 00:00 0 [heap]
2aaaaaaac000-2aaaaaab1000 rwxs 00000000 00:09 8425 /dev/zero (deleted)
2aaaaaab1000-2aaaaac35000 rwxs 00000000 00:09 8426 /dev/zero (deleted)
2aaaaac35000-2aaaaae36000 rwxp 2aaaaac35000 00:00 0
2aaaaae36000-2aaaab0b7000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaab0b7000-2aaaaecbf000 rwxp 2aaaab0b7000 00:00 0
2aaaaecbf000-2aaaaed00000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaaed00000-2aaaaed41000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaaed41000-2aaaaed82000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaaed82000-2aaaaf02a000 rwxp 2aaaaed82000 00:00 0
2aaaaf100000-2aaaaf126000 rwxp 2aaaaf100000 00:00 0
2aaaaf126000-2aaaaf200000 ---p 2aaaaf126000 00:00 0
2aaaaf22a000-2aaaaf32b000 rwxp 2aaaaf22a000 00:00 0
7ffffffe9000-7ffffffff000 rwxp 7ffffffe9000 00:00 0 [stack]
ffffffffff600000-ffffffffffe00000 ---p 00000000 00:00 0 [vdso]
[0x612e9a]
[0x47febd]
======= Memory map: ========
00400000-006b8000 r-xp 00000000 def:4d662 100319710 /lus/nid00036/jasont/ROMS-new/oceanM
007b8000-0084e000 rwxp 002b8000 def:4d662 100319710 /lus/nid00036/jasont/ROMS-new/oceanM
0084e000-00b60000 rwxp 0084e000 00:00 0 [heap]
2aaaaaaac000-2aaaaaab1000 rwxs 00000000 00:09 9295 /dev/zero (deleted)
2aaaaaab1000-2aaaaac35000 rwxs 00000000 00:09 9296 /dev/zero (deleted)
2aaaaac35000-2aaaaae36000 rwxp 2aaaaac35000 00:00 0
2aaaaae36000-2aaaab0b7000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaab0b7000-2aaaaecbf000 rwxp 2aaaab0b7000 00:00 0
2aaaaecbf000-2aaaaed00000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaaed00000-2aaaaed41000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaaed41000-2aaaaed82000 rwxs 00000000 00:0f 935 /dev/ukbridge1
2aaaaed82000-2aaaaf12b000 rwxp 2aaaaed82000 00:00 0
2aaaaf200000-2aaaaf226000 rwxp 2aaaaf200000 00:00 0
2aaaaf226000-2aaaaf300000 ---p 2aaaaf226000 00:00 0
7ffffffe9000-7ffffffff000 rwxp 7ffffffe9000 00:00 0 [stack]
ffffffffff600000-ffffffffffe00000 ---p 00000000 00:00 0 [vdso]
<unfinished ...>
--- SIGCHLD (Child exited) @ 0 (0) ---
<unfinished ...>
--- SIGCHLD (Child exited) @ 0 (0) ---
<... rt_sigsuspend resumed> ) = -1 EINTR (Interrupted system call)
wait4(4536, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGABRT}], WNOHANG, NULL) = 4536
write(2, "_pmii_daemon(SIGCHLD): PE 1 exit"..., 48_pmii_daemon(SIGCHLD): PE 1 exit signal Aborted
) = 48
close(0) = 0
rt_sigreturn(0<... rt_sigsuspend resumed> ) = -1 EINTR (Interrupted system call)
wait4(4220, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGABRT}], WNOHANG, NULL) = 4220
write(2, "_pmii_daemon(SIGCHLD): PE 0 exit"..., 48_pmii_daemon(SIGCHLD): PE 0 exit signal Aborted
) = 48
close(4) = 0
rt_sigreturn(0) = -1 EINTR (Interrupted system call)
exit_group(134) = ?
Process 4219 detached
[NID 176]Apid 4733: initiated application termination
) = -1 EINTR (Interrupted system call)
exit_group(134) = ?
Process 4535 detached
Application 4733 resources: utime 0, stime 0

User avatar
kate
Posts: 4091
Joined: Wed Jul 02, 2003 5:29 pm
Location: CFOS/UAF, USA

Re: ROMS runtime error on Cray XT3

#2 Unread post by kate »

ROMS has told you your problem:
Resolution, Grid 01: 0041x0080x016, Parallel Nodes: 2, Tiling: 001x001

ROMS/TOMS: Wrong choice of domain 01 partition or number of parallel threads.
NtileI * NtileJ must be equal to the number of parallel nodes.

Post Reply