I found multiple deadlocks which can be triggered by using large array jobs and/or job dependencies.
The first one can occur when a large array job has been submitted and is still in the process of being created, and a second job with a dependency on the first is submitted before
that creation has finished.
(gdb) info threads
19 Thread 16091 0x00007fbdc2350c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
18 Thread 16120 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
17 Thread 16119 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
16 Thread 16118 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
15 Thread 16117 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
14 Thread 16116 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
13 Thread 16115 0x00007fbdc2374c13 in *__GI___poll (fds=<value optimized out>, nfds=<value optimized out>, timeout=120000) at ../sysdeps/unix/sysv/linux/poll.c:87
12 Thread 16114 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
11 Thread 16113 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
10 Thread 16112 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
9 Thread 16111 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
8 Thread 16110 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
7 Thread 16109 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
6 Thread 16108 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
5 Thread 16107 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
4 Thread 16104 0x00007fbdc2350c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
3 Thread 16103 0x00007fbdc2350c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
2 Thread 16102 0x00007fbdc282838d in accept () at ../sysdeps/unix/syscall-template.S:82
* 1 Thread 16099 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
Threads 15 and 8 are deadlocked. This occurs when an array job is in
the process of being generated and a second job with a dependency on
the first job is submitted.
(gdb) thread 15
[Switching to thread 15 (Thread 16117)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
#1 0x00007fbdc2823179 in _L_lock_953 () from /lib/libpthread.so.0
#2 0x00007fbdc2822f9b in __pthread_mutex_lock (mutex=0x70cd800) at pthread_mutex_lock.c:61
#3 0x000000000040a6b4 in is_array (id=Unhandled dwarf expression opcode 0xf3
) at array_func.c:137
#4 0x0000000000425e3c in dispatch_request (sfds=65535, request=0x7fbd94048ff0) at process_request.c:927
#5 0x000000000040eaac in issue_to_svr (servern=Unhandled dwarf expression opcode 0xf3
) at issue_request.c:324
#6 0x000000000043a713 in send_depend_req (pjob=0x7c876d0, pparent=0x7fbd94043160, type=12, op=1, schedhint=0, postfunc=0x439d40 <post_doq>) at req_register.c:2262
#7 0x000000000043bbb9 in depend_on_que (pattr=Unhandled dwarf expression opcode 0xf3
) at req_register.c:1237
#8 0x000000000044a5c5 in svr_enquejob (pjob=0x7c876d0, has_sv_qs_mutex=0, prev_job_index=65) at svr_jobfunc.c:575
#9 0x00000000004121fc in job_clone_wt (cloned_id=Unhandled dwarf expression opcode 0xf3
) at job_func.c:1153
#10 0x000000000045acd2 in work_thread (a=0x7fff7fa38120) at u_threadpool.c:307
#11 0x00007fbdc28208ca in start_thread (arg=<value optimized out>) at pthread_create.c:300
#12 0x00007fbdc237fb6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#13 0x0000000000000000 in ?? ()
(gdb) print *(pthread_mutex_t*)0x70cd800
$2 = {__data = {__lock = 2, __count = 0, __owner = 16110, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\356>\000\000\001", '\000' <repeats 26 times>, __align = 2}
The lock is held by thread 8 (LWP 16110).
(gdb) thread 8
[Switching to thread 8 (Thread 16110)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
#1 0x00007fbdc2823179 in _L_lock_953 () from /lib/libpthread.so.0
#2 0x00007fbdc2822f9b in __pthread_mutex_lock (mutex=0x7fbdb02858b0) at pthread_mutex_lock.c:61
#3 0x000000000044a61d in lock_ai_mutex (pa=0x7fbdb02103f0, id=Unhandled dwarf expression opcode 0xf3
) at svr_jobfunc.c:2940
#4 0x000000000040a77c in get_array (id=Unhandled dwarf expression opcode 0xf3
) at array_func.c:164
#5 0x000000000043a342 in req_registerarray (preq=0x7fbdb82e0170) at req_register.c:718
#6 0x0000000000425e4c in dispatch_request (sfds=65535, request=0x7fbdb82e0170) at process_request.c:929
#7 0x000000000040eaac in issue_to_svr (servern=Unhandled dwarf expression opcode 0xf3
) at issue_request.c:324
#8 0x000000000043a713 in send_depend_req (pjob=0x7fbdb82e32e0, pparent=0x7fbdb80a5ed0, type=12, op=1, schedhint=0, postfunc=0x439d40 <post_doq>) at req_register.c:2262
#9 0x000000000043bbb9 in depend_on_que (pattr=Unhandled dwarf expression opcode 0xf3
) at req_register.c:1237
#10 0x000000000044a5c5 in svr_enquejob (pjob=0x7fbdb82e32e0, has_sv_qs_mutex=0, prev_job_index=-1) at svr_jobfunc.c:575
#11 0x0000000000438cf4 in req_commit (preq=0x7fbdb81bfb80) at req_quejob.c:2141
#12 0x0000000000426119 in dispatch_request (sfds=11, request=0x7fbdb81bfb80) at process_request.c:741
#13 0x000000000042687e in process_request (chan=Unhandled dwarf expression opcode 0xf3
) at process_request.c:662
#14 0x0000000000422da8 in process_pbs_server_port (sock=11, is_scheduler_port=0) at pbsd_main.c:410
#15 0x0000000000422e9a in start_process_pbs_server_port (new_sock=Unhandled dwarf expression opcode 0xf3
) at pbsd_main.c:541
#16 0x000000000045acd2 in work_thread (a=0x7fff7fa38120) at u_threadpool.c:307
#17 0x00007fbdc28208ca in start_thread (arg=<value optimized out>) at pthread_create.c:300
#18 0x00007fbdc237fb6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#19 0x0000000000000000 in ?? ()
(gdb) print *(pthread_mutex_t*)0x7fbdb02858b0
$3 = {__data = {__lock = 2, __count = 0, __owner = 16117, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000\365>\000\000\001", '\000' <repeats 26 times>, __align = 2}
The lock is held by thread 15 (LWP 16117).
Thread 15 holds the array's ai mutex (the one taken via lock_ai_mutex). I do not know where the lock is acquired,
but it is released in job_clone_wt shortly after all jobs are enqueued.
(gdb) thread 8
[Switching to thread 8 (Thread 16110)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) frame 4
#4 0x000000000040a77c in get_array (id=Unhandled dwarf expression opcode 0xf3
) at array_func.c:164
164 lock_ai_mutex(pa, __func__, NULL, LOGLEVEL);
Current language: auto
The current source language is "auto; currently c".
(gdb) print pa
$4 = (job_array *) 0x7fbdb02103f0
(gdb) thread 15
[Switching to thread 15 (Thread 16117)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
Current language: auto
The current source language is "auto; currently asm".
(gdb) frame 9
#9 0x00000000004121fc in job_clone_wt (cloned_id=Unhandled dwarf expression opcode 0xf3
) at job_func.c:1153
1153 if ((rc = svr_enquejob(pjobclone, FALSE, prev_index)))
Current language: auto
The current source language is "auto; currently c".
(gdb) print pa
$5 = (job_array *) 0x7fbdb02103f0
(gdb) info threads
19 Thread 13950 0x00007f3ef94cdc5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
18 Thread 14161 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
17 Thread 14160 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
16 Thread 14159 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
15 Thread 14158 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
14 Thread 14157 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
13 Thread 14156 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
12 Thread 14155 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
11 Thread 14154 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
10 Thread 14153 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
9 Thread 14152 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
8 Thread 14151 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
7 Thread 14150 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
6 Thread 14149 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
5 Thread 14148 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
4 Thread 14145 0x00007f3ef94cdc5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
3 Thread 14144 0x00007f3ef94cdc5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
2 Thread 14143 0x00007f3ef99a538d in accept () at ../sysdeps/unix/syscall-template.S:82
* 1 Thread 14142 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
(gdb) thread 17
[Switching to thread 17 (Thread 14160)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
#1 0x00007f3ef99a0179 in _L_lock_953 () from /lib/libpthread.so.0
#2 0x00007f3ef999ff9b in __pthread_mutex_lock (mutex=0x5729cb0) at pthread_mutex_lock.c:61
#3 0x0000000000448760 in lock_ji_mutex (pjob=0x5731100, id=Unhandled dwarf expression opcode 0xf3
) at svr_jobfunc.c:2863
#4 0x0000000000411370 in remove_job (aj=0xa972c0, pjob=0x5731100) at job_func.c:2562
#5 0x0000000000449aac in svr_dequejob (pjob=0x5731100, parent_queue_mutex_held=0) at svr_jobfunc.c:758
#6 0x0000000000411e17 in svr_job_purge (pjob=0x5731100) at job_func.c:1776
#7 0x000000000042c5c8 in handle_complete_second_time (ptask=Unhandled dwarf expression opcode 0xf3
) at req_jobobit.c:1800
#8 0x000000000045acd2 in work_thread (a=0x7fff14ff7180) at u_threadpool.c:307
#9 0x00007f3ef999d8ca in start_thread (arg=<value optimized out>) at pthread_create.c:300
#10 0x00007f3ef94fcb6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#11 0x0000000000000000 in ?? ()
(gdb) print *(pthread_mutex_t*)0x5729cb0
$4 = {__data = {__lock = 2, __count = 0, __owner = 14154, __nusers = 0, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000J7", '\000' <repeats 29 times>, __align = 2}
(gdb) thread 11
[Switching to thread 11 (Thread 14154)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
#1 0x00007f3ef99a0179 in _L_lock_953 () from /lib/libpthread.so.0
#2 0x00007f3ef999ff9b in __pthread_mutex_lock (mutex=0x51cde90) at pthread_mutex_lock.c:61
#3 0x000000000044a864 in lock_alljobs_mutex (aj=0xa972c0, id=Unhandled dwarf expression opcode 0xf3
) at svr_jobfunc.c:3017
#4 0x0000000000410aee in find_job_by_array (aj=0xa972c0, job_id=0x7f3ee41f99a0 "30979[34].glorim-1.cluster", get_subjob=1) at job_func.c:2140
#5 0x0000000000410e16 in svr_find_job (jobid=0x7f3ee41f99a0 "30979[34].glorim-1.cluster", get_subjob=1) at job_func.c:2245
#6 0x000000000042c53a in handle_complete_second_time (ptask=0x7f3ee40361d0) at req_jobobit.c:1765
#7 0x000000000045acd2 in work_thread (a=0x7fff14ff7180) at u_threadpool.c:307
#8 0x00007f3ef999d8ca in start_thread (arg=<value optimized out>) at pthread_create.c:300
#9 0x00007f3ef94fcb6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#10 0x0000000000000000 in ?? ()
(gdb) print *(pthread_mutex_t*)0x51cde90
$5 = {__data = {__lock = 2, __count = 0, __owner = 14160, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000P7\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) info threads
19 Thread 23839 0x00007f18f4083c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
18 Thread 23882 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
17 Thread 23881 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
16 Thread 23880 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
15 Thread 23879 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
14 Thread 23878 0x00007f18f40a7c13 in *__GI___poll (fds=<value optimized out>, nfds=<value optimized out>, timeout=120000) at ../sysdeps/unix/sysv/linux/poll.c:87
13 Thread 23877 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
12 Thread 23876 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
11 Thread 23875 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
10 Thread 23874 pthread_cond_timedwait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:211
9 Thread 23873 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
8 Thread 23872 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
7 Thread 23871 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
6 Thread 23870 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
5 Thread 23869 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
4 Thread 23868 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
3 Thread 23867 0x00007f18f4083c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
2 Thread 23866 0x00007f18f4083c5d in nanosleep () at ../sysdeps/unix/syscall-template.S:82
* 1 Thread 23865 0x00007f18f455b38d in accept () at ../sysdeps/unix/syscall-template.S:82
Current language: auto
The current source language is "auto; currently asm".
(gdb) thread 18
[Switching to thread 18 (Thread 23882)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
#1 0x00007f18f4556179 in _L_lock_953 () from /lib/libpthread.so.0
#2 0x00007f18f4555f9b in __pthread_mutex_lock (mutex=0x5eaf850) at pthread_mutex_lock.c:61
#3 0x000000000040a7d3 in get_array (id=0x7f18e1ff5490 "31008[].glorim-1.cluster") at array_func.c:163
#4 0x00000000004118fa in get_jobs_array (pjob_ptr=0x7f18e1ff6cd8) at job_func.c:2760
#5 0x0000000000411d1e in svr_job_purge (pjob=0x7f18d405c260) at job_func.c:1730
#6 0x000000000042c6f8 in handle_complete_second_time (ptask=Unhandled dwarf expression opcode 0xf3
) at req_jobobit.c:1800
#7 0x000000000045b0d2 in work_thread (a=0x7fff26c6a610) at u_threadpool.c:307
#8 0x00007f18f45538ca in start_thread (arg=<value optimized out>) at pthread_create.c:300
#9 0x00007f18f40b2b6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#10 0x0000000000000000 in ?? ()
(gdb) print *(pthread_mutex_t*)0x5eaf850
$1 = {__data = {__lock = 2, __count = 0, __owner = 23870, __nusers = 1, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}},
__size = "\002\000\000\000\000\000\000\000>]\000\000\001", '\000' <repeats 26 times>, __align = 2}
(gdb) thread 6
[Switching to thread 6 (Thread 23870)]#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
136 in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:136
#1 0x00007f18f4556179 in _L_lock_953 () from /lib/libpthread.so.0
#2 0x00007f18f4555f9b in __pthread_mutex_lock (mutex=0x7f18ec05a5f0) at pthread_mutex_lock.c:61
#3 0x000000000044a8bd in lock_ai_mutex (pa=0x7f18ec0598f0, id=Unhandled dwarf expression opcode 0xf3
) at svr_jobfunc.c:2986
#4 0x000000000040cb3c in next_array (iter=Unhandled dwarf expression opcode 0xf3
) at array_func.c:1895
#5 0x000000000040cbbe in update_array_statuses () at array_func.c:1706
#6 0x00000000004425f1 in req_stat_job_step2 (cntl=0x7f18ec05d940) at req_stat.c:586
#7 0x0000000000442875 in req_stat_job (preq=0x7f18ec05ed40) at req_stat.c:331
#8 0x0000000000426038 in dispatch_request (sfds=12, request=0x7f18ec05ed40) at process_request.c:963
#9 0x00000000004269ae in process_request (chan=Unhandled dwarf expression opcode 0xf3
) at process_request.c:662
#10 0x0000000000422ed8 in process_pbs_server_port (sock=12, is_scheduler_port=0) at pbsd_main.c:410
#11 0x0000000000422fca in start_process_pbs_server_port (new_sock=Unhandled dwarf expression opcode 0xf3
) at pbsd_main.c:541
#12 0x000000000045b0d2 in work_thread (a=0x7fff26c6a610) at u_threadpool.c:307
#13 0x00007f18f45538ca in start_thread (arg=<value optimized out>) at pthread_create.c:300
#14 0x00007f18f40b2b6d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#15 0x0000000000000000 in ?? ()