Hello everyone, I'm using OpenStack Mitaka and ceph-jewel-10.2.6 to do snapshot actions, and sometimes the deadlock condition below occurs. ps -ef | grep cinder-volume gdb -q python-dbg -p xx I found that 3 threads are racing for the lock: Thread 1 (Thread 0x7ffab0fac740 (LWP 1029)): #0 0x00007ffab07bf6d5 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00007ffa90c936d0 in Wait (mutex=..., this=0x7ffd91027aa0) at ./common/Cond.h:56 #2 C_SaferCond::wait (this=this@entry=0x7ffd91027a40) at ./common/Cond.h:202 #3 0x00007ffa90cbd7fc in librbd::ImageState::refresh_if_required (this=0x3fb2fb0) at librbd/ImageState.cc:354 #4 0x00007ffa90ce3c10 in librbd::list_children_info (ictx=ictx@entry=0x5257530, parent_spec=..., image_info=std::map with 0 elements) at librbd/internal.cc:926 #5 0x00007ffa90ce5614 in librbd::list_children (ictx=ictx@entry=0x5257530, names=std::set with 0 elements) at librbd/internal.cc:892 #6 0x00007ffa90c79e93 in rbd_list_children (image=0x5257530, pools=pools@entry=0x584fbb0 "hJ\n\260\372\177", pools_len=pools_len@entry=0x7ffd91028118, images=images@entry=0x55f9640 "\250K\n\260\372\177", images_len=images_len@entry=0x7ffd91028120) at librbd/librbd.cc:2423 #7 0x00007ffa9a5a3620 in __pyx_pf_3rbd_5Image_76list_children (__pyx_v_self=0x59110a8) at rbd.c:15450 #8 __pyx_pw_3rbd_5Image_77list_children (__pyx_v_self=0x59110a8, unused=) at rbd.c:15329 #9 0x00007ffab0ab487c in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0 #10 0x00007ffab0ab6e3d in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0 #11 0x00007ffab0ab433c in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0 #12 0x00007ffab0ab6e3d in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0 #13 0x00007ffab0ab433c in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0 #14 0x00007ffab0ab6e3d in PyEval_EvalCodeEx () from /lib64/libpython2.7.so.1.0 #15 0x00007ffab0ab433c in PyEval_EvalFrameEx () from /lib64/libpython2.7.so.1.0 #16 0x00007ffab0ab6e3d in PyEval_EvalCodeEx () from 
/lib64/libpython2.7.so.1.0 .... Thread 12 (Thread 0x7ffa84ac4700 (LWP 1479895)): #0 0x00007ffab07bf03e in pthread_rwlock_wrlock () from /lib64/libpthread.so.0 #1 0x00007ffa90cc32dc in RWLock::get_write (this=0x52577e0, lockdep=) at ./common/RWLock.h:123 #2 0x00007ffa90d4e7b3 in WLocker (lock=..., this=) at ./common/RWLock.h:183 #3 librbd::image::RefreshRequest::apply (this=this@entry=0x55f9850) at librbd/image/RefreshRequest.cc:860 #4 0x00007ffa90d4f778 in librbd::image::RefreshRequest::handle_v2_apply (this=0x55f9850, result=result@entry=0x7ffa84ac3b2c) at librbd/image/RefreshRequest.cc:655 #5 0x00007ffa90d4f92b in librbd::util::detail::C_StateCallbackAdapter, &librbd::image::RefreshRequest::handle_v2_apply, true>::complete (this=0x7ff9e4002f50, r=0) at ./librbd/Utils.h:66 #6 0x00007ffa90cb7644 in ContextWQ::process (this=0x3fd9bf0, ctx=0x7ff9e4002f50) at ./common/WorkQueue.h:608 #7 0x00007ffa90eae54e in ThreadPool::worker (this=0x5180a60, wt=0x56c5430) at common/WorkQueue.cc:128 #8 0x00007ffa90eaf420 in ThreadPool::WorkThread::entry (this=) at common/WorkQueue.h:445 #9 0x00007ffab07bbdc5 in start_thread () from /lib64/libpthread.so.0 #10 0x00007ffaafde176d in clone () from /lib64/libc.so.6 Thread 13 (Thread 0x7ffa852c5700 (LWP 1479894)): #0 0x00007ffab07bee24 in pthread_rwlock_rdlock () from /lib64/libpthread.so.0 #1 0x00007ffa90c89d2f in RWLock::get_read (this=0x52576c8) at ./common/RWLock.h:100 #2 0x00007ffa90cce5d4 in RLocker (lock=..., this=) at ./common/RWLock.h:161 #3 librbd::ImageWatcher::handle_payload (this=this@entry=0x7ff9e4001d10, payload=..., ack_ctx=ack_ctx@entry=0x7ff9e4001690) at librbd/ImageWatcher.cc:673 #4 0x00007ffa90ccee3c in operator() (payload=..., this=) at ./librbd/ImageWatcher.h:218 #5 internal_visit (operand=..., this=) at /usr/include/boost/variant/variant.hpp:1017 #6 visitation_impl_invoke_impl::HandlePayloadVisitor const>, void const*, librbd::watch_notify::ReleasedLockPayload> (storage=0x7ffa852c4838, visitor=) at 
/usr/include/boost/variant/detail/visitation_impl.hpp:130 #7 visitation_impl_invoke::HandlePayloadVisitor const>, void const*, librbd::watch_notify::ReleasedLockPayload, boost::variant::has_fallback_type_> (internal_which=, t=0x0, storage=0x7ffa852c4838, visitor=) at /usr/include/boost/variant/detail/visitation_impl.hpp:173 #8 visitation_impl, boost::detail::variant::visitation_impl_step, librbd::watch_notify::AcquiredLockPayload, boost::mpl::l_item, librbd::watch_notify::ReleasedLockPayload, boost::mpl::l_item, librbd::watch_notify::RequestLockPayload, boost::mpl::l_item, librbd::watch_notify::HeaderUpdatePayload, boost::mpl::l_item, librbd::watch_notify::AsyncProgressPayload, boost::mpl::l_item, librbd::watch_notify::AsyncCompletePayload, boost::mpl::l_item, librbd::watch_notify::FlattenPayload, boost::mpl::l_item, librbd::watch_notify::ResizePayload, boost::mpl::l_item, librbd::watch_notify::SnapCreatePayload, boost::mpl::l_item, librbd::watch_notify::SnapRemovePayload, boost::mpl::l_item, librbd::watch_notify::SnapRenamePayload, boost::mpl::l_item, librbd::watch_notify::SnapProtectPayload, boost::mpl::l_item, librbd::watch_notify::SnapUnprotectPayload, boost::mpl::l_item, librbd::watch_notify::RebuildObjectMapPayload, boost::mpl::l_item, librbd::watch_notify::RenamePayload, boost::mpl::l_item, librbd::watch_notify::UnknownPayload, boost::mpl::l_end> > > > > > > > > > > > > > > > >, boost::mpl::l_iter >, boost::detail::variant::invoke_visitor::HandlePayloadVisitor const>, void const*, boost::variant::has_fallback_type_> (no_backup_flag=..., storage=0x7ffa852c4838, visitor=, logical_which=, internal_which=) at /usr/include/boost/variant/detail/visitation_impl.hpp:256 #9 internal_apply_visitor_impl::HandlePayloadVisitor const>, void const*> ( storage=0x7ffa852c4838, visitor=, logical_which=, internal_which=) at /usr/include/boost/variant/variant.hpp:2326 #10 internal_apply_visitor::HandlePayloadVisitor const> > ( visitor=, this=0x7ffa852c4830) at 
/usr/include/boost/variant/variant.hpp:2348 #11 apply_visitor::HandlePayloadVisitor const> (visitor=..., this=0x7ffa852c4830) at /usr/include/boost/variant/variant.hpp:2370 #12 apply_visitor::HandlePayloadVisitor, boost::variant const> (visitable=..., visitor=...) at /usr/include/boost/variant/detail/apply_visitor_unary.hpp:76 #13 librbd::ImageWatcher::process_payload (this=this@entry=0x7ff9e4001d10, notify_id=notify_id@entry=1554778161239, handle=handle@entry=140711248795552, payload=..., r=r@entry=0) at librbd/ImageWatcher.cc:974 #14 0x00007ffa90ccf007 in librbd::ImageWatcher::handle_notify (this=0x7ff9e4001d10, notify_id=1554778161239, handle=140711248795552, bl=...) at librbd/ImageWatcher.cc:1002 #15 0x00007ffa9d4581b5 in ?? () from /lib64/librados.so.2 #16 0x00007ffa9d4687cc in ?? () from /lib64/librados.so.2 #17 0x00007ffa9d4280c9 in ?? () from /lib64/librados.so.2 #18 0x00007ffa9d515546 in ?? () from /lib64/librados.so.2 #19 0x00007ffab07bbdc5 in start_thread () from /lib64/libpthread.so.0 #20 0x00007ffaafde176d in clone () from /lib64/libc.so.6 I found this: librbd: possible deadlock with flush if refresh in-progress https://github.com/ceph/ceph/commit/547e867628975c7144590e9332aa62b0ef82a433#diff-7a45ddfe5905a26e5abed55fe6a17b1d but I am not sure whether this bugfix applies to my condition. Thanks a lot if anyone can give advice. ^ _ ^