HomeStore icon indicating copy to clipboard operation
HomeStore copied to clipboard

LogDev hang on stop

Open xiaoxichen opened this issue 1 year ago • 0 comments

log

[03/01/24 00:30:23-07:00] [D] [397827] [index_kv.cpp:70:get_blob_from_index_table] Failed to get from index table [route=0001:000000000001:0000000000000000]
[03/01/24 00:30:23-07:00] [W] [397827] [hs_blob_manager.cpp:207:_get_blob] [pg=1,shard=1,blob=0] Blob not found in index
[03/01/24 00:30:23-07:00] [D] [397827] [fixture_app.cpp:57:SetUp] Insert Blob to: 281474976710657
[03/01/24 00:30:23-07:00] [D] [397827] [hs_blob_manager.cpp:290:blob_put_get_blk_alloc_hints] [pg=1,shard=1,blob=n/a] Picked chunk_id=643
[03/01/24 00:30:23-07:00] [I] [397828] [replication_state_machine.cpp:35:on_pre_commit] on_pre_commit with lsn:5
[03/01/24 00:30:23-07:00] [I] [397852] [replication_state_machine.cpp:7:on_commit] applying raft log commit with lsn:5
[03/01/24 00:30:23-07:00] [I] [397827] [homestore.cpp:233:shutdown] Homestore shutdown is started
[03/01/24 00:30:23-07:00] [I] [397827] [cp_mgr.cpp:76:shutdown] Stopping cp timer
[03/01/24 00:30:23-07:00] [I] [397827] [iomgr_timer.cpp:126:operator()] Removing recurring global timer fd 45 device
[03/01/24 00:30:23-07:00] [I] [397827] [cp_mgr.cpp:81:shutdown] Trigger cp flush
[03/01/24 00:30:23-07:00] [I] [397827] [cp_mgr.cpp:84:shutdown] Trigger cp done
[03/01/24 00:30:23-07:00] [I] [397827] [iomgr_timer.cpp:126:operator()] Removing recurring global timer fd 17 device

GDB

#6  0x00005594f5613579 in homestore::LogDev::stop (this=0x5594f870d150) at /var/home/centos/.conan/data/homestore/5.1.9-31/oss/master/build/184931f8962d502a27b6183756ace784abafe634/src/lib/logstore/log_dev.cpp:130
130	        m_block_flush_q_cv.wait(lk, [&] { return m_stopped; });
(gdb) p m_stopped
$3 = false
(gdb) p m_is_flushing
$4 = std::atomic<bool> = { false }

the stop was blocked here. as m_is_flushing is false I tend to believe there is a racing that someone (w/o the flush lock) finished the flush and setting m_is_flushing to false after the compare_exchange call

        if (!m_is_flushing.compare_exchange_strong(expected_flushing, true, std::memory_order_acq_rel)) {
            // Flushing is blocked already, add it to the callback q
            if (m_block_flush_q == nullptr) { m_block_flush_q = sisl::VectorPool< flush_blocked_callback >::alloc(); }
            m_block_flush_q->emplace_back(cb);
            return false;
        }

Call stack

Thread 1 (Thread 0x7fb30dad7f80 (LWP 391796) "homestore_test"):
#0  __futex_abstimed_wait_common64 (private=0, cancel=true, abstime=0x0, op=393, expected=0, futex_word=0x5594f870d4b0) at ./nptl/futex-internal.c:57
#1  __futex_abstimed_wait_common (cancel=true, private=0, abstime=0x0, clockid=0, expected=0, futex_word=0x5594f870d4b0) at ./nptl/futex-internal.c:87
--Type <RET> for more, q to quit, c to continue without paging--
#2  __GI___futex_abstimed_wait_cancelable64 (futex_word=futex_word@entry=0x5594f870d4b0, expected=expected@entry=0, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:139
#3  0x00007fb30db6fa41 in __pthread_cond_wait_common (abstime=0x0, clockid=0, mutex=0x5594f870d460, cond=0x5594f870d488) at ./nptl/pthread_cond_wait.c:503
#4  ___pthread_cond_wait (cond=0x5594f870d488, mutex=0x5594f870d460) at ./nptl/pthread_cond_wait.c:627
#5  0x00005594f5627f4d in std::condition_variable::wait<homestore::LogDev::stop()::<lambda()> >(std::unique_lock<std::mutex> &, struct {...}) (this=0x5594f870d488, __lock=..., __p=...) at /usr/include/c++/11/condition_variable:103
#6  0x00005594f5613579 in homestore::LogDev::stop (this=0x5594f870d150) at /var/home/centos/.conan/data/homestore/5.1.9-31/oss/master/build/184931f8962d502a27b6183756ace784abafe634/src/lib/logstore/log_dev.cpp:130
#7  0x00005594f5381461 in homestore::LogStoreService::stop (this=0x5594f93eb100) at /var/home/centos/.conan/data/homestore/5.1.9-31/oss/master/build/184931f8962d502a27b6183756ace784abafe634/src/lib/logstore/log_store_service.cpp:103
#8  0x00005594f54743e3 in homestore::RaftReplService::stop (this=0x7fb2f0014080) at /var/home/centos/.conan/data/homestore/5.1.9-31/oss/master/build/184931f8962d502a27b6183756ace784abafe634/src/lib/replication/service/raft_repl_service.cpp:145
#9  0x00005594f52f6122 in homestore::HomeStore::shutdown (this=0x7fb304014250) at /var/home/centos/.conan/data/homestore/5.1.9-31/oss/master/build/184931f8962d502a27b6183756ace784abafe634/src/lib/homestore.cpp:240
#10 0x00005594f4f035a5 in homeobject::HSHomeObject::~HSHomeObject (this=0x5594f84d8dd0, __in_chrg=<optimized out>) at /sysroot/home/centos/HomeObject/src/lib/homestore_backend/hs_homeobject.cpp:233
#11 0x00005594f4f1c0a0 in std::destroy_at<homeobject::HSHomeObject> (__location=0x5594f84d8dd0) at /usr/include/c++/11/bits/stl_construct.h:88
#12 0x00005594f4f1bf76 in std::allocator_traits<std::allocator<homeobject::HSHomeObject> >::destroy<homeobject::HSHomeObject> (__a=..., __p=0x5594f84d8dd0) at /usr/include/c++/11/bits/alloc_traits.h:537
#13 0x00005594f4f1bbc7 in std::_Sp_counted_ptr_inplace<homeobject::HSHomeObject, std::allocator<homeobject::HSHomeObject>, (__gnu_cxx::_Lock_policy)2>::_M_dispose (this=0x5594f84d8dc0) at /usr/include/c++/11/bits/shared_ptr_base.h:528
#14 0x00005594f4e2c978 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x5594f84d8dc0) at /usr/include/c++/11/bits/shared_ptr_base.h:168
#15 0x00005594f4e282af in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=0x5594f85c8d68, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr_base.h:705
#16 0x00005594f4e62556 in std::__shared_ptr<homeobject::HomeObject, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=0x5594f85c8d60, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr_base.h:1154
#17 0x00005594f4e62576 in std::shared_ptr<homeobject::HomeObject>::~shared_ptr (this=0x5594f85c8d60, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr.h:122
#18 0x00005594f4e625ae in TestFixture::~TestFixture (this=0x5594f85c8c70, __in_chrg=<optimized out>) at /sysroot/home/centos/HomeObject/src/./lib/tests/fixture_app.hpp:45
#19 0x00005594f4ec6902 in TestFixture_GetUnknownShard_Test::~TestFixture_GetUnknownShard_Test (this=0x5594f85c8c70, __in_chrg=<optimized out>) at /sysroot/home/centos/HomeObject/src/lib/tests/ShardManagerTest.cpp:22
#20 0x00005594f4ec6922 in TestFixture_GetUnknownShard_Test::~TestFixture_GetUnknownShard_Test (this=0x5594f85c8c70, __in_chrg=<optimized out>) at /sysroot/home/centos/HomeObject/src/lib/tests/ShardManagerTest.cpp:22
#21 0x00005594f6b8f512 in testing::Test::DeleteSelf_ (this=0x5594f85c8c70) at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/include/gtest/gtest.h:336
#22 0x00005594f6b9fa19 in testing::internal::HandleSehExceptionsInMethodIfSupported<testing::Test, void> (object=0x5594f85c8c70, method=(void (testing::Test::*)(testing::Test * const)) 0x5594f6b8f4e4 <testing::Test::DeleteSelf_()>, location=0x5594f6e9394f "the test fixture's destructor") at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:2612
#23 0x00005594f6b992e9 in testing::internal::HandleExceptionsInMethodIfSupported<testing::Test, void> (object=0x5594f85c8c70, method=(void (testing::Test::*)(testing::Test * const)) 0x5594f6b8f4e4 <testing::Test::DeleteSelf_()>, location=0x5594f6e9394f "the test fixture's destructor") at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:2648
#24 0x00005594f6b74e78 in testing::TestInfo::Run (this=0x5594f8367a70) at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:2842
#25 0x00005594f6b75830 in testing::TestSuite::Run (this=0x5594f8368a70) at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:3015
#26 0x00005594f6b85e21 in testing::internal::UnitTestImpl::RunAllTests (this=0x5594f8360900) at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:5920
#27 0x00005594f6ba09a6 in testing::internal::HandleSehExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool> (object=0x5594f8360900, method=(bool (testing::internal::UnitTestImpl::*)(testing::internal::UnitTestImpl * const)) 0x5594f6b859f2 <testing::internal::UnitTestImpl::RunAllTests()>, location=0x5594f6e94210 "auxiliary test code (environments or event listeners)") at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:2612
#28 0x00005594f6b9a427 in testing::internal::HandleExceptionsInMethodIfSupported<testing::internal::UnitTestImpl, bool> (object=0x5594f8360900, method=(bool (testing::internal::UnitTestImpl::*)(testing::internal::UnitTestImpl * const)) 0x5594f6b859f2 <testing::internal::UnitTestImpl::RunAllTests()>, location=0x5594f6e94210 "auxiliary test code (environments or event listeners)") at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:2648
#29 0x00005594f6b843e7 in testing::UnitTest::Run (this=0x5594f80a76a0 <testing::UnitTest::GetInstance()::instance>) at /var/home/centos/.conan/data/gtest/1.14.0/_/_/build/f582fedd6c1f55261c196855540ed7a5370c3a33/src/googletest/src/gtest.cc:5484
#30 0x00005594f4ed2014 in RUN_ALL_TESTS () at /var/home/centos/.conan/data/gtest/1.14.0/_/_/package/f582fedd6c1f55261c196855540ed7a5370c3a33/include/gtest/gtest.h:2317
#31 0x00005594f4ed0cb8 in main (argc=5, argv=0x7fff8b248568) at /sysroot/home/centos/HomeObject/src/lib/tests/fixture_app.cpp:90

xiaoxichen avatar Mar 01 '24 15:03 xiaoxichen