glusterfs icon indicating copy to clipboard operation
glusterfs copied to clipboard

shd does not start if gsyncd is missing

Open zenonp opened this issue 1 year ago • 4 comments

Trying to start healing of a standard replica, I got "Self-heal daemon is not running. Check self-heal daemon log file". But there was nothing to be found in glfsheal-volname.log. Then the troubleshooting and resolution below followed. The short of it is that volume heal calls gsyncd although the volume to be healed is not a georeplica and gsyncd should not be needed. shd then fails if gsyncd is not found.

If for some reason gsyncd is needed for healing normal non-geo replicas, then the bug is that the fedora glusterfs-server-11.1-1 rpm package lacks a "Requires: glusterfs-geo-replication = %{version}-%{release}".

Please note the very long lines in the trace; horizontal scrolling is needed.

strace -v -f gluster volume heal gv0 full

[pid 666675] execve("/usr/libexec/glusterfs/gsyncd", ["/usr/libexec/glusterfs/gsyncd", "--version"], ["LS_COLORS=rs=0:di=38;5;33:ln=38;"..., "SSH_CONNECTION=192.168.1.20 3634"..., "MODULES_RUN_QUARANTINE=LD_LIBRAR"..., "LANG=en_US.UTF-8", "HISTCONTROL=ignoredups", "HOSTNAME=zephyrosaurus.nettheatr"..., "which_declare=declare -f", "XDG_SESSION_ID=45", "MODULES_CMD=/usr/share/Modules/l"..., "USER=root", "PWD=/root", "HOME=/root", "SSH_CLIENT=192.168.1.20 36344 22", "LOADEDMODULES=", "SSH_TTY=/dev/pts/0", "MAIL=/var/spool/mail/root", "TERM=xterm-256color", "SHELL=/bin/bash", "SHLVL=1", "MANPATH=:", "MODULEPATH=/etc/scl/modulefiles:"..., "LOGNAME=root", "DBUS_SESSION_BUS_ADDRESS=unix:pa"..., "XDG_RUNTIME_DIR=/run/user/0", "MODULEPATH_modshare=/usr/share/M"..., "PATH=/usr/share/Modules/bin:/usr"..., "DEBUGINFOD_URLS=https://debuginf"..., "MODULESHOME=/usr/share/Modules", "HISTSIZE=1000", "LESSOPEN=||/usr/bin/lesspipe.sh "..., "BASH_FUNC_which%%=() {  ( alias;"..., "BASH_FUNC_module%%=() {  _module"..., "BASH_FUNC__module_raw%%=() {  un"..., "BASH_FUNC_switchml%%=() {  types"..., "BASH_FUNC_scl%%=() {  if [ \"$1\" "..., "BASH_FUNC_ml%%=() {  module ml \""..., "_=/usr/bin/strace", "_GLUSTERD_CALLED_=1"]) = -1 ENOENT (No such file or directory)
[pid 666675] exit_group(127)            = ?
[pid 666673] <... clone resumed>)       = 666675
[pid 666675] +++ exited with 127 +++
[pid 666673] wait4(666675, NULL, 0, NULL) = 666675
[pid 666673] munmap(0x7f46816f9000, 36864) = 0
[pid 666673] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 666673] --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=666675, si_uid=0, si_status=127, si_utime=0, si_stime=0} ---
[pid 666673] close(7)                   = 0
[pid 666673] close(8)                   = 0
[pid 666673] close(9)                   = 0
[pid 666673] close(10)                  = 0
[pid 666673] write(5, "[2024-06-22 18:55:12.097238 +000"..., 119) = 119
[pid 666673] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f466e721000
[pid 666673] mprotect(0x7f466e722000, 8388608, PROT_READ|PROT_WRITE) = 0
[pid 666673] clone(child_stack=0x7f466ef20030, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTIDstrace: Process 666676 attached
, parent_tid=[666676], tls=0x7f466ef21700, child_tidptr=0x7f466ef219d0) = 666676
[pid 666676] set_robust_list(0x7f466ef219e0, 24 <unfinished ...>
[pid 666673] rt_sigprocmask(SIG_BLOCK, ~[ILL ABRT BUS FPE SEGV CONT SYS RTMIN RT_1],  <unfinished ...>
[pid 666676] <... set_robust_list resumed>) = 0
[pid 666673] <... rt_sigprocmask resumed>[], 8) = 0
[pid 666676] futex(0x556eb9d7eb08, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1719082632, tv_nsec=97574499}, FUTEX_BITSET_MATCH_ANY <unfinished ...>
[pid 666673] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f466df20000
[pid 666673] mprotect(0x7f466df21000, 8388608, PROT_READ|PROT_WRITE) = 0
[pid 666673] clone(child_stack=0x7f466e71f030, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTIDstrace: Process 666677 attached
, parent_tid=[666677], tls=0x7f466e720700, child_tidptr=0x7f466e7209d0) = 666677
[pid 666673] openat(AT_FDCWD, "/proc/self/task/666677/comm", O_RDWR <unfinished ...>
[pid 666677] set_robust_list(0x7f466e7209e0, 24) = 0
[pid 666673] <... openat resumed>)      = 7
[pid 666677] write(5, "[2024-06-22 18:55:12.097786 +000"..., 149 <unfinished ...>
[pid 666673] write(7, "glfs_epoll000", 13 <unfinished ...>
[pid 666677] <... write resumed>)       = 149
[pid 666673] <... write resumed>)       = 13
[pid 666677] futex(0x556ebb3f4068, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 666673] close(7)                   = 0
[pid 666673] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 666673] rt_sigprocmask(SIG_BLOCK, ~[ILL ABRT BUS FPE SEGV CONT SYS RTMIN RT_1], [], 8) = 0
[pid 666673] mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7f466d71f000
[pid 666673] mprotect(0x7f466d720000, 8388608, PROT_READ|PROT_WRITE) = 0
[pid 666673] clone(child_stack=0x7f466df1e030, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTIDstrace: Process 666678 attached
, parent_tid=[666678], tls=0x7f466df1f700, child_tidptr=0x7f466df1f9d0) = 666678
[pid 666678] set_robust_list(0x7f466df1f9e0, 24 <unfinished ...>
[pid 666673] openat(AT_FDCWD, "/proc/self/task/666678/comm", O_RDWR <unfinished ...>
[pid 666678] <... set_robust_list resumed>) = 0
[pid 666673] <... openat resumed>)      = 7
[pid 666673] write(7, "glfs_epoll001", 13 <unfinished ...>
[pid 666678] write(5, "[2024-06-22 18:55:12.098039 +000"..., 149) = 149
[pid 666678] futex(0x556ebb3f4068, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 666673] <... write resumed>)       = 13
[pid 666673] close(7)                   = 0
[pid 666673] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 666673] futex(0x556ebb3f4068, FUTEX_WAKE_PRIVATE, 1) = 1
[pid 666673] futex(0x7f466e7209d0, FUTEX_WAIT, 666677, NULL <unfinished ...>
[pid 666677] <... futex resumed>)       = 0
[pid 666677] futex(0x556ebb3f4068, FUTEX_WAKE_PRIVATE, 1) = 0
[pid 666677] epoll_wait(3, [{events=EPOLLOUT, data={u32=0, u64=4294967296}}], 1, -1) = 1
[pid 666677] getsockopt(6, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
[pid 666677] getsockname(6, {sa_family=AF_UNIX}, [128 => 2]) = 0
[pid 666677] futex(0x556eb9d7eb08, FUTEX_WAKE_PRIVATE, 2147483647) = 1
[pid 666676] <... futex resumed>)       = 0
[pid 666677] epoll_ctl(3, EPOLL_CTL_MOD, 6, {events=EPOLLIN|EPOLLPRI|EPOLLOUT|EPOLLERR|EPOLLHUP|EPOLLONESHOT, data={u32=0, u64=4294967296}} <unfinished ...>
[pid 666678] <... futex resumed>)       = -1 EAGAIN (Resource temporarily unavailable)
[pid 666676] futex(0x556eb9d7eaa0, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 666678] futex(0x556ebb3f4068, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 666677] <... epoll_ctl resumed>)   = 0
[pid 666676] <... futex resumed>)       = 0
[pid 666678] <... futex resumed>)       = 0
[pid 666677] epoll_wait(3, [{events=EPOLLOUT, data={u32=0, u64=4294967296}}], 1, -1) = 1
[pid 666678] epoll_wait(3,  <unfinished ...>
[pid 666676] writev(6, [{iov_base="\200\0\0\224", iov_len=4}, {iov_base="\0\0\0\2\0\0\0\0\0\0\0\2\0\22\345\277\0\0\0\2\0\0\0\37\0\5\363\227\0\0\0\30"..., iov_len=64}, {iov_base="\0\0\0O\0\0\0\3\0\0\0\7\0\0\0\25cmd-str\0volume h"..., iov_len=84}], 3 <unfinished ...>
[pid 666677] futex(0x556ebb47e028, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 666676] <... writev resumed>)      = 152
[pid 666676] futex(0x556ebb47e028, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 666677] <... futex resumed>)       = 0
[pid 666676] <... futex resumed>)       = 1
[pid 666676] futex(0x556eb9d7eb88, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1719083112, tv_nsec=98579364}, FUTEX_BITSET_MATCH_ANY <unfinished ...>
[pid 666677] futex(0x556ebb47e028, FUTEX_WAKE_PRIVATE, 1) = 0
[pid 666677] epoll_ctl(3, EPOLL_CTL_MOD, 6, {events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLONESHOT, data={u32=0, u64=4294967296}}) = 0
[pid 666677] epoll_wait(3, [{events=EPOLLIN, data={u32=0, u64=4294967296}}], 1, -1) = 1
[pid 666677] readv(6, [{iov_base="\200\0\2$", iov_len=4}], 1) = 4
[pid 666677] readv(6, [{iov_base="\0\0\0\2\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\377\377\377\377\0\0xP"..., iov_len=548}], 1) = 548
[pid 666677] epoll_ctl(3, EPOLL_CTL_MOD, 6, {events=EPOLLIN|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLONESHOT, data={u32=0, u64=4294967296}}) = 0
[pid 666677] write(5, "[2024-06-22 18:55:12.099727 +000"..., 117) = 117
[pid 666677] write(2, "Launching heal operation to perf"..., 87Launching heal operation to perform full self heal on volume gv0 has been unsuccessful:) = 87
[pid 666677] write(2, "\n", 1
)          = 1
[pid 666677] write(2, "Self-heal daemon is not running."..., 65Self-heal daemon is not running. Check self-heal daemon log file.) = 65
[pid 666677] write(2, "\n", 1
)          = 1

ls /usr/libexec/glusterfs/gsyncd
ls: cannot access '/usr/libexec/glusterfs/gsyncd': No such file or directory

ls /usr/libexec/glusterfs/
glfsheal  glusterfind  mount-shared-storage.sh  peer_add_secret_pub

rpm -ql glusterfs-server |grep gsyncd
/etc/glusterfs/gsyncd.conf

dnf install glusterfs-geo-replication-11.1-1.el8.x86_64.rpm python3-gluster-11.1-1.el8.x86_64.rpm

strace -v -f gluster volume heal gv0 full

[pid 667093] write(1, "Launching heal operation to perf"..., 86Launching heal operation to perform full self heal on volume gv0 has been successful 
) = 86
[pid 667093] write(1, "Use heal info commands to check "..., 40Use heal info commands to check status.

zenonp avatar Jun 22 '24 19:06 zenonp

As per my understanding, the gsyncd --version check is done in the CLI only to determine whether geo-replication is installed, not to block any command. Please share /var/log/glusterfs/cli.log. I will check the code path for gluster volume heal <vol> full to see whether it blocks when geo-replication is not installed.

aravindavk avatar Jul 02 '24 05:07 aravindavk

In my setup,

# gluster volume heal vol1 full
Launching heal operation to perform full self heal on volume vol1 has been successful 
Use heal info commands to check status.

Log: /var/log/glusterfs/cli.log

[2024-07-02 05:15:44.918468 +0000] I [cli.c:788:main] 0-cli: Started running gluster with version 2024.05.29
[2024-07-02 05:15:44.922083 +0000] I [cli-cmd-volume.c:2063:cli_check_gsync_present] 0-: geo-replication not installed
[2024-07-02 05:15:44.922591 +0000] I [MSGID: 101188] [event-epoll.c:643:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=0}] 
[2024-07-02 05:15:44.922730 +0000] I [MSGID: 101188] [event-epoll.c:643:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=1}] 
[2024-07-02 05:15:44.924902 +0000] I [cli-rpc-ops.c:8106:gf_cli_heal_volume_cbk] 0-cli: Received resp to heal volume
[2024-07-02 05:15:44.924998 +0000] I [input.c:31:cli_batch] 0-: Exiting with: 0

aravindavk avatar Jul 02 '24 05:07 aravindavk

Github says "zenonp opened this issue last week" and I cannot for the life of me force it to show the actual date and time, which makes it difficult now to spot the relevant part of my cli.log. But rpm -qi glusterfs-geo-replication says "Install Date: Sat 22 Jun 2024 07:11:15 PM UTC", so I can go by that. Here follows the log for two attempts before and one after I installed the geo-replication rpm.

[2024-06-22 18:54:53.933797 +0000] I [cli.c:789:main] 0-cli: Started running gluster with version 11.1
[2024-06-22 18:54:53.936147 +0000] I [cli-cmd-volume.c:2063:cli_check_gsync_present] 0-: geo-replication not installed
[2024-06-22 18:54:53.936381 +0000] I [MSGID: 101188] [event-epoll.c:644:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=0}] 
[2024-06-22 18:54:53.936434 +0000] I [MSGID: 101188] [event-epoll.c:644:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=1}] 
[2024-06-22 18:54:53.937809 +0000] I [cli-rpc-ops.c:8106:gf_cli_heal_volume_cbk] 0-cli: Received resp to heal volume
[2024-06-22 18:54:53.937858 +0000] I [input.c:31:cli_batch] 0-: Exiting with: -1
[2024-06-22 18:55:12.086329 +0000] I [cli.c:789:main] 0-cli: Started running gluster with version 11.1
[2024-06-22 18:55:12.097238 +0000] I [cli-cmd-volume.c:2063:cli_check_gsync_present] 0-: geo-replication not installed
[2024-06-22 18:55:12.097786 +0000] I [MSGID: 101188] [event-epoll.c:644:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=0}] 
[2024-06-22 18:55:12.098039 +0000] I [MSGID: 101188] [event-epoll.c:644:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=1}] 
[2024-06-22 18:55:12.099727 +0000] I [cli-rpc-ops.c:8106:gf_cli_heal_volume_cbk] 0-cli: Received resp to heal volume
[2024-06-22 18:55:12.099997 +0000] I [input.c:31:cli_batch] 0-: Exiting with: -1
[2024-06-22 19:12:01.417516 +0000] I [cli.c:789:main] 0-cli: Started running gluster with version 11.1
[2024-06-22 19:12:01.595271 +0000] I [MSGID: 101188] [event-epoll.c:644:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=0}] 
[2024-06-22 19:12:01.595766 +0000] I [MSGID: 101188] [event-epoll.c:644:event_dispatch_epoll_worker] 0-epoll: Started thread with index [{index=1}] 
[2024-06-22 19:12:01.602504 +0000] I [cli-rpc-ops.c:8106:gf_cli_heal_volume_cbk] 0-cli: Received resp to heal volume
[2024-06-22 19:12:01.602718 +0000] I [input.c:31:cli_batch] 0-: Exiting with: 0

Note: I never had this problem before. It first appeared after I upgraded from 10.3 to 11.1. If there is a bug, it was introduced somewhere between those two versions.

zenonp avatar Jul 02 '24 09:07 zenonp

Thanks for the logs. I will check the code and update here.

aravindavk avatar Jul 02 '24 13:07 aravindavk