segfault in pmproxy redisProcessCallbacks / __redisRunCallback / cluster_async_data_free / command_destroy / hi_free
pmproxy segfaulted ~3 minutes after pmlogger_daily started. The system was running 201 pmloggers at the time, processing tens of thousands of Redis requests per second. This same configuration worked in yesterday's log roll; the only difference is the additional pmlogger archives accumulated over the 24h period.
time      %cpu   maxrss    backlog     inflight  reqs/s    resp/s    wait     req err  resp err  changed  throttled
          ms/s   Kbyte     byte        count     count/s   count/s   s/s      count/s  count/s   count/s  count/s
00:07:58 22.4 1543320 0 0 38778.2 38778.2 690 0 0 20.1 31.3
00:08:28 22.5 1543320 0 0 40059.0 40059.0 729 0 0 20.1 31.4
00:08:58 23.1 1543320 0 0 40585.6 40585.6 722 0 0 20.1 31.2
00:09:28 22.2 1543320 0 0 39081.0 39081.0 720 0 0 20.1 31.3
00:09:58 23.4 1543320 275785 1672 40913.5 40857.7 2487 0 0 20.1 30.5
00:10:28 22.1 1543320 0 0 36514.0 36569.8 6422 0 0 20.2 807.4
00:10:58 25.7 1543320 556897 3332 45280.7 45169.7 1024 0 0 19.9 1234.3
00:11:28 31.4 1543320 114549193 806651 66516.6 39739.2 10906 0 0 19.7 4786.3
00:11:58 50.5 1543320 0 0 85728.4 112617.6 177824 0 0 17.8 4846.2
00:12:28 13.4 1543320 90062395 634284 29747.8 8605.7 941 0 0 4.8 1171.8
00:12:58 0.0 1543320 90062395 634284 0.0 0.0 0 0 0 0.0 0.0
Looks like a double free, use-after-free, or heap corruption; see below. I saved the entire /var/log/pcp/pmlogger directory just prior to pmlogger_daily starting, so I might be able to reproduce this.
Thread 1 "pmproxy" received signal SIGSEGV, Segmentation fault.
0x00007f4542a14402 in _int_free () from /lib64/libc.so.6
(gdb) where
#0 0x00007f4542a14402 in _int_free () from /lib64/libc.so.6
#1 0x00007f454369e7b9 in hi_free (ptr=<optimized out>) at deps/hiredis/alloc.h:74
#2 command_destroy (command=0x561f4a9cdb60) at deps/hiredis-cluster/command.c:1308
#3 0x00007f454369fb3d in cluster_async_data_free (cad=0x561f46a66ef0) at deps/hiredis-cluster/hircluster.c:3701
#4 0x00007f4543695f55 in __redisRunCallback (cb=0x7ffd4fb72000, cb=0x7ffd4fb72000, reply=<optimized out>,
ac=0x561f0f7c8840) at deps/hiredis/async.c:287
#5 redisProcessCallbacks (ac=0x561f0f7c8840) at deps/hiredis/async.c:572
#6 0x00007f454368b689 in redisLibuvPoll (handle=<optimized out>, status=<optimized out>, events=<optimized out>)
at deps/hiredis/adapters/libuv.h:21
#7 0x00007f454344c733 in uv__io_poll (loop=loop@entry=0x7f4543659240 <default_loop_struct>,
timeout=<optimized out>) at src/unix/linux-core.c:462
#8 0x00007f454343b7d4 in uv_run (loop=0x7f4543659240 <default_loop_struct>, mode=mode@entry=UV_RUN_DEFAULT)
at src/unix/core.c:385
#9 0x0000561f0deb94f2 in main_loop (arg=0x561f0f7bcc90) at server.c:917
#10 0x0000561f0deb8aa6 in main (argc=2, argv=0x7ffd4fb76558) at pmproxy.c:467
(gdb) up
#1 0x00007f454369e7b9 in hi_free (ptr=<optimized out>) at deps/hiredis/alloc.h:74
74 hiredisAllocFns.freeFn(ptr);
(gdb) p ptr
$1 = <optimized out>
(gdb) up
#2 command_destroy (command=0x561f4a9cdb60) at deps/hiredis-cluster/command.c:1308
1308 hi_free(command->cmd);
(gdb) p *command
$2 = {id = 2146991442, result = CMD_PARSE_OK, errstr = 0x0, type = CMD_UNKNOWN,
cmd = 0x561f284be6c0 "\200\070\321:\037V", clen = 138, keys = 0x561f46a66e90, narg_start = 0x0, narg_end = 0x0,
narg = 0, quit = 0, noforward = 0, slot_num = -1, node_addr = 0x0, frag_seq = 0x0, reply = 0x0,
sub_commands = 0x0}
(gdb) p *command->keys
$3 = {nelem = 0, elem = 0x561f224b0130, size = 24, nalloc = 1}
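Note that command->cmd above no longer looks like a RESP-formatted request (it holds "\200\070\321:\037V" yet clen is 138), which is consistent with the buffer, or the whole command struct, having already been freed and its memory recycled before command_destroy() ran. A minimal sketch of that failure mode follows; cmd_like and cmd_like_destroy are hypothetical stand-ins for hiredis-cluster's struct cmd and command_destroy(), not the real code:

#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for hiredis-cluster's struct cmd. */
struct cmd_like {
    char  *cmd;         /* formatted request buffer, owned by the struct */
    size_t clen;
};

static void cmd_like_destroy(struct cmd_like *c)
{
    free(c->cmd);       /* first call releases the buffer ... */
    free(c);            /* ... and then the struct itself     */
}

int main(void)
{
    struct cmd_like *c = calloc(1, sizeof(*c));
    c->cmd  = strdup("GET key");
    c->clen = strlen(c->cmd);

    cmd_like_destroy(c);
    /*
     * If the async cleanup path runs twice for the same command (for
     * example, the same cluster_async_data reached from two callbacks),
     * the second call frees already-freed memory.  In this minimal form
     * glibc detects the double free and aborts; in a busy pmproxy the
     * chunk has typically been recycled in between, so the stale
     * pointers instead corrupt heap metadata and crash inside
     * _int_free(), as in the backtrace above.
     */
    cmd_like_destroy(c);        /* <- the suspected double free */
    return 0;
}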
Stopped all pmloggers and pmproxy, restored the pre-pmlogger_daily pmlogger directory, then restarted all pmloggers. Once they were all started, manually ran pmproxy under valgrind, then removed pmlogger_daily.stamp and manually started the pmlogger_daily service. After an hour or so, valgrind reported an invalid write:
Log for pmproxy on SOMEHOST started Wed Jul 7 04:02:01 2021
pmproxy: PID = 1590164, PDU version = 2, user = pcp (0)
pmproxy request port(s):
sts fd port family address
=== ==== ===== ====== =======
ok 10 unix /run/pcp/pmproxy.socket
ok 11 44322 inet INADDR_ANY
ok 15 44323 inet INADDR_ANY
[Wed Jul 7 04:02:01] pmproxy(1590164) Info: OpenSSL 1.1.1k FIPS 25 Mar 2021 - no certificates found
[Wed Jul 7 04:02:02] pmproxy(1590164) Info: Redis slots, command keys, schema version setup
==1590164== Invalid write of size 8
==1590164== at 0x5A67CBA: uv_fs_event_init (linux-inotify.c:244)
==1590164== by 0x5816E41: pmDiscoverMonitor.constprop.15 (discover.c:524)
==1590164== by 0x5818B63: created_callback (discover.c:638)
==1590164== by 0x5818B63: created_callback (discover.c:622)
==1590164== by 0x5818B63: pmDiscoverTraverse (discover.c:195)
==1590164== by 0x5818B63: changed_callback (discover.c:1538)
==1590164== by 0x5816845: fs_change_callBack (discover.c:490)
==1590164== by 0x5A67B99: uv__inotify_read (linux-inotify.c:233)
==1590164== by 0x5A66732: uv__io_poll (linux-core.c:462)
==1590164== by 0x5A557D3: uv_run (core.c:385)
==1590164== by 0x1124F1: main_loop (server.c:917)
==1590164== by 0x111AA5: main (pmproxy.c:467)
==1590164== Address 0xc9abcf0 is 32 bytes inside an unallocated block of size 48 in arena "client"
==1590164==
That's here:
500 pmDiscoverMonitor(sds path, void (*callback)(pmDiscover *))
501 {
502 discoverModuleData *data;
503 pmDiscover *p;
504 sds eventfilename;
505
506 if ((p = pmDiscoverLookup(path)) == NULL) {
507 if (pmDebugOptions.discovery) {
508 fprintf(stderr, "pmDiscoverMonitor: lookup failed for %s\n", path);
509 }
510 return -ESRCH;
511 }
512 data = getDiscoverModuleData(p->module);
513
514 /* save the discovery callback to be invoked */
515 p->changed = callback;
516
517 /* filesystem event request buffer */
518 if ((p->event_handle = malloc(sizeof(uv_fs_event_t))) != NULL) {
519 /*
520 * Start monitoring, using given uv loop. Up to the caller to create
521 * a PCP PMAPI context and to fetch/logtail in the changed callback.
522 */
523 eventfilename = sdsnew(p->context.name);
524 uv_fs_event_init(data->events, p->event_handle);
and in libuv:
243 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
244 uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
245 return 0;
246 }
#define uv__handle_init(loop_, h, type_) \
do { \
(h)->loop = (loop_); \
(h)->type = (type_); \
(h)->flags = UV_HANDLE_REF; /* Ref the loop when active. */ \
QUEUE_INSERT_TAIL(&(loop_)->handle_queue, &(h)->handle_queue); \
uv__handle_platform_init(h); \
} \
while (0)
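The invalid write of size 8 is uv__handle_init() storing into the handle and QUEUE_INSERT_TAIL() linking it into loop->handle_queue, yet valgrind says the target address sits inside an already-unallocated block. Since p->event_handle was malloc'd just above, that suggests the queue neighbour it links against, i.e. an earlier handle still present in loop->handle_queue, may have been freed out from under libuv. For context, a minimal sketch of the lifecycle libuv expects follows - assumed usage, not the PCP fix; on_change/on_close are hypothetical names. An initialized handle must stay allocated while it is linked into the loop, and must be released through uv_close() rather than a bare free():

#include <stdlib.h>
#include <uv.h>

static void on_change(uv_fs_event_t *handle, const char *filename,
                      int events, int status)
{
    (void)handle; (void)filename; (void)events; (void)status;
}

static void on_close(uv_handle_t *handle)
{
    free(handle);                  /* only safe once libuv has unlinked it */
}

int main(void)
{
    uv_loop_t *loop = uv_default_loop();
    uv_fs_event_t *ev = malloc(sizeof(*ev));

    uv_fs_event_init(loop, ev);    /* links ev into loop->handle_queue */
    uv_fs_event_start(ev, on_change, "/var/log/pcp/pmlogger", 0);

    uv_run(loop, UV_RUN_NOWAIT);   /* ... the event loop runs elsewhere ... */

    uv_fs_event_stop(ev);
    uv_close((uv_handle_t *)ev, on_close);   /* not free(ev) directly */
    uv_run(loop, UV_RUN_DEFAULT);  /* drain so the close callback fires */
    return 0;
}

A bare free() of a handle that is still linked leaves dangling pointers in the queue, and a later uv_fs_event_init() then writes through them into recycled memory, which is the pattern valgrind flags above.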
Clearing assignment - can't reproduce this without a large pmlogger farm deployment.