pcp icon indicating copy to clipboard operation
pcp copied to clipboard

segfault in pmproxy redisProcessCallbacks / __redisRunCallback / cluster_async_data_free / command_destroy / hi_free

Open goodwinos opened this issue 4 years ago • 2 comments

pmproxy segfaulted ~ 3 mins after pmlogger_daily started. System was running 201 pmloggers at the time, processing tens of thousands of redis requests/second. THis same configuration worked in yesterdays log-roll. The only difference is the additional pmlogger archives over the 24h period.

           %cpu    maxrss    backlog  inflight    reqs/s    resp/s      wait  req err  resp err  changed  throttled
           ms/s     Kbyte       byte     count   count/s   count/s       s/s  count/s   count/s  count/s    count/s
00:07:58   22.4   1543320          0         0   38778.2   38778.2       690        0         0     20.1       31.3
00:08:28   22.5   1543320          0         0   40059.0   40059.0       729        0         0     20.1       31.4
00:08:58   23.1   1543320          0         0   40585.6   40585.6       722        0         0     20.1       31.2
00:09:28   22.2   1543320          0         0   39081.0   39081.0       720        0         0     20.1       31.3
00:09:58   23.4   1543320     275785      1672   40913.5   40857.7      2487        0         0     20.1       30.5
00:10:28   22.1   1543320          0         0   36514.0   36569.8      6422        0         0     20.2      807.4
00:10:58   25.7   1543320     556897      3332   45280.7   45169.7      1024        0         0     19.9     1234.3
00:11:28   31.4   1543320  114549193    806651   66516.6   39739.2     10906        0         0     19.7     4786.3
00:11:58   50.5   1543320          0         0   85728.4  112617.6    177824        0         0     17.8     4846.2
00:12:28   13.4   1543320   90062395    634284   29747.8    8605.7       941        0         0      4.8     1171.8
00:12:58    0.0   1543320   90062395    634284       0.0       0.0         0        0         0      0.0        0.0

Looks like a double free or use-after-free or heap corruption, see below. I saved the entire /var/log/pcp/pmlogger directory just prior to pmlogger_daily starting, so might be able to repro this.

Thread 1 "pmproxy" received signal SIGSEGV, Segmentation fault.
0x00007f4542a14402 in _int_free () from /lib64/libc.so.6
(gdb) where
#0  0x00007f4542a14402 in _int_free () from /lib64/libc.so.6
#1  0x00007f454369e7b9 in hi_free (ptr=<optimized out>) at deps/hiredis/alloc.h:74
#2  command_destroy (command=0x561f4a9cdb60) at deps/hiredis-cluster/command.c:1308
#3  0x00007f454369fb3d in cluster_async_data_free (cad=0x561f46a66ef0) at deps/hiredis-cluster/hircluster.c:3701
#4  0x00007f4543695f55 in __redisRunCallback (cb=0x7ffd4fb72000, cb=0x7ffd4fb72000, reply=<optimized out>, 
    ac=0x561f0f7c8840) at deps/hiredis/async.c:287
#5  redisProcessCallbacks (ac=0x561f0f7c8840) at deps/hiredis/async.c:572
#6  0x00007f454368b689 in redisLibuvPoll (handle=<optimized out>, status=<optimized out>, events=<optimized out>)
    at deps/hiredis/adapters/libuv.h:21
#7  0x00007f454344c733 in uv__io_poll (loop=loop@entry=0x7f4543659240 <default_loop_struct>, 
    timeout=<optimized out>) at src/unix/linux-core.c:462
#8  0x00007f454343b7d4 in uv_run (loop=0x7f4543659240 <default_loop_struct>, mode=mode@entry=UV_RUN_DEFAULT)
    at src/unix/core.c:385
#9  0x0000561f0deb94f2 in main_loop (arg=0x561f0f7bcc90) at server.c:917
#10 0x0000561f0deb8aa6 in main (argc=2, argv=0x7ffd4fb76558) at pmproxy.c:467
(gdb) up
#1  0x00007f454369e7b9 in hi_free (ptr=<optimized out>) at deps/hiredis/alloc.h:74
74	    hiredisAllocFns.freeFn(ptr);
(gdb) p ptr
$1 = <optimized out>
(gdb) up
#2  command_destroy (command=0x561f4a9cdb60) at deps/hiredis-cluster/command.c:1308
1308	        hi_free(command->cmd);
(gdb) p *command
$2 = {id = 2146991442, result = CMD_PARSE_OK, errstr = 0x0, type = CMD_UNKNOWN, 
  cmd = 0x561f284be6c0 "\200\070\321:\037V", clen = 138, keys = 0x561f46a66e90, narg_start = 0x0, narg_end = 0x0, 
  narg = 0, quit = 0, noforward = 0, slot_num = -1, node_addr = 0x0, frag_seq = 0x0, reply = 0x0, 
  sub_commands = 0x0}
(gdb) p *command->keys
$3 = {nelem = 0, elem = 0x561f224b0130, size = 24, nalloc = 1}

goodwinos avatar Jul 07 '21 01:07 goodwinos

stopped all pmloggers and pmproxy. Restored the pre-pmlogger_daily pmlogger directory, then restarted all pmloggers. Once they were all started, manually ran pmproxy under valgrind. Then removed pmlogger_daily.stamp and manually started the pmlogger_daily service. After an hour or so, valgrind reported an invalid write:

Log for pmproxy on SOMEHOST started Wed Jul  7 04:02:01 2021

pmproxy: PID = 1590164, PDU version = 2, user = pcp (0)
pmproxy request port(s):
  sts fd   port  family address
  === ==== ===== ====== =======
  ok    10       unix   /run/pcp/pmproxy.socket
  ok    11 44322 inet   INADDR_ANY
  ok    15 44323 inet   INADDR_ANY
[Wed Jul  7 04:02:01] pmproxy(1590164) Info: OpenSSL 1.1.1k  FIPS 25 Mar 2021 - no certificates found
[Wed Jul  7 04:02:02] pmproxy(1590164) Info: Redis slots, command keys, schema version setup
==1590164== Invalid write of size 8
==1590164==    at 0x5A67CBA: uv_fs_event_init (linux-inotify.c:244)
==1590164==    by 0x5816E41: pmDiscoverMonitor.constprop.15 (discover.c:524)
==1590164==    by 0x5818B63: created_callback (discover.c:638)
==1590164==    by 0x5818B63: created_callback (discover.c:622)
==1590164==    by 0x5818B63: pmDiscoverTraverse (discover.c:195)
==1590164==    by 0x5818B63: changed_callback (discover.c:1538)
==1590164==    by 0x5816845: fs_change_callBack (discover.c:490)
==1590164==    by 0x5A67B99: uv__inotify_read (linux-inotify.c:233)
==1590164==    by 0x5A66732: uv__io_poll (linux-core.c:462)
==1590164==    by 0x5A557D3: uv_run (core.c:385)
==1590164==    by 0x1124F1: main_loop (server.c:917)
==1590164==    by 0x111AA5: main (pmproxy.c:467)
==1590164==  Address 0xc9abcf0 is 32 bytes inside an unallocated block of size 48 in arena "client"
==1590164== 

That's here :

500 pmDiscoverMonitor(sds path, void (*callback)(pmDiscover *))
501 {
502     discoverModuleData  *data;
503     pmDiscover          *p;
504     sds                 eventfilename;
505 
506     if ((p = pmDiscoverLookup(path)) == NULL) {
507         if (pmDebugOptions.discovery) {
508             fprintf(stderr, "pmDiscoverMonitor: lookup failed for %s\n", path);
509         }
510         return -ESRCH;
511     }
512     data = getDiscoverModuleData(p->module);
513 
514     /* save the discovery callback to be invoked */
515     p->changed = callback;
516 
517     /* filesystem event request buffer */
518     if ((p->event_handle = malloc(sizeof(uv_fs_event_t))) != NULL) {
519         /*
520          * Start monitoring, using given uv loop. Up to the caller to create
521          * a PCP PMAPI context and to fetch/logtail in the changed callback.
522          */
523         eventfilename = sdsnew(p->context.name);
524         uv_fs_event_init(data->events, p->event_handle);

and in libuv:

243 int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle) {
244   uv__handle_init(loop, (uv_handle_t*)handle, UV_FS_EVENT);
245   return 0;
246 }

#define uv__handle_init(loop_, h, type_)                                      \
  do {                                                                        \
    (h)->loop = (loop_);                                                      \
    (h)->type = (type_);                                                      \
    (h)->flags = UV_HANDLE_REF;  /* Ref the loop when active. */              \
    QUEUE_INSERT_TAIL(&(loop_)->handle_queue, &(h)->handle_queue);            \
    uv__handle_platform_init(h);                                              \
  }                                                                           \
  while (0)

goodwinos avatar Jul 07 '21 05:07 goodwinos

Clearing assignment - cant repro without large pmlogger farm deployment

goodwinos avatar Mar 10 '22 05:03 goodwinos