ucx icon indicating copy to clipboard operation
ucx copied to clipboard

Build fails on missing mlx5dv functions

Open alex--m opened this issue 3 years ago • 0 comments

Describe the bug

The UCX master branch fails to build with the following errors:

  CC       rc/accel/libuct_ib_la-rc_mlx5_devx.lo
rc/accel/rc_mlx5_devx.c: In function ‘uct_rc_mlx5_devx_iface_subscribe_event’:
rc/accel/rc_mlx5_devx.c:33:11: error: implicit declaration of function ‘mlx5dv_devx_subscribe_devx_event’; did you mean ‘mlx5dv_devx_query_eqn’? [-Werror=implicit-function-declaration]
     ret = mlx5dv_devx_subscribe_devx_event(event_channel, obj, sizeof(event),
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
           mlx5dv_devx_query_eqn
rc/accel/rc_mlx5_devx.c:33:11: error: nested extern declaration of ‘mlx5dv_devx_subscribe_devx_event’ [-Werror=nested-externs]
rc/accel/rc_mlx5_devx.c: In function ‘uct_rc_mlx5_devx_iface_event_handler’:
rc/accel/rc_mlx5_devx.c:50:40: error: storage size of ‘devx_event’ isn’t known
     struct mlx5dv_devx_async_event_hdr devx_event;
                                        ^~~~~~~~~~
rc/accel/rc_mlx5_devx.c:54:11: error: implicit declaration of function ‘mlx5dv_devx_get_event’; did you mean ‘mlx5dv_devx_wq_query’? [-Werror=implicit-function-declaration]
     ret = mlx5dv_devx_get_event(iface->event_channel, &devx_event, sizeof(devx_event));
           ^~~~~~~~~~~~~~~~~~~~~
           mlx5dv_devx_wq_query
rc/accel/rc_mlx5_devx.c:54:11: error: nested extern declaration of ‘mlx5dv_devx_get_event’ [-Werror=nested-externs]
rc/accel/rc_mlx5_devx.c:50:40: error: unused variable ‘devx_event’ [-Werror=unused-variable]
     struct mlx5dv_devx_async_event_hdr devx_event;
                                        ^~~~~~~~~~
rc/accel/rc_mlx5_devx.c: In function ‘uct_rc_mlx5_iface_devx_pre_arm’:
rc/accel/rc_mlx5_devx.c:81:40: error: storage size of ‘event’ isn’t known
     struct mlx5dv_devx_async_event_hdr event;
                                        ^~~~~
rc/accel/rc_mlx5_devx.c:81:40: error: unused variable ‘event’ [-Werror=unused-variable]
rc/accel/rc_mlx5_devx.c: In function ‘uct_rc_mlx5_devx_create_event_channel’:
rc/accel/rc_mlx5_devx.c:141:21: error: implicit declaration of function ‘mlx5dv_devx_create_event_channel’; did you mean ‘uct_rc_mlx5_devx_create_event_channel’? [-Werror=implicit-function-declaration]
     event_channel = mlx5dv_devx_create_event_channel(
                     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                     uct_rc_mlx5_devx_create_event_channel
rc/accel/rc_mlx5_devx.c:141:21: error: nested extern declaration of ‘mlx5dv_devx_create_event_channel’ [-Werror=nested-externs]
rc/accel/rc_mlx5_devx.c:143:13: error: ‘MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA’ undeclared (first use in this function); did you mean ‘MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA’?
             MLX5_IB_UAPI_DEVX_CR_EV_CH_FLAGS_OMIT_DATA);
             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
rc/accel/rc_mlx5_devx.c:143:13: note: each undeclared identifier is reported only once for each function it appears in
rc/accel/rc_mlx5_devx.c:151:47: error: dereferencing pointer to incomplete type ‘struct mlx5dv_devx_event_channel’
     status = ucs_sys_fcntl_modfl(event_channel->fd, O_NONBLOCK, 0);
                                               ^~
rc/accel/rc_mlx5_devx.c:160:5: error: implicit declaration of function ‘mlx5dv_devx_destroy_event_channel’; did you mean ‘uct_rc_mlx5_devx_create_event_channel’? [-Werror=implicit-function-declaration]
     mlx5dv_devx_destroy_event_channel(event_channel);
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     uct_rc_mlx5_devx_create_event_channel
rc/accel/rc_mlx5_devx.c:160:5: error: nested extern declaration of ‘mlx5dv_devx_destroy_event_channel’ [-Werror=nested-externs]
cc1: all warnings being treated as errors

It looks like UCX uses mlx5dv_devx_subscribe_devx_event (and the related DevX event-channel symbols) even though configure did not detect it on the host:

configure:23878: checking for mlx5dv_query_device in -lmlx5
configure:23903: gcc -o conftest     -I/usr/include  -libverbs  conftest.c -lmlx5 -libverbs -lpthread -lrt -lrt  >&5
configure:23903: $? = 0
configure:23912: result: yes
configure:23928: checking for infiniband/mlx5dv.h
configure:23928: gcc -c     -I/usr/include conftest.c >&5
configure:23928: $? = 0
configure:23928: result: yes
configure:23941: checking whether mlx5dv_init_obj is declared
configure:23941: gcc -c     -I/usr/include conftest.c >&5
configure:23941: $? = 0
configure:23941: result: yes
configure:23957: checking whether mlx5dv_create_qp is declared
configure:23957: gcc -c     -I/usr/include conftest.c >&5
configure:23957: $? = 0
configure:23957: result: yes
configure:23973: checking whether mlx5dv_is_supported is declared
configure:23973: gcc -c     -I/usr/include conftest.c >&5
configure:23973: $? = 0
configure:23973: result: yes
configure:23989: checking whether mlx5dv_devx_subscribe_devx_event is declared
configure:23989: gcc -c     -I/usr/include conftest.c >&5
conftest.c: In function 'main':
conftest.c:145:10: error: 'mlx5dv_devx_subscribe_devx_event' undeclared (first use in this function); did you mean 'mlx5dv_devx_query_eqn'?
   (void) mlx5dv_devx_subscribe_devx_event;
          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
          mlx5dv_devx_query_eqn
conftest.c:145:10: note: each undeclared identifier is reported only once for each function it appears in
configure:23989: $? = 1
configure: failed program was:
| /* confdefs.h */
| #define PACKAGE_NAME "ucx"
| #define PACKAGE_TARNAME "ucx"
| #define PACKAGE_VERSION "1.14"
| #define PACKAGE_STRING "ucx 1.14"
| #define PACKAGE_BUGREPORT ""

...

| #define HAVE_INFINIBAND_MLX5DV_H 1
| #define HAVE_DECL_MLX5DV_INIT_OBJ 1
| #define HAVE_DECL_MLX5DV_CREATE_QP 1
| #define HAVE_DECL_MLX5DV_IS_SUPPORTED 1
| /* end confdefs.h.  */
| #include <infiniband/mlx5dv.h>
| 
| int
| main ()
| {
| #ifndef mlx5dv_devx_subscribe_devx_event
| #ifdef __cplusplus
|   (void) mlx5dv_devx_subscribe_devx_event;
| #else
|   (void) mlx5dv_devx_subscribe_devx_event;
| #endif
| #endif
| 
|   ;
|   return 0;
| }
configure:23989: result: no
configure:24125: checking whether MLX5DV_CONTEXT_FLAGS_DEVX is declared
configure:24125: gcc -c     -I/usr/include conftest.c >&5
configure:24125: $? = 0
configure:24125: result: yes
configure:24164: checking whether IBV_LINK_LAYER_INFINIBAND is declared
configure:24164: gcc -c     -I/usr/include conftest.c >&5
configure:24164: $? = 0
configure:24164: result: yes
...

Setup and versions

#> cat /etc/debian_version 
10.10
#> apt show rdma-core
Package: rdma-core
Version: 22.1-1
...

Note: the server where this happens can be accessed remotely (contact me for details).

alex--m avatar Aug 14 '22 18:08 alex--m