UnifyFS
UnifyFS copied to clipboard
UnifyFS client times out at RPC in unifyfs_dispatch_transfer (MOVE mode) and in unifyfs_finalize
System information
Lassen Machine link
Describe the problem you're observing
The code hangs and then times out in two UnifyFS calls: unifyfs_dispatch_transfer and unifyfs_finalize.
For unifyfs_dispatch_transfer, the call hangs with UNIFYFS_TRANSFER_MODE_MOVE but works with UNIFYFS_TRANSFER_MODE_COPY.
For unifyfs_finalize, the call times out at the RPC.
Describe how to reproduce the problem
unifyfs_handle fshdl;
options_ct = 4;
unifyfs_cfg_option *options = static_cast<unifyfs_cfg_option *>(
calloc(options_ct, sizeof(unifyfs_cfg_option)));
options[0] = {.opt_name = "logio.spill_dir", .opt_value = logio_spill_dir};
options[1] = {.opt_name = "logio.spill_size",
.opt_value = logio_spill_size};
options[2] = {.opt_name = "logio.shmem_size",
.opt_value = logio_shmem_size};
options[3] = {.opt_name = "logio.chunk_size",
.opt_value = logio_chunk_size};
int rc = unifyfs_initialize(info.unifyfs_path.c_str(), options, options_ct, fshdl);
REQUIRE(rc == UNIFYFS_SUCCESS);
// Create the target file inside the UnifyFS namespace and obtain its global
// file id (gfid), which the later I/O requests refer to.
fs::path unifyfs_filename = info.unifyfs_path / filename;
// Start from the invalid sentinel so the gfid check below can only pass when
// unifyfs_create() really stored an id.
unifyfs_gfid gfid = UNIFYFS_INVALID_GFID;
int rc = UNIFYFS_SUCCESS;
int create_flags = 0;
// Only the create call itself is charged to the open timer.
open_time.resumeTime();
rc = unifyfs_create(fshdl, create_flags, unifyfs_filename.c_str(), &gfid);
open_time.pauseTime();
// NOTE(review): strerror() only describes errno values; a UnifyFS-specific
// return code would print as an unknown error -- confirm which is wanted.
INFO("unifyfs rc " << strerror(rc));
REQUIRE(rc == UNIFYFS_SUCCESS);
REQUIRE(gfid != UNIFYFS_INVALID_GFID);
if (info.rank == 0) INFO("Writing data");
/* Write data to file */
auto write_data =
std::vector<char>(args.request_size * args.iteration, 'w');
size_t write_req_ct = args.iteration + 1;
unifyfs_io_request write_req[write_req_ct];
for (size_t i = 0; i < args.iteration; ++i) {
write_req[i].op = UNIFYFS_IOREQ_OP_WRITE;
write_req[i].gfid = gfid;
write_req[i].nbytes = args.request_size;
off_t base_offset = 0;
if (args.file_sharing == tt::FileSharing::SHARED_FILE) {
base_offset = (off_t)info.rank * args.request_size * args.iteration;
}
off_t relative_offset = i * args.request_size;
write_req[i].offset = base_offset + relative_offset;
write_req[i].user_buf = write_data.data() + (i * args.request_size);
}
write_req[args.iteration].op = UNIFYFS_IOREQ_OP_SYNC_META;
write_req[args.iteration].gfid = gfid;
rc = unifyfs_dispatch_io(fshdl, write_req_ct, write_req);
if (rc == UNIFYFS_SUCCESS) {
int waitall = 1;
rc = unifyfs_wait_io(fshdl, write_req_ct, write_req, waitall);
if (rc == UNIFYFS_SUCCESS) {
for (size_t i = 0; i < args.iteration; i++) {
REQUIRE(write_req[i].result.error == 0);
REQUIRE(write_req[i].result.count == args.request_size);
}
REQUIRE(write_req[args.iteration].result.error == 0);
}
}
MPI_Barrier(MPI_COMM_WORLD);
if (info.rank == 0) PRINT_MSG("Finished Writing", "");
if (info.rank == 0) INFO("Flushing data");
unifyfs_transfer_request mv_req;
mv_req.src_path = unifyfs_filename.c_str();
mv_req.dst_path = full_filename_path.c_str();
mv_req.mode = UNIFYFS_TRANSFER_MODE_MOVE;
mv_req.use_parallel = 1;
rc = unifyfs_dispatch_transfer(fshdl, 1, &mv_req);
REQUIRE(rc == UNIFYFS_SUCCESS);
if (rc == UNIFYFS_SUCCESS) {
int waitall = 1;
rc = unifyfs_wait_transfer(fshdl, 1, &mv_req, waitall);
if (rc == UNIFYFS_SUCCESS) {
for (int i = 0; i < (int)1; i++) {
REQUIRE(mv_req.result.error == 0);
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
rc = unifyfs_finalize(fshdl);