From cc4bf624511d47e018631bc42c19645cb6a06955 Mon Sep 17 00:00:00 2001 From: e-ago Date: Mon, 2 Jul 2018 12:57:41 +0200 Subject: [PATCH 1/5] mp_register() extended with implicit ODP --- benchmarks/mp_pingpong_all.cu | 4 +- benchmarks/mp_pingpong_kernel.cu | 4 +- benchmarks/mp_pingpong_kernel_stream.cu | 4 +- .../mp_pingpong_kernel_stream_latency.cu | 4 +- .../mp_pingpong_kernel_stream_latency_mpi.cu | 4 +- benchmarks/mp_pingpong_kernel_stream_mpi.cu | 4 +- .../mp_pingpong_kernel_stream_wait_send.cu | 4 +- .../mp_producer_consumer_kernel_stream.cu | 4 +- benchmarks/mp_sendrecv_kernel_stream.cu | 4 +- comm_library/comm.cpp | 29 +++- comm_library/comm.h | 1 + examples/mp_putget.c | 16 +- examples/mp_sendrecv.c | 2 +- examples/mp_sendrecv_kernel.cu | 4 +- examples/mp_sendrecv_stream.c | 4 +- include/mp.h | 2 +- src/mp.c | 145 +++++++++++------- 17 files changed, 148 insertions(+), 91 deletions(-) diff --git a/benchmarks/mp_pingpong_all.cu b/benchmarks/mp_pingpong_all.cu index 89b86ba..6fad0b0 100644 --- a/benchmarks/mp_pingpong_all.cu +++ b/benchmarks/mp_pingpong_all.cu @@ -842,8 +842,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); } - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) fprintf(stdout, "%10d", size); diff --git a/benchmarks/mp_pingpong_kernel.cu b/benchmarks/mp_pingpong_kernel.cu index 6e1efd1..743aedf 100644 --- a/benchmarks/mp_pingpong_kernel.cu +++ b/benchmarks/mp_pingpong_kernel.cu @@ -232,8 +232,8 @@ double sr_exchange (MPI_Comm comm, int size, int iter_count, int validate, doubl CUDA_CHECK(cudaMalloc((void **)&rbuf_d, size*iter_count)); CUDA_CHECK(cudaMemset(rbuf_d, 0, size*iter_count)); - MP_CHECK(mp_register(sbuf_d, size*iter_count, &sreg)); - MP_CHECK(mp_register(rbuf_d, size*iter_count, &rreg)); + MP_CHECK(mp_register(sbuf_d, size*iter_count, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, size*iter_count, &rreg, 0)); if (validate) { mp_dbg_msg("initializing the buffer \n"); diff --git a/benchmarks/mp_pingpong_kernel_stream.cu b/benchmarks/mp_pingpong_kernel_stream.cu index 3251fbd..0b3b6ae 100644 --- a/benchmarks/mp_pingpong_kernel_stream.cu +++ b/benchmarks/mp_pingpong_kernel_stream.cu @@ -455,8 +455,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_pingpong_kernel_stream_latency.cu b/benchmarks/mp_pingpong_kernel_stream_latency.cu index 513d4fb..dd57020 100644 --- a/benchmarks/mp_pingpong_kernel_stream_latency.cu +++ b/benchmarks/mp_pingpong_kernel_stream_latency.cu @@ -539,8 +539,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) fprintf(stdout, "%10d", size); diff --git a/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu b/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu index 7a25633..7b2f2cb 100644 --- a/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu +++ b/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu @@ -719,8 +719,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); } - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) fprintf(stdout, "%10d", size); diff --git a/benchmarks/mp_pingpong_kernel_stream_mpi.cu b/benchmarks/mp_pingpong_kernel_stream_mpi.cu index f744b2c..93d32cd 100644 --- a/benchmarks/mp_pingpong_kernel_stream_mpi.cu +++ b/benchmarks/mp_pingpong_kernel_stream_mpi.cu @@ -641,8 +641,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); } - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (my_rank == 0) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_pingpong_kernel_stream_wait_send.cu b/benchmarks/mp_pingpong_kernel_stream_wait_send.cu index 3aa92d0..f32a960 100644 --- a/benchmarks/mp_pingpong_kernel_stream_wait_send.cu +++ b/benchmarks/mp_pingpong_kernel_stream_wait_send.cu @@ -388,8 +388,8 @@ int main (int c, char *v[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_producer_consumer_kernel_stream.cu b/benchmarks/mp_producer_consumer_kernel_stream.cu index 0c1e8ac..94754bd 100644 --- a/benchmarks/mp_producer_consumer_kernel_stream.cu +++ b/benchmarks/mp_producer_consumer_kernel_stream.cu @@ -373,8 +373,8 @@ int main (int c, char *v[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_sendrecv_kernel_stream.cu b/benchmarks/mp_sendrecv_kernel_stream.cu index 4f04ff2..c86864e 100644 --- a/benchmarks/mp_sendrecv_kernel_stream.cu +++ b/benchmarks/mp_sendrecv_kernel_stream.cu @@ -359,8 +359,8 @@ int main (int c, char *v[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/comm_library/comm.cpp b/comm_library/comm.cpp index 2c53573..ede5b80 100644 --- a/comm_library/comm.cpp +++ b/comm_library/comm.cpp @@ -246,11 +246,11 @@ int comm_init(MPI_Comm comm, int gpuId) iomb(); DBG("registering ready_table size=%zd\n", table_size); - MP_CHECK(mp_register(ready_table, table_size, &ready_table_reg)); + MP_CHECK(mp_register(ready_table, table_size, &ready_table_reg, 0)); DBG("creating ready_table window\n"); MP_CHECK(mp_window_create(ready_table, table_size, &ready_table_win)); DBG("registering remote_ready_table\n"); - MP_CHECK(mp_register(remote_ready_values, table_size, &remote_ready_values_reg)); + MP_CHECK(mp_register(remote_ready_values, table_size, &remote_ready_values_reg, 0)); comm_initialized = 1; @@ -706,7 +706,30 @@ int comm_register(void *buf, size_t size, comm_reg_t *creg) if (!*reg) { DBG("registering buffer %p\n", buf); - MP_CHECK(mp_register(buf, size, reg)); + MP_CHECK(mp_register(buf, size, reg, 0)); + } + +out: + return ret; +} + +int comm_register_odp(void *buf, size_t size, comm_reg_t *creg) +{ + assert(comm_initialized); + int ret = 0; + int retcode; + mp_reg_t *reg = (mp_reg_t*)creg; + assert(reg); + + if (!size) { + ret = -EINVAL; + comm_err("SIZE==0\n"); + goto out; + } + + if (!*reg) { + DBG("registering buffer %p\n", buf); + MP_CHECK(mp_register(buf, size, reg, IBV_EXP_ACCESS_ON_DEMAND)); } out: diff --git a/comm_library/comm.h b/comm_library/comm.h index eb76773..fdd1173 100644 --- a/comm_library/comm.h +++ b/comm_library/comm.h @@ -125,6 +125,7 @@ extern "C" { int comm_prepare_wait_all(int count, comm_request_t *creqs); comm_dev_descs_t comm_prepared_requests(); int comm_register(void *buf, size_t size, comm_reg_t *creg); + int comm_register_odp(void *buf, size_t size, comm_reg_t *creg); int comm_deregister(comm_reg_t *creg); int comm_select_device(int mpiRank); diff --git a/examples/mp_putget.c b/examples/mp_putget.c index bf43f38..85dbcfe 100644 --- a/examples/mp_putget.c +++ b/examples/mp_putget.c @@ -81,7 +81,7 @@ int put_exchange (MPI_Comm comm, int size, int iter_count, int window_size, int memset(buf_d, 0, buf_size); } - MP_CHECK(mp_register(buf_d, buf_size, ®)); + MP_CHECK(mp_register(buf_d, buf_size, ®, 0)); MP_CHECK(mp_window_create(buf_d, buf_size, &win)); @@ -189,8 +189,8 @@ int put_exchange_on_stream (MPI_Comm comm, int size, int iter_count, int window_ memset(buf_d, 0, buf_size); } - MP_CHECK(mp_register(buf_d, buf_size, ®)); - MP_CHECK(mp_register(signal, 4096, &signal_reg)); + MP_CHECK(mp_register(buf_d, buf_size, ®, 0)); + MP_CHECK(mp_register(signal, 4096, &signal_reg, 0)); MP_CHECK(mp_window_create(buf_d, buf_size, &win)); @@ -329,8 +329,8 @@ int put_desc_exchange_on_stream (MPI_Comm comm, int size, int iter_count, int wi mp_desc_queue_t dq = NULL; MP_CHECK(mp_desc_queue_alloc(&dq)); - MP_CHECK(mp_register(buf_d, buf_size, ®)); - MP_CHECK(mp_register(signal, 4096, &signal_reg)); + MP_CHECK(mp_register(buf_d, buf_size, ®, 0)); + MP_CHECK(mp_register(signal, 4096, &signal_reg, 0)); MP_CHECK(mp_window_create(buf_d, buf_size, &win)); @@ -478,8 +478,8 @@ int put_desc_nowait_exchange_on_stream (MPI_Comm comm, int size, int iter_count, mp_desc_queue_t dq = NULL; MP_CHECK(mp_desc_queue_alloc(&dq)); - MP_CHECK(mp_register(buf_d, buf_size, ®)); - MP_CHECK(mp_register(signal, 4096, &signal_reg)); + MP_CHECK(mp_register(buf_d, buf_size, ®, 0)); + MP_CHECK(mp_register(signal, 4096, &signal_reg, 0)); MP_CHECK(mp_window_create(buf_d, buf_size, &win)); @@ -616,7 +616,7 @@ int get_exchange (MPI_Comm comm, int size, int iter_count, int window_size, int memset(buf_d, 0, buf_size); } - MP_CHECK(mp_register(buf_d, buf_size, ®)); + MP_CHECK(mp_register(buf_d, buf_size, ®, 0)); MP_CHECK(mp_window_create(buf_d, buf_size, &win)); diff --git a/examples/mp_sendrecv.c b/examples/mp_sendrecv.c index 99a078e..48ef781 100644 --- a/examples/mp_sendrecv.c +++ b/examples/mp_sendrecv.c @@ -65,7 +65,7 @@ int sr_exchange (MPI_Comm comm, int size, int iter_count, int window_size, int v CUDA_CHECK(cudaMalloc((void **)&buf_d, buf_size)); CUDA_CHECK(cudaMemset(buf_d, 0, buf_size)); - MP_CHECK(mp_register(buf_d, buf_size, ®)); + MP_CHECK(mp_register(buf_d, buf_size, ®, 0)); dbg_msg("registered ptr: %p size: %zu\n", buf_d, buf_size); diff --git a/examples/mp_sendrecv_kernel.cu b/examples/mp_sendrecv_kernel.cu index eab467b..4c0a2cf 100644 --- a/examples/mp_sendrecv_kernel.cu +++ b/examples/mp_sendrecv_kernel.cu @@ -127,8 +127,8 @@ int sr_exchange (MPI_Comm comm, int size, int iter_count, int validate) CUDA_CHECK(cudaStreamCreate(&stream)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (validate) { CUDA_CHECK(cudaMemset(sbuf_d, (my_rank + 1), buf_size)); diff --git a/examples/mp_sendrecv_stream.c b/examples/mp_sendrecv_stream.c index c5dbe6e..b71c54b 100644 --- a/examples/mp_sendrecv_stream.c +++ b/examples/mp_sendrecv_stream.c @@ -75,8 +75,8 @@ int sr_exchange (MPI_Comm comm, int size, int iter_count, int validate) CUDA_CHECK(cudaStreamCreate(&stream)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (validate) { CUDA_CHECK(cudaMemset(sbuf_d, (my_rank + 1), buf_size)); diff --git a/include/mp.h b/include/mp.h index 872a5f2..8c946d8 100644 --- a/include/mp.h +++ b/include/mp.h @@ -85,7 +85,7 @@ enum mp_init_flags { int mp_init(MPI_Comm comm, int *peers, int count, int flags, int gpu_id); void mp_finalize(); -int mp_register(void *addr, size_t length, mp_reg_t *reg_t); +int mp_register(void *addr, size_t length, mp_reg_t *reg_t, int exp_flags); int mp_deregister(mp_reg_t *reg); /* diff --git a/src/mp.c b/src/mp.c index 6bc861d..926f7e6 100644 --- a/src/mp.c +++ b/src/mp.c @@ -739,80 +739,113 @@ static void check_cuda_ptr(void *addr, size_t length) } #endif // DADO_DEBUG -int mp_register(void *addr, size_t length, mp_reg_t *reg_) +int mp_register(void *addr, size_t length, mp_reg_t *reg_, int exp_flags) { - - //int myrank; - //MPI_Comm_rank (MPI_COMM_WORLD, &myrank); - - struct mp_reg *reg = calloc(1, sizeof(struct mp_reg)); - if (!reg) { + /*set SYNC MEMOPS if its device buffer*/ + unsigned int type, flag; + size_t size; + CUdeviceptr base; + CUresult curesult; + int flags; + + struct mp_reg *reg = calloc(1, sizeof(struct mp_reg)); + if (!reg) { mp_err_msg("malloc returned NULL while allocating struct mp_reg\n"); return MP_FAILURE; - } + } - /*set SYNC MEMOPS if its device buffer*/ - unsigned int type, flag; - size_t size; - CUdeviceptr base; - CUresult curesult; - int flags; - curesult = cuPointerGetAttribute((void *)&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr); - if ((curesult == CUDA_SUCCESS) && (type == CU_MEMORYTYPE_DEVICE)) { + curesult = cuPointerGetAttribute((void *)&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr); + if ((curesult == CUDA_SUCCESS) && (type == CU_MEMORYTYPE_DEVICE)) { CU_CHECK(cuMemGetAddressRange(&base, &size, (CUdeviceptr)addr)); flag = 1; CU_CHECK(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, base)); - } + } - if (mp_enable_ud) { - mp_dbg_msg("UD enabled, registering buffer for LOCAL_WRITE\n"); - flags = IBV_ACCESS_LOCAL_WRITE; - } else { - flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | + if (mp_enable_ud) { + mp_dbg_msg("UD enabled, registering buffer for LOCAL_WRITE\n"); + flags = IBV_ACCESS_LOCAL_WRITE; + } else { + flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; - } - mp_dbg_msg("ibv_reg_mr addr:%p size:%zu flags=0x%08x\n", addr, length, flags); - // note: register addr, not base. no advantage in registering the whole buffer as we don't - // maintain a registration cache yet - reg->mr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); - if (!reg->mr) { - mp_err_msg("ibv_reg_mr returned NULL for addr:%p size:%zu errno=%d(%s)\n", - addr, length, errno, strerror(errno)); -#ifdef DADO_DEBUG - check_cuda_ptr(addr, length); - spin_forever(); - free(reg); - MPI_Abort(MPI_COMM_WORLD, 1); -#endif - return MP_FAILURE; - } - - reg->key = reg->mr->lkey; + } - mp_dbg_msg("reg=%p key=%x\n", reg, reg->key); + if(exp_flags & IBV_EXP_ACCESS_ON_DEMAND) + { + struct ibv_exp_device_attr dattr; + dattr.comp_mask = IBV_EXP_DEVICE_ATTR_ODP | IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; + int ret = ibv_exp_query_device(ib_ctx->context, &dattr); + if (!(dattr.exp_device_cap_flags & IBV_EXP_DEVICE_ODP)) + { + mp_err_msg("ODP not supported!\n"); + return MP_FAILURE; + } - *reg_ = reg; - - return MP_SUCCESS; + //In LibMP we only support implicit ODP for the moment + if(!(dattr.odp_caps.general_odp_caps & IBV_EXP_ODP_SUPPORT_IMPLICIT)) + { + mp_err_msg("Implicit ODP not supported!\n"); + return MP_FAILURE; + } + + //Implicit On-Demand Paging is supported. + struct ibv_exp_reg_mr_in in; + in.pd = ib_ctx->pd; + in.addr = 0; + in.length = IBV_EXP_IMPLICIT_MR_SIZE; + in.exp_access = IBV_EXP_ACCESS_ON_DEMAND|flags; + in.comp_mask = 0; + + mp_dbg_msg("ibv_exp_reg_mr addr:0 size:IBV_EXP_IMPLICIT_MR_SIZE flags=0x%08x\n", in.exp_access); + reg->mr = ibv_exp_reg_mr(&in); + } + else + { + // note: register addr, not base. no advantage in registering the whole buffer as we don't + // maintain a registration cache yet + mp_dbg_msg("ibv_reg_mr addr:%p size:%zu flags=0x%08x\n", addr, length, flags); + reg->mr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); + } + + if (!reg->mr) { + mp_err_msg("ibv_reg_mr returned NULL for addr:%p size:%zu errno=%d(%s)\n", + addr, length, errno, strerror(errno)); + + #ifdef DADO_DEBUG + check_cuda_ptr(addr, length); + spin_forever(); + free(reg); + MPI_Abort(MPI_COMM_WORLD, 1); + #endif + + return MP_FAILURE; + } + + reg->key = reg->mr->lkey; + + mp_dbg_msg("reg=%p key=%x\n", reg, reg->key); + + *reg_ = reg; + + return MP_SUCCESS; } int mp_deregister(mp_reg_t *reg_) { - int ret=0; - struct mp_reg *reg = (struct mp_reg *) *reg_; + int ret=0; + struct mp_reg *reg = (struct mp_reg *) *reg_; - assert(reg); - assert(reg->mr); - ret = ibv_dereg_mr(reg->mr); - if(ret) - { + assert(reg); + assert(reg->mr); + ret = ibv_dereg_mr(reg->mr); + if(ret) + { mp_err_msg("ibv_dereg_mr returned %d\n", ret); return MP_FAILURE; - } + } - free(reg); - return MP_SUCCESS; + free(reg); + return MP_SUCCESS; } char shm_filename[100]; @@ -1206,7 +1239,7 @@ int mp_init (MPI_Comm comm, int *peers, int count, int init_flags, int gpu_id) } if (mp_enable_ud) { - int result = mp_register(ud_padding, UD_ADDITION, &ud_padding_reg); + int result = mp_register(ud_padding, UD_ADDITION, &ud_padding_reg, 0); assert(result == MP_SUCCESS); } @@ -1820,7 +1853,7 @@ int mp_window_create(void *addr, size_t size, mp_window_t *window_t) assert(exchange_win != NULL); window->reg=NULL; - result = mp_register(addr, size, &window->reg); + result = mp_register(addr, size, &window->reg, 0); assert(result == MP_SUCCESS); exchange_win[mpi_comm_rank].base_addr = addr; From 8cccb9afc6f498d2979e74a6532d05ef3d607add Mon Sep 17 00:00:00 2001 From: e-ago Date: Mon, 2 Jul 2018 16:20:15 +0200 Subject: [PATCH 2/5] mp_register() does cuPointerGetAttribute only without odp. comm_pingpong code reworked with additional options and odp mode --- comm_library/comm.cpp | 6 - comm_library/examples/comm_pingpong.cpp | 168 ++++++++++++++++-------- include/mp.h | 2 +- src/mp.c | 23 ++-- 4 files changed, 123 insertions(+), 76 deletions(-) diff --git a/comm_library/comm.cpp b/comm_library/comm.cpp index ede5b80..43dd06f 100644 --- a/comm_library/comm.cpp +++ b/comm_library/comm.cpp @@ -721,12 +721,6 @@ int comm_register_odp(void *buf, size_t size, comm_reg_t *creg) mp_reg_t *reg = (mp_reg_t*)creg; assert(reg); - if (!size) { - ret = -EINVAL; - comm_err("SIZE==0\n"); - goto out; - } - if (!*reg) { DBG("registering buffer %p\n", buf); MP_CHECK(mp_register(buf, size, reg, IBV_EXP_ACCESS_ON_DEMAND)); diff --git a/comm_library/examples/comm_pingpong.cpp b/comm_library/examples/comm_pingpong.cpp index 745c459..3697a64 100644 --- a/comm_library/examples/comm_pingpong.cpp +++ b/comm_library/examples/comm_pingpong.cpp @@ -54,14 +54,24 @@ do { \ } while (0) -comm_reg_t * sreg, * rreg; +comm_reg_t * sreg, * rreg, * odpreg; int comm_size, my_rank, device_id; unsigned char * send_buf[MAX_PEERS]; unsigned char * recv_buf[MAX_PEERS]; -int use_gpu_buffers=0; int tot_iters=MAX_ITERS; -int max_size=BUF_SIZE; +int buf_size=BUF_SIZE; +int use_gpu_buffers=0; int validate=0; +int use_odp=0; + +static void usage() +{ + printf("Options:\n"); + printf(" -g allocate GPU intead of CPU memory buffers\n"); + printf(" -o use implici ODP\n"); + printf(" -n= number of exchanges (default %d)\n", MAX_ITERS); + printf(" -s= S/R buffer size (default %d)\n", BUF_SIZE); +} int async_exchange(int iter) { int peer, n_sreqs=0, n_rreqs=0; @@ -73,8 +83,10 @@ int async_exchange(int iter) { { if(peer != my_rank) { - comm_irecv(recv_buf[peer], max_size, MPI_CHAR, &rreg[peer], peer, &recv_requests[n_rreqs]); - comm_send_ready_on_stream(peer, &ready_requests[n_rreqs], NULL); + COMM_CHECK(comm_irecv(recv_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &rreg[peer]), + peer, &recv_requests[n_rreqs])); + COMM_CHECK(comm_send_ready_on_stream(peer, &ready_requests[n_rreqs], NULL)); n_rreqs++; } } @@ -83,18 +95,20 @@ int async_exchange(int iter) { { if(peer != my_rank) { - comm_wait_ready_on_stream(peer,NULL); - comm_isend_on_stream(send_buf[peer], max_size, MPI_CHAR, - &sreg[peer], peer, &send_requests[n_sreqs], NULL); + COMM_CHECK(comm_wait_ready_on_stream(peer,NULL)); + COMM_CHECK(comm_isend_on_stream(send_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &sreg[peer]), + peer, &send_requests[n_sreqs], NULL)); n_sreqs++; } } - comm_wait_all_on_stream(n_rreqs, recv_requests, NULL); - comm_wait_all_on_stream(n_sreqs, send_requests, NULL); - //comm_wait_all_on_stream(n_rreqs, ready_requests, NULL); + COMM_CHECK(comm_wait_all_on_stream(n_rreqs, recv_requests, NULL)); + COMM_CHECK(comm_wait_all_on_stream(n_rreqs, ready_requests, NULL)); + COMM_CHECK(comm_wait_all_on_stream(n_sreqs, send_requests, NULL)); + //printf("Before progress, %d iter, %d n_rreqs, %d n_sreqs, %d comm_size\n", iter, n_rreqs, n_sreqs, comm_size); comm_progress(); } @@ -108,8 +122,10 @@ int sync_exchange(int iter) { { if(peer != my_rank) { - comm_irecv(recv_buf[peer], max_size, MPI_CHAR, &rreg[peer], peer, &recv_requests[n_rreqs]); - comm_send_ready(peer, &ready_requests[n_rreqs]); + COMM_CHECK(comm_irecv(recv_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &rreg[peer]), + peer, &recv_requests[n_rreqs])); + COMM_CHECK(comm_send_ready(peer, &ready_requests[n_rreqs])); n_rreqs++; } } @@ -118,13 +134,15 @@ int sync_exchange(int iter) { { if(peer != my_rank) { - comm_wait_ready(peer); - comm_isend(send_buf[peer], max_size, MPI_CHAR, - &sreg[peer], peer, &send_requests[n_sreqs]); + COMM_CHECK(comm_wait_ready(peer)); + COMM_CHECK(comm_isend(send_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &sreg[peer]), + peer, &send_requests[n_sreqs])); n_sreqs++; } } + comm_flush(); } @@ -132,28 +150,45 @@ int main(int argc, char **argv) { int i,j,k,iter; char *value; double tot_time, start_time, stop_time; - - value = getenv("USE_GPU_BUFFERS"); - if (value != NULL) { - use_gpu_buffers = atoi(value); - } - - value = getenv("MAX_SIZE"); - if (value != NULL) { - max_size = atoi(value); - } - - value = getenv("TOT_ITERS"); - if (value != NULL) { - tot_iters = atoi(value); - if(tot_iters > MAX_ITERS) - { - printf("ERROR: max iters number allowed=%d\n", MAX_ITERS); - tot_iters = MAX_ITERS; + int c; + + while (1) { + + c = getopt(argc, argv, "gon:s:"); + if (c == -1) + break; + + switch (c) { + case 'g': + use_gpu_buffers=1; + printf("Using GPU memory for communication buffers\n"); + break; + + case 'n': + tot_iters = strtol(optarg, NULL, 0); + if(tot_iters > MAX_ITERS) + tot_iters = MAX_ITERS; + printf("Tot iters=%d\n", tot_iters); + break; + + case 'o': + use_odp=1; + printf("Using implicit ODP\n"); + break; + + + case 's': + buf_size=strtol(optarg, NULL, 0); + printf("Using buf_size=%d\n", buf_size); + + break; + + default: + usage(); + return 1; } - } - + value = getenv("ENABLE_VALIDATION"); if (value != NULL) { validate = atoi(value); @@ -166,48 +201,60 @@ int main(int argc, char **argv) { MPI_Comm_size(MPI_COMM_WORLD, &comm_size); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - assert(comm_size <= MAX_PEERS); + if (comm_size < 2 || comm_size > MAX_PEERS) { + fprintf(stderr, "this test requires 2 async sa=%d, use_gpu_buffers=%d, max_size=%d, tot_iters=%d num peers=%d validate=%d\n", - comm_use_model_sa()?1:0, use_gpu_buffers, max_size, tot_iters, comm_size, validate); + printf("# SA Model=%d\n# use_gpu_buffers=%d\n# buf_size=%d\n# tot_iters=%d\n# num peers=%d\n# validate=%d\n# use_odp=%d\n", + comm_use_model_sa()?1:0, use_gpu_buffers, buf_size, tot_iters, comm_size, validate, use_odp); start_time = MPI_Wtime(); for(iter=0; itermr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); } From 57fb8bfcda03f193f05340a20a394addf1bd46a8 Mon Sep 17 00:00:00 2001 From: e-ago Date: Mon, 2 Jul 2018 16:22:31 +0200 Subject: [PATCH 3/5] minor fix debug print --- src/mp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mp.c b/src/mp.c index d498292..4817086 100644 --- a/src/mp.c +++ b/src/mp.c @@ -762,7 +762,7 @@ int mp_register(void *addr, size_t length, mp_reg_t *reg_, uint64_t exp_flags) IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; } - mp_dbg_msg(stderr, "exp_flags=%llx\n", exp_flags); + mp_dbg_msg("exp_flags=%llx\n", exp_flags); if(exp_flags & IBV_EXP_ACCESS_ON_DEMAND) { From ea34f238c10ff6babc82f459837f1f6091c9ccb2 Mon Sep 17 00:00:00 2001 From: e-ago Date: Mon, 2 Jul 2018 17:24:57 +0200 Subject: [PATCH 4/5] Adding error checks. Can't use ODP in combination with GMEM --- comm_library/examples/comm_pingpong.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/comm_library/examples/comm_pingpong.cpp b/comm_library/examples/comm_pingpong.cpp index 3697a64..0f0c6d9 100644 --- a/comm_library/examples/comm_pingpong.cpp +++ b/comm_library/examples/comm_pingpong.cpp @@ -194,16 +194,25 @@ int main(int argc, char **argv) { validate = atoi(value); } - if(!comm_use_comm()) - fprintf(stderr, "ERROR: pingpong + one sided for comm library only\n"); - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_size); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); if (comm_size < 2 || comm_size > MAX_PEERS) { - fprintf(stderr, "this test requires 2 Date: Mon, 2 Jul 2018 18:02:52 +0200 Subject: [PATCH 5/5] mp_register now uses ibv_exp_reg_mr instead of ibv_reg_mr --- comm_library/comm.cpp | 6 +-- comm_library/comm.h | 2 +- comm_library/examples/comm_pingpong.cpp | 2 +- src/mp.c | 61 ++++++++++++++----------- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/comm_library/comm.cpp b/comm_library/comm.cpp index 43dd06f..34390de 100644 --- a/comm_library/comm.cpp +++ b/comm_library/comm.cpp @@ -713,7 +713,7 @@ int comm_register(void *buf, size_t size, comm_reg_t *creg) return ret; } -int comm_register_odp(void *buf, size_t size, comm_reg_t *creg) +int comm_register_odp(comm_reg_t *creg) { assert(comm_initialized); int ret = 0; @@ -722,8 +722,8 @@ int comm_register_odp(void *buf, size_t size, comm_reg_t *creg) assert(reg); if (!*reg) { - DBG("registering buffer %p\n", buf); - MP_CHECK(mp_register(buf, size, reg, IBV_EXP_ACCESS_ON_DEMAND)); + DBG("registering implicit ODP MR\n"); + MP_CHECK(mp_register(NULL, 0, reg, IBV_EXP_ACCESS_ON_DEMAND)); } out: diff --git a/comm_library/comm.h b/comm_library/comm.h index fdd1173..947b608 100644 --- a/comm_library/comm.h +++ b/comm_library/comm.h @@ -125,7 +125,7 @@ extern "C" { int comm_prepare_wait_all(int count, comm_request_t *creqs); comm_dev_descs_t comm_prepared_requests(); int comm_register(void *buf, size_t size, comm_reg_t *creg); - int comm_register_odp(void *buf, size_t size, comm_reg_t *creg); + int comm_register_odp(comm_reg_t *creg); int comm_deregister(comm_reg_t *creg); int comm_select_device(int mpiRank); diff --git a/comm_library/examples/comm_pingpong.cpp b/comm_library/examples/comm_pingpong.cpp index 0f0c6d9..3ac82c8 100644 --- a/comm_library/examples/comm_pingpong.cpp +++ b/comm_library/examples/comm_pingpong.cpp @@ -251,7 +251,7 @@ int main(int argc, char **argv) { { odpreg = (comm_reg_t*)calloc(1, sizeof(comm_reg_t)); assert(odpreg); - COMM_CHECK(comm_register_odp(NULL, 0, &odpreg[0])); + COMM_CHECK(comm_register_odp(&odpreg[0])); } else { diff --git a/src/mp.c b/src/mp.c index 4817086..3537d6e 100644 --- a/src/mp.c +++ b/src/mp.c @@ -744,9 +744,9 @@ int mp_register(void *addr, size_t length, mp_reg_t *reg_, uint64_t exp_flags) /*set SYNC MEMOPS if its device buffer*/ unsigned int type, flag; size_t size; - CUdeviceptr base; - CUresult curesult; - int flags=0; + int flags=0, ret=0; + struct ibv_exp_reg_mr_in in; + struct ibv_exp_device_attr dattr; struct mp_reg *reg = calloc(1, sizeof(struct mp_reg)); if (!reg) { @@ -762,11 +762,33 @@ int mp_register(void *addr, size_t length, mp_reg_t *reg_, uint64_t exp_flags) IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC; } + in.pd = ib_ctx->pd; + in.exp_access = flags; + in.comp_mask = 0; + + if(addr != NULL && length > 0) + { + CUdeviceptr base; + CUresult curesult; + + // note: register addr, not base. no advantage in registering the whole buffer as we don't + // maintain a registration cache yet + curesult = cuPointerGetAttribute((void *)&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr); + if ((curesult == CUDA_SUCCESS) && (type == CU_MEMORYTYPE_DEVICE)) { + CU_CHECK(cuMemGetAddressRange(&base, &size, (CUdeviceptr)addr)); + + flag = 1; + CU_CHECK(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, base)); + } + + in.addr = addr; + in.length = length; + } + mp_dbg_msg("exp_flags=%llx\n", exp_flags); if(exp_flags & IBV_EXP_ACCESS_ON_DEMAND) { - struct ibv_exp_device_attr dattr; dattr.comp_mask = IBV_EXP_DEVICE_ATTR_ODP | IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; int ret = ibv_exp_query_device(ib_ctx->context, &dattr); if (!(dattr.exp_device_cap_flags & IBV_EXP_DEVICE_ODP)) @@ -775,38 +797,25 @@ int mp_register(void *addr, size_t length, mp_reg_t *reg_, uint64_t exp_flags) return MP_FAILURE; } - //In LibMP we only support implicit ODP for the moment + //In LibMP we only support implicit ODP if(!(dattr.odp_caps.general_odp_caps & IBV_EXP_ODP_SUPPORT_IMPLICIT)) { mp_err_msg("Implicit ODP not supported!\n"); return MP_FAILURE; } - + +#if defined(__x86_64__) || defined (__i386__) + mp_warn_msg("NOTE: This implicit ODP MR can't be used with GMEM buffers on x86 systems\n"); +#endif //Implicit On-Demand Paging is supported. - struct ibv_exp_reg_mr_in in; - in.pd = ib_ctx->pd; in.addr = 0; in.length = IBV_EXP_IMPLICIT_MR_SIZE; - in.exp_access = IBV_EXP_ACCESS_ON_DEMAND|flags; - in.comp_mask = 0; - - mp_dbg_msg("ibv_exp_reg_mr addr:0 size:IBV_EXP_IMPLICIT_MR_SIZE flags=0x%08x\n", in.exp_access); - reg->mr = ibv_exp_reg_mr(&in); + in.exp_access |= IBV_EXP_ACCESS_ON_DEMAND; } - else - { - // note: register addr, not base. no advantage in registering the whole buffer as we don't - // maintain a registration cache yet - curesult = cuPointerGetAttribute((void *)&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr); - if ((curesult == CUDA_SUCCESS) && (type == CU_MEMORYTYPE_DEVICE)) { - CU_CHECK(cuMemGetAddressRange(&base, &size, (CUdeviceptr)addr)); - flag = 1; - CU_CHECK(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, base)); - } - mp_dbg_msg("ibv_reg_mr addr:%p size:%zu flags=0x%08x\n", addr, length, flags); - reg->mr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); - } + mp_dbg_msg("ibv_exp_reg_mr addr:0 size:IBV_EXP_IMPLICIT_MR_SIZE flags=0x%08x\n", in.exp_access); + reg->mr = ibv_exp_reg_mr(&in); +// reg->mr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); if (!reg->mr) { mp_err_msg("ibv_reg_mr returned NULL for addr:%p size:%zu errno=%d(%s)\n",