From dfede96d1c453ee16bfb3db60e03ff58c1b71520 Mon Sep 17 00:00:00 2001
From: lg
Date: Thu, 5 Nov 2020 23:51:28 +0000
Subject: [PATCH] support boot from MPI for debug

When KUNGFU_DEBUG_BOOT_FROM_MPI is set:
 * ParseConfigFromEnv builds the config from OMPI_COMM_WORLD_SIZE and
   OMPI_COMM_WORLD_RANK, placing every peer on 127.0.0.1;
 * kungfu_get_cuda_index returns OMPI_COMM_WORLD_LOCAL_RANK (0 if unset);
 * Peer's constructor and destructor carry placeholders for MPI_Init and
   MPI_Finalize (neither is called yet).

---
 srcs/cpp/src/peer.cpp        | 11 ++++++++++-
 srcs/cpp/src/python/cuda.cpp |  5 +++++
 srcs/go/kungfu/env/config.go |  3 +++
 srcs/go/kungfu/env/envs.go   |  2 ++
 srcs/go/kungfu/env/mpi.go    | 41 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 61 insertions(+), 1 deletion(-)
 create mode 100644 srcs/go/kungfu/env/mpi.go

diff --git a/srcs/cpp/src/peer.cpp b/srcs/cpp/src/peer.cpp
index 484f1922e..261181be4 100644
--- a/srcs/cpp/src/peer.cpp
+++ b/srcs/cpp/src/peer.cpp
@@ -10,9 +10,18 @@ Peer::Peer()
         fprintf(stderr, "%s failed\n", "GoKungfuInit");
         exit(1);
     }
+    if (std::getenv("KUNGFU_DEBUG_BOOT_FROM_MPI")) {
+        // TODO: call MPI_Init();
+    }
 }
 
-Peer::~Peer() { GoKungfuFinalize(); }
+Peer::~Peer()
+{
+    GoKungfuFinalize();
+    if (std::getenv("KUNGFU_DEBUG_BOOT_FROM_MPI")) {
+        // TODO: call MPI_Finalize();
+    }
+}
 
 bool Peer::Detached() const { return GoKungfuDetached(); }
 
diff --git a/srcs/cpp/src/python/cuda.cpp b/srcs/cpp/src/python/cuda.cpp
index 00373ae43..c0429b315 100644
--- a/srcs/cpp/src/python/cuda.cpp
+++ b/srcs/cpp/src/python/cuda.cpp
@@ -29,6 +29,11 @@ std::vector parse_cuda_visible_devices(const std::string &val)
 int kungfu_get_cuda_index()
 {
     int dev = 0;
+    if (std::getenv("KUNGFU_DEBUG_BOOT_FROM_MPI")) {
+        const char *ptr = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK");
+        if (ptr != nullptr) { dev = std::stoi(ptr); }
+        return dev;
+    }
     {
         const char *ptr = std::getenv("KUNGFU_CUDA_VISIBLE_DEVICES");
         if (ptr != nullptr) { dev = std::stoi(ptr); }
diff --git a/srcs/go/kungfu/env/config.go b/srcs/go/kungfu/env/config.go
index b50993c37..3e2dee7d0 100644
--- a/srcs/go/kungfu/env/config.go
+++ b/srcs/go/kungfu/env/config.go
@@ -22,6 +22,9 @@ type Config struct {
 }
 
 func ParseConfigFromEnv() (*Config, error) {
+	if _, ok := os.LookupEnv(BootFromMPI); ok {
+		return ParseConfigFromOpenMPIEnv()
+	}
 	if _, ok := os.LookupEnv(SelfSpecEnvKey); !ok {
 		return singleEnv(), nil
 	}
diff --git a/srcs/go/kungfu/env/envs.go b/srcs/go/kungfu/env/envs.go
index 2785ded0b..cb46b07f1 100644
--- a/srcs/go/kungfu/env/envs.go
+++ b/srcs/go/kungfu/env/envs.go
@@ -15,4 +15,6 @@ const (
 	ProcStartTimestamp = `KUNGFU_PROC_START_TIMESTAMP`
 
 	AllowNvLink = `KUNGFU_ALLOW_NVLINK`
+
+	BootFromMPI = `KUNGFU_DEBUG_BOOT_FROM_MPI`
 )
diff --git a/srcs/go/kungfu/env/mpi.go b/srcs/go/kungfu/env/mpi.go
new file mode 100644
index 000000000..1448a9c70
--- /dev/null
+++ b/srcs/go/kungfu/env/mpi.go
@@ -0,0 +1,41 @@
+package env
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+
+	"github.com/lsds/KungFu/srcs/go/kungfu/base"
+	"github.com/lsds/KungFu/srcs/go/plan"
+)
+
+// ParseConfigFromOpenMPIEnv is for debug only.
+// It reads OMPI_COMM_WORLD_SIZE and OMPI_COMM_WORLD_RANK from the environment
+// and builds a Config whose peers all live on 127.0.0.1.
+func ParseConfigFromOpenMPIEnv() (*Config, error) {
+	mpiSize, err := strconv.Atoi(os.Getenv(`OMPI_COMM_WORLD_SIZE`))
+	if err != nil {
+		return nil, err
+	}
+	mpiRank, err := strconv.Atoi(os.Getenv(`OMPI_COMM_WORLD_RANK`))
+	if err != nil {
+		return nil, err
+	}
+	hl := plan.HostList{{
+		IPv4:  plan.MustParseIPv4(`127.0.0.1`),
+		Slots: mpiSize,
+	}}
+	peers, err := hl.GenPeerList(mpiSize, plan.DefaultPortRange)
+	if err != nil {
+		return nil, err
+	}
+	// Guard the index below: a rank outside the generated list would panic.
+	if mpiRank < 0 || mpiRank >= len(peers) {
+		return nil, fmt.Errorf("OMPI_COMM_WORLD_RANK %d out of range [0, %d)", mpiRank, len(peers))
+	}
+	return &Config{
+		Self:      peers[mpiRank],
+		InitPeers: peers,
+		Strategy:  base.DefaultStrategy,
+	}, nil
+}