From c99f1d7349e471ddfae767a42587a4c9b462f17d Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 6 Jun 2024 14:23:48 -0700 Subject: [PATCH 1/6] also log/write container version --- launcher_scripts/nemo_launcher/core/stages.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 234f0d360..677eec6f9 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -210,17 +210,24 @@ def _make_nemo_path_command(self) -> List[str]: ] def _make_git_log_command(self, stage_cfg_path: Path): - """log last 5 commits for repos- NeMo, megatron-lm, NeMo-Framework-Launcher or NeMo-Megatron-Launcher - 'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. We run git log for both for + """ + log HEAD commit for subset of repos in NeMo container, PyTorch and NeMo container version names + 'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. Try logging for both for backwards compatibility. """ append_to_file = f"{stage_cfg_path.parent}/git_log.txt" + if os.path.isfile(append_to_file) and os.path.getsize(append_to_file) > 0: + return [f"echo {append_to_file} already exists. Skipping generating new file..."] + + container_name = self.cfg.get("container", "") return [ - f"(echo PYT$\"NVIDIA_PYTORCH_VERSION\" && \ - git --git-dir=/opt/NeMo/.git log -n 5 --format='NeMo;%h;%aD;%s' && \ - git --git-dir=/opt/megatron-lm/.git log -n 5 --format='megatron-lm;%h;%aD;%s' && \ - git --git-dir=/opt/NeMo-Framework-Launcher/.git log -n 5 --format='NeMo-Framework-Launcher;%h;%aD;%s' && \ - git --git-dir=/opt/NeMo-Megatron-Launcher/.git log -n 5 --format='NeMo-Megatron-Launcher;%h;%aD;%s') > {append_to_file}" + f"(echo {container_name} && \ + echo PYT$\"NVIDIA_PYTORCH_VERSION\"; \ + git --git-dir=/opt/NeMo/.git log -n 1 --format='NeMo;%h;%aD;%s'; \ + git --git-dir=/opt/megatron-lm/.git log -n 1 --format='megatron-lm;%h;%aD;%s'; \ + git --git-dir=/opt/TransformerEngine/.git log -n 1 --format='megatron-lm;%h;%aD;%s'; \ + git --git-dir=/opt/NeMo-Framework-Launcher/.git log -n 1 --format='NeMo-Framework-Launcher;%h;%aD;%s'; \ + git --git-dir=/opt/NeMo-Megatron-Launcher/.git log -n 1 --format='NeMo-Megatron-Launcher;%h;%aD;%s') > {append_to_file}" ] def _make_k8s_spec_file( From c3821011be123dc358bab01be38cbc96baf83a65 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Fri, 14 Jun 2024 18:12:53 -0700 Subject: [PATCH 2/6] log PYT, NeMo and other repos --- launcher_scripts/nemo_launcher/core/stages.py | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 677eec6f9..e4b36d8af 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -211,25 +211,36 @@ def _make_nemo_path_command(self) -> List[str]: def _make_git_log_command(self, stage_cfg_path: Path): """ - log HEAD commit for subset of repos in NeMo container, PyTorch and NeMo container version names + log HEAD commit for subset of repos in NeMo container, version names for PyTorch and NeMo container 'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. Try logging for both for backwards compatibility. """ - append_to_file = f"{stage_cfg_path.parent}/git_log.txt" - if os.path.isfile(append_to_file) and os.path.getsize(append_to_file) > 0: - return [f"echo {append_to_file} already exists. Skipping generating new file..."] + filepath = os.path.join(f"{stage_cfg_path.parent}", "results", "git-info.log") - container_name = self.cfg.get("container", "") - return [ - f"(echo {container_name} && \ - echo PYT$\"NVIDIA_PYTORCH_VERSION\"; \ - git --git-dir=/opt/NeMo/.git log -n 1 --format='NeMo;%h;%aD;%s'; \ - git --git-dir=/opt/megatron-lm/.git log -n 1 --format='megatron-lm;%h;%aD;%s'; \ - git --git-dir=/opt/TransformerEngine/.git log -n 1 --format='megatron-lm;%h;%aD;%s'; \ - git --git-dir=/opt/NeMo-Framework-Launcher/.git log -n 1 --format='NeMo-Framework-Launcher;%h;%aD;%s'; \ - git --git-dir=/opt/NeMo-Megatron-Launcher/.git log -n 1 --format='NeMo-Megatron-Launcher;%h;%aD;%s') > {append_to_file}" + git_repos = [ + "NeMo", + "megatron-lm", + "TransformerEngine", + "NeMo-Framework-Launcher", + "NeMo-Megatron-Launcher", + "apex", + ] + + git_log_cmd = [ + f"git --git-dir=/opt/{repo}/.git log -n 1 --format='{repo};%h;%aD;%s'" + for repo in git_repos ] + container_info_cmd = [ + f"echo NeMo-Container-Version\;{self.cfg.get('container', '')}", + 'echo PyTorch-Container-Version\;PYT$"NVIDIA_PYTORCH_VERSION"', + ] + + # semi-colon delimiter ensures we run all above commands even after a failure + # circular brackets groups commands and ensures we write to file ONLY after all + # commands finish execution + return [f"({';'.join(git_log_cmd + container_info_cmd)}) > {filepath}"] + def _make_k8s_spec_file( self, template_root: str, cluster_parameters: Dict, job_path: JobPaths ): From ffbfb57bfd89e0cfce15084c8f3bd73eee7b9872 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 18 Jun 2024 11:03:37 -0700 Subject: [PATCH 3/6] log aligner, curator --- launcher_scripts/nemo_launcher/core/stages.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index e4b36d8af..e3ca3be42 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -215,7 +215,7 @@ def _make_git_log_command(self, stage_cfg_path: Path): 'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. Try logging for both for backwards compatibility. """ - filepath = os.path.join(f"{stage_cfg_path.parent}", "results", "git-info.log") + filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.txt") git_repos = [ "NeMo", @@ -224,6 +224,8 @@ def _make_git_log_command(self, stage_cfg_path: Path): "NeMo-Framework-Launcher", "NeMo-Megatron-Launcher", "apex", + "NeMo-Aligner", + "NeMo-Curator", ] git_log_cmd = [ From 45c1562c700b6e5030bae2ff8cfbac44fa247bbe Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 18 Jun 2024 12:33:33 -0700 Subject: [PATCH 4/6] .log file --- launcher_scripts/nemo_launcher/core/stages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index e3ca3be42..9aed4d434 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -215,7 +215,7 @@ def _make_git_log_command(self, stage_cfg_path: Path): 'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. Try logging for both for backwards compatibility. """ - filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.txt") + filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.log") git_repos = [ "NeMo", From e70785e9001c5b82d80960c497d5d94ea206a0c6 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 18 Jun 2024 13:04:05 -0700 Subject: [PATCH 5/6] remove old launcher --- launcher_scripts/nemo_launcher/core/stages.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 9aed4d434..b2dd592f4 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -212,8 +212,6 @@ def _make_nemo_path_command(self) -> List[str]: def _make_git_log_command(self, stage_cfg_path: Path): """ log HEAD commit for subset of repos in NeMo container, version names for PyTorch and NeMo container - 'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. Try logging for both for - backwards compatibility. """ filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.log") @@ -222,7 +220,6 @@ def _make_git_log_command(self, stage_cfg_path: Path): "megatron-lm", "TransformerEngine", "NeMo-Framework-Launcher", - "NeMo-Megatron-Launcher", "apex", "NeMo-Aligner", "NeMo-Curator", From 95113a981bd0e54ab47cda15bdff434428e90134 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Sun, 4 Aug 2024 16:15:06 +0530 Subject: [PATCH 6/6] only bcm cluster --- launcher_scripts/nemo_launcher/core/stages.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index b2dd592f4..cf382a075 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -639,7 +639,8 @@ def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: command_groups = [[]] command_groups[0] += self._make_wandb_login_command() command_groups[0] += self._make_nemo_path_command() - command_groups[0] += self._make_git_log_command(stage_cfg_path) + if self.cluster == "bcm": + command_groups[0] += self._make_git_log_command(stage_cfg_path) # command_groups[0] += self._make_numa_mapping_command() # _cuda_device_max_connections and _cuda_visible_devices cannot be used as command prefix on BCP