-
Notifications
You must be signed in to change notification settings - Fork 488
Description
在跑您的ddp_trainer.py时 accelerate launch --config_file defualt_config.yaml ddp_trainer.py
Traceback (most recent call last):
[rank0]: File "/root/LLM_test/ddp_trainer.py", line 105, in <module>
[rank0]: trainer.train()
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/transformers/trainer.py", line 2164, in train
[rank0]: return inner_training_loop(
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/transformers/trainer.py", line 2323, in _inner_training_loop
[rank0]: model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1333, in prepare
[rank0]: result = self._prepare_deepspeed(*args)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1849, in _prepare_deepspeed
[rank0]: engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/__init__.py", line 193, in initialize
[rank0]: engine = DeepSpeedEngine(args=args,
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 271, in __init__
[rank0]: self._configure_distributed_model(model)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1213, in _configure_distributed_model
[rank0]: self._broadcast_model()
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1131, in _broadcast_model
[rank0]: dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 117, in log_wrapper
[rank0]: return func(*args, **kwargs)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 224, in broadcast
[rank0]: return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 200, in broadcast
[rank0]: return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
[rank0]: return func(*args, **kwargs)
[rank0]: File "/root/anaconda3/envs/liuyu_llm/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2421, in broadcast
[rank0]: work = group.broadcast([tensor], opts)
[rank0]: ValueError: Tensors must be contiguous
这应该如何解决