报错信息大致如下。直接 Run 没有问题,但一用 Debug 运行,程序就卡住不动了:
Traceback (most recent call last):
File "/home/mapengsen/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 467, in start_client
s.connect((host, port))
TimeoutError: timed out
Traceback (most recent call last):
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
[14:30:48.928250] [14:30:48.928492] [14:30:48.928599] [14:30:48.950877] [14:30:48.951222] [14:30:48.951351] File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
Could not connect to 127.0.0.1: 56945
Traceback (most recent call last):
File "/home/mapengsen/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_comm.py", line 467, in start_client
s.connect((host, port))
TimeoutError: timed out
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 26, in <module>
Traceback (most recent call last):
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
from torch._inductor.codecache import code_hash, CompiledFxGraph
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1424, in <module>
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 26, in <module>
from torch._inductor.codecache import code_hash, CompiledFxGraph
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1424, in <module>
AsyncCompile.warm_pool()AsyncCompile.warm_pool()
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1363, in warm_pool
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1363, in warm_pool
pool._adjust_process_count()
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/concurrent/futures/process.py", line 697, in _adjust_process_count
pool._adjust_process_count()
File "/home/mapengsen/anaconda3/envs/MDT2/lib/python3.10/concurrent/futures/process.py", line 697, in _adjust_process_count
Could not connect to 127.0.0.1: 56945
后来才发现,问题出在 import 自己定义的 datasets 模块的时候:我在这个 datasets 文件里直接写了测试代码,而这段测试代码本身有错误;主程序又 import 了这个 datasets,import 时模块顶层的测试代码会被执行,所以调试时一直卡住不动。解决办法是把 datasets 里报错的测试代码删除,只保留方法部分:
def collate_fn_paired_skip_invalid(batch):
    """Collate a batch of paired samples, skipping samples with missing entries.

    The sample layout is decided from the length of the first sample only:

    * 5 elements -- single-task layout; a sample is kept when positions 0
      and 2 are not ``None``.
    * anything else -- treated as the multi-task layout (7 elements); a
      sample is kept when positions 0, 2 and 4 are not ``None``.

    The last element of each sample is presumably the task id (per the
    original author's notes) -- TODO confirm against the dataset class.

    Args:
        batch: Non-empty sequence of indexable samples, as handed to a
            ``collate_fn`` by a ``DataLoader``.

    Returns:
        The result of ``default_collate`` over the kept samples. If no
        sample survives the filter, a tuple of empty placeholder tensors
        instead: one ``(torch.empty(0), torch.empty(0, 0))`` pair per
        checked position, followed by an empty ``torch.long`` tensor
        (5 tensors for the single-task layout, 7 for the multi-task one).

    Raises:
        IndexError: If ``batch`` is empty, because the layout is read from
            ``batch[0]``.
    """
    # Positions that must be present for a sample to be usable.
    required_positions = (0, 2) if len(batch[0]) == 5 else (0, 2, 4)

    valid_batch_items = [
        item
        for item in batch
        if all(item[pos] is not None for pos in required_positions)
    ]

    if valid_batch_items:
        return torch.utils.data.dataloader.default_collate(valid_batch_items)

    # Every sample was invalid: hand back empty tensors with the same arity
    # as a normal batch so the caller can still unpack the result.
    placeholders = []
    for _ in required_positions:
        placeholders.append(torch.empty(0))
        placeholders.append(torch.empty(0, 0))
    placeholders.append(torch.empty(0, dtype=torch.long))
    return tuple(placeholders)


# NOTE: delete everything below this function (the leftover module-level
# test code) so that importing this module cannot raise an error.
#
# # --- 主训练循环 ---
# trained_models_per_task = {}
#
# # 假设您在这里定义了 all_task_names
# all_task_names = [['A_bioavailability_ma'], ['A_hia_hou'], ['A_bioavailability_ma', 'A_hia_hou']]
#
# for current_task_names in all_task_names:
# task_key = '+'.join(current_task_names) # 创建任务组合的键名
# print(f"\n--- 开始为任务组合: {task_key} 准备数据和模型 (Paired Data) ---")