python - RuntimeError:在设备字符串开始时需要 cpu、cuda、mkldnn、opengl、opencl、ideep、hip、msnpu 设备类型之一:0
问题描述
我正在尝试运行此代码https://github.com/AshwinRJ/Federated-Learning-PyTorch
我得到错误 RuntimeError: Expected one of cpu, cuda, mkldnn, opengl, opencl, ideep, hip, msnpu device type at start of device string: 0. 如何解决这个问题。ecan上的任何指南如何解决这些错误..请帮助
import os
import copy
import time
import pickle
import numpy as np
from tqdm import tqdm
import torch
from tensorboardX import SummaryWriter
from options import args_parser
from update import LocalUpdate, test_inference
from models import MLP, CNNMnist, CNNFashion_Mnist, CNNCifar
from utils import get_dataset, average_weights, exp_details
if __name__ == '__main__':
start_time = time.time()
# define paths
path_project = os.path.abspath('..')
logger = SummaryWriter('../logs')
args = args_parser()
exp_details(args)
if args.gpu:
torch.cuda.set_device(args.gpu)
device = 'cuda' if args.gpu else 'cpu'
# load dataset and user groups
train_dataset, test_dataset, user_groups = get_dataset(args)
# BUILD MODEL
if args.model == 'cnn':
# Convolutional neural netork
if args.dataset == 'mnist':
global_model = CNNMnist(args=args)
elif args.dataset == 'fmnist':
global_model = CNNFashion_Mnist(args=args)
elif args.dataset == 'cifar':
global_model = CNNCifar(args=args)
elif args.model == 'mlp':
# Multi-layer preceptron
img_size = train_dataset[0][0].shape
len_in = 1
for x in img_size:
len_in *= x
global_model = MLP(dim_in=len_in, dim_hidden=64,
dim_out=args.num_classes)
else:
exit('Error: unrecognized model')
# Set the model to train and send it to device.
global_model.to(device)
global_model.train()
print(global_model)
# copy weights
global_weights = global_model.state_dict()
# Training
train_loss, train_accuracy = [], []
val_acc_list, net_list = [], []
cv_loss, cv_acc = [], []
print_every = 2
val_loss_pre, counter = 0, 0
for epoch in tqdm(range(args.epochs)):
local_weights, local_losses = [], []
print(f'\n | Global Training Round : {epoch+1} |\n')
global_model.train()
m = max(int(args.frac * args.num_users), 1)
idxs_users = np.random.choice(range(args.num_users), m, replace=False)
for idx in idxs_users:
local_model = LocalUpdate(args=args, dataset=train_dataset,
idxs=user_groups[idx], logger=logger)
w, loss = local_model.update_weights(
model=copy.deepcopy(global_model), global_round=epoch)
local_weights.append(copy.deepcopy(w))
local_losses.append(copy.deepcopy(loss))
# update global weights
global_weights = average_weights(local_weights)
# update global weights
global_model.load_state_dict(global_weights)
loss_avg = sum(local_losses) / len(local_losses)
train_loss.append(loss_avg)
# Calculate avg training accuracy over all users at every epoch
list_acc, list_loss = [], []
global_model.eval()
for c in range(args.num_users):
local_model = LocalUpdate(args=args, dataset=train_dataset,
idxs=user_groups[idx], logger=logger)
acc, loss = local_model.inference(model=global_model)
list_acc.append(acc)
list_loss.append(loss)
train_accuracy.append(sum(list_acc)/len(list_acc))
# print global training loss after every 'i' rounds
if (epoch+1) % print_every == 0:
print(f' \nAvg Training Stats after {epoch+1} global rounds:')
print(f'Training Loss : {np.mean(np.array(train_loss))}')
print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))
# Test inference after completion of training
test_acc, test_loss = test_inference(args, global_model, test_dataset)
print(f' \n Results after {args.epochs} global rounds of training:')
print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
print("|---- Test Accuracy: {:.2f}%".format(100*test_acc))
# Saving the objects train_loss and train_accuracy:
file_name = '../save/objects/{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}].pkl'.\
format(args.dataset, args.model, args.epochs, args.frac, args.iid,
args.local_ep, args.local_bs)
with open(file_name, 'wb') as f:
pickle.dump([train_loss, train_accuracy], f)
print('\n Total Run Time: {0:0.4f}'.format(time.time()-start_time))
# PLOTTING (optional)
# import matplotlib
# import matplotlib.pyplot as plt
# matplotlib.use('Agg')
# Plot Loss curve
# plt.figure()
# plt.title('Training Loss vs Communication rounds')
# plt.plot(range(len(train_loss)), train_loss, color='r')
# plt.ylabel('Training loss')
# plt.xlabel('Communication Rounds')
# plt.savefig('../save/fed_{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}]_loss.png'.
# format(args.dataset, args.model, args.epochs, args.frac,
# args.iid, args.local_ep, args.local_bs))
#
# # Plot Average Accuracy vs Communication rounds
# plt.figure()
# plt.title('Average Accuracy vs Communication rounds')
# plt.plot(range(len(train_accuracy)), train_accuracy, color='k')
# plt.ylabel('Average Accuracy')
# plt.xlabel('Communication Rounds')
# plt.savefig('../save/fed_{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}]_acc.png'.
# format(args.dataset, args.model, args.epochs, args.frac,
# args.iid, args.local_ep, args.local_bs))
输出日志
Experimental details:
Model : cnn
Optimizer : sgd
Learning : 0.01
Global Rounds : 10
Federated parameters:
IID
Fraction of users : 0.1
Local Batch size : 10
Local Epochs : 10
Traceback (most recent call last):
File "src/federated_main.py", line 32, in <module>
torch.cuda.set_device(args.gpu)
File "C:\Users\S\AppData\Local\Programs\Python\Python37\lib\site-packages\torch\cuda\__init__.py", line 298, in set_device
device = _get_device_index(device)
File "C:\Users\S\AppData\Local\Programs\Python\Python37\lib\site-packages\torch\cuda\_utils.py", line 20, in _get_device_index
device = torch.device(device)
RuntimeError: Expected one of cpu, cuda, mkldnn, opengl, opencl, ideep, hip, msnpu device type at start of device string: 0
PS C:\Users\S\Downloads\Federated-Learning-PyTorch-master\Federated-Learning-PyTorch-master>
解决方案
If you are using just a single gpu then try to replace
if args.gpu:
torch.cuda.set_device(args.gpu)
device = 'cuda' if args.gpu else 'cpu'
with
if torch.cuda.is_available():
device = "cuda:0"
else:
device = "cpu"
device = torch.device(device)
推荐阅读
- python - __str__(self) 函数不适用于在 Python 中打印对象
- accordion - Bootstrap手风琴(点击按钮,手风琴的所有其他部分应关闭并隐藏在显示中)
- c# - 如何在同一个应用程序中触发天蓝色功能
- r - 将表列表更改为 R 中的 data.frame
- algorithm - 如何检查流网络是否包含唯一的最大流量?
- javascript - 什么是创建动态更新 N 个 HTML 输出的 N 个文本字段输入的非天真或优雅的方式?
- c++ - 在特定的儿童情况下未能渗透到 MIN 二元堆
- angular - @angular-redux/store:如何为 redux store 编写单元测试?
- css - React Web 导航栏树形菜单激活
- python - 如何将字符串添加到列表的每个值,然后使该字符串的值向上计数?