python - 如何在 pytorch 中将批量序列输入 CNN 和 LSTM?
问题描述
我有一个包含数千个视频的视频数据集,每个视频有 100 帧。输入是 2 个视频,由 video_x 和 video_y 组成,每个视频有 100 帧,每个输入的大小为 [2,100,3,600,600],遵循 [批次、帧、通道、高度、宽度] 的顺序。输出的大小也是 [2,100,3,600,600]。
我使用现成的视频数据加载器来提取帧并将它们提供给网络。
我通过将批次与帧相乘,把输入的维度从 5D 减小到 4D(从 [batches, frames, channels, height, width] 变为 [batches*frames, channels, height, width]),
以便能够输入到 conv2d 中,并且在 forward 函数的末尾,我将其重新整形为原始大小。但是降维后输入的大小变为 [200,3,600,600],
将 200 帧一次性输入 CNN 对网络来说是一个巨大的负担,并且会报错:
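RuntimeError: DataLoader worker (pid 2002326) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.(即:DataLoader 的 worker 进程(pid 2002326)被信号终止:总线错误。可能是 DataLoader 的 worker 进程共享内存不足,请尝试提高共享内存限制。)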
我想知道如何通过将帧分成批次将它们输入网络?例如,不是喂 100 帧,而是加载 5 批 20 帧。为了简化,我从代码中删除了 LSTM。
# ---- Hyper-parameters -------------------------------------------------
# `num` is the count of sample indices that the dataset class splits into
# train / test; the remaining values configure training and the network.
num, epochs, batch_size = 1000, 1, 2
lr, weight_decay = 1e-5, 1e-7
num_filters, kernel = 16, 3
num_workers = 4

# ---- Reproducibility: seed every RNG the script touches ---------------
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

# ---- CUDA / cuDNN runtime switches ------------------------------------
torch.cuda.empty_cache()
os.environ.update(CUDA_LAUNCH_BLOCKING="1")
torch.backends.cudnn.enabled = False
# NOTE(review): cuDNN is switched off on the line above, so this benchmark
# flag presumably has no effect -- confirm which of the two was intended.
torch.backends.cudnn.benchmark = True
def loader(idx, label, path0=None):
    """Load the video at position ``idx`` from the CSV that belongs to ``label``.

    The previous implementation ignored ``idx``: every call built a shuffled
    DataLoader and returned its first (random) batch, so the clips fetched for
    'video_x', 'video_y' and 'video_output' of one sample were unrelated to
    each other. This version reads exactly the requested row, which keeps the
    three streams aligned.

    Args:
        idx: Row index of the video inside the CSV-backed dataset.
        label: One of 'video_x', 'video_y' or 'video_output'.
        path0: Directory that contains the CSV files. ``None`` (default)
            means the current working directory at call time.

    Returns:
        The collated batch produced by a DataLoader with ``batch_size=1``, so
        the result keeps a leading batch axis of length 1 and callers that
        index ``[0]`` to drop it keep working.
        (Presumably [1, channels, frames, height, width], judging by the
        axis swap done by the caller -- TODO confirm.)

    Raises:
        ValueError: If ``label`` is not one of the three known labels.
    """
    csv_by_label = {
        'video_x': "video_load_x.csv",
        'video_y': "video_load_y.csv",
        'video_output': "video_output.csv",
    }
    if label not in csv_by_label:
        raise ValueError(
            f"unknown label {label!r}; expected one of {sorted(csv_by_label)}")

    # Resolve the default lazily so a later os.chdir() is honoured.
    if path0 is None:
        path0 = os.getcwd()

    # Named `video_dataset` so it does not share a name with the `dataset` class.
    video_dataset = datasets.VideoDataset(
        os.path.join(path0, csv_by_label[label]),
        transform=torchvision.transforms.Compose([
            transforms.VideoFilePathToTensor(max_len=None, fps=1, padding_mode='last')]))

    # Restrict the dataset to the requested row and disable shuffling so the
    # same idx always yields the same video.
    single_item = torch.utils.data.Subset(video_dataset, [int(idx)])
    data_loader = torch.utils.data.DataLoader(single_item, batch_size=1, shuffle=False)
    return next(iter(data_loader))
class dataset(Dataset):
    """Map-style dataset yielding ``(video_x, video_y, video_output)`` triples.

    ``np.arange(num)`` is split once into a train and a test index list
    (fixed ``random_state=42``); ``is_train`` selects which list the instance
    serves. Every item is fetched through the injected ``loader`` callable.
    """

    def __init__(self, num, path0=os.getcwd(), train_ratio = 0.8,
                 is_train=True, loader=loader):
        # `path0` is accepted for interface compatibility but is not used here.
        index_split = train_test_split(np.arange(num),
                                       train_size=train_ratio,
                                       random_state=42)
        self.train_list, self.test_list = index_split
        self.is_train = is_train
        self.loader = loader

    def _active_indices(self):
        """Return the index list matching the current mode."""
        # Identity check kept on purpose: only the literal True selects train.
        return self.train_list if self.is_train is True else self.test_list

    def __len__(self):
        return len(self._active_indices())

    def __getitem__(self, idx):
        """Return the three clips for position ``idx`` of the active split.

        For each label the loader result's first element is taken (dropping
        the loader's batch axis) and its first two axes are swapped.
        (The original notes give the result as [100, 3, 600, 600] --
        TODO confirm against the loader.)
        """
        fidx = self._active_indices()[idx]
        return tuple(
            np.transpose(self.loader(fidx, stream)[0], (1, 0, 2, 3))
            for stream in ('video_x', 'video_y', 'video_output')
        )
def criterion(y, y_pred, loss):
    """Evaluate the callable ``loss`` on ``(y, y_pred)`` and return its result."""
    value = loss(y, y_pred)
    return value
class Model(pl.LightningModule):
    """Frame-wise convolutional encoder/decoder applied to two video streams.

    Both input clips are pushed through the same four conv stages, one frame
    at a time (frames are folded into the batch axis), and the two results
    are concatenated along the frame axis.

    Args:
        num: Number of sample indices handed to the ``dataset`` class.
        kernel: Base kernel size; the first and last stages use ``kernel * 3``.
            Paddings are derived from it, so any odd ``kernel`` preserves the
            spatial size (``kernel=3`` gives the former hard-coded 4 and 1).
        num_filters: Channel count of the first and third stage.
        criterion: Callable ``(y, y_pred, loss) -> loss value``.
    """

    def __init__(self, num, kernel, num_filters, criterion = criterion):
        super().__init__()
        wide_kernel = kernel * 3
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, num_filters, kernel_size=wide_kernel,
                      padding=wide_kernel // 2),
            nn.BatchNorm2d(num_filters),
            nn.ReLU(inplace=True))
        self.conv2 = nn.Sequential(
            nn.Conv2d(num_filters, 32, kernel_size=kernel,
                      stride=2, padding=kernel // 2),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True))
        self.tsconv1 = nn.Sequential(
            nn.ConvTranspose2d(32, num_filters, kernel_size=kernel,
                               padding=kernel // 2),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            nn.ReLU(inplace=True),
            nn.BatchNorm2d(num_filters))
        self.tsconv2 = nn.Sequential(
            nn.Conv2d(num_filters, 3, kernel_size=wide_kernel,
                      padding=wide_kernel // 2, bias=True),
            nn.ReLU(inplace=True))
        self.num = num
        self.mseloss = nn.MSELoss(reduction='mean')
        self.criterion = criterion

    def _run_frames(self, clip):
        """Apply the four conv stages to every frame of a 5-D clip.

        Takes ``[batch, frames, C, H, W]`` and returns
        ``[batch, frames, C', H', W']`` with the batch/frame layout intact.
        """
        if clip.dim() != 5:
            raise ValueError(
                f"expected a 5-D [batch, frames, C, H, W] tensor, got {tuple(clip.shape)}")
        n_batch, n_frames = clip.shape[:2]
        # reshape (not view) so non-contiguous inputs are accepted as well.
        frames = clip.reshape(n_batch * n_frames, *clip.shape[2:])
        frames = self.conv1(frames)
        frames = self.conv2(frames)
        frames = self.tsconv1(frames)
        frames = self.tsconv2(frames)
        # Unfold using the actual output dims instead of hard-coded sizes.
        return frames.reshape(n_batch, n_frames, *frames.shape[1:])

    def forward(self, vidx, vidy):
        """Return both processed streams joined along the frame axis.

        The result is ``[batch, frames_x + frames_y, C', H', W']``; the first
        ``frames_x`` entries of axis 1 belong to ``vidx``, the rest to ``vidy``.
        Concatenating on axis 1 (rather than axis 0 followed by a fixed-size
        view) keeps each sample's frames in its own batch row.
        """
        out_x = self._run_frames(vidx)
        out_y = self._run_frames(vidy)
        return torch.cat((out_x, out_y), dim=1)

    def configure_optimizers(self):
        # lr and weight_decay are the module-level hyper-parameters.
        return optim.AdamW(self.parameters(), lr=lr, weight_decay=weight_decay)

    def _mean_stream_loss(self, batch):
        """Average of the MSE criterion of each stream against the target."""
        vidx, vidy, y = batch
        pred = self.forward(vidx, vidy)
        # Split point follows the actual clip length instead of a literal 100.
        n_frames = vidx.size(1)
        loss_x = self.criterion(y, pred[:, :n_frames], self.mseloss)
        loss_y = self.criterion(y, pred[:, n_frames:], self.mseloss)
        return (loss_x + loss_y) / 2

    def training_step(self, batch, batch_nb):
        loss = self._mean_stream_loss(batch)
        self.log('train_loss_mse', loss, on_step=False, on_epoch=True, logger=True)
        return loss

    def validation_step(self, batch, batch_nb):
        loss = self._mean_stream_loss(batch)
        self.log('val_loss_mse', loss, on_step=False, on_epoch=True,sync_dist=True, logger=True)
        return loss

    def train_dataloader(self):
        train_dataset = dataset(self.num, is_train=True)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                  num_workers=num_workers, pin_memory=True)
        return train_loader

    def val_dataloader(self):
        val_dataset = dataset(self.num, is_train=False)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                num_workers=num_workers, pin_memory=True)
        return val_loader
# Build the network from the module-level hyper-parameters.
model = Model(num, kernel, num_filters)

# Trainer options gathered in one mapping so they can be read at a glance.
trainer_options = {
    'max_epochs': epochs,
    'progress_bar_refresh_rate': 1,
    'gpus': 2,
    'distributed_backend': 'ddp',
    'benchmark': True,
    'sync_batchnorm': True,
    'precision': 16,
}
trainer = pl.Trainer(**trainer_options)

# The model supplies its own train/val dataloaders, so none are passed here.
trainer.fit(model)