python - ValueError('需要至少一个数组来堆叠')
问题描述
当我处理视频及其音频时,我遇到了一个错误:
原始回溯(最近一次通话最后): _worker_loop 中的文件“/home/yzx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py”,第 202 行 数据 = fetcher.fetch(index) 文件“/home/yzx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py”,第 44 行,在 fetch data = [self.dataset[idx] for idx in possible_batched_index] 文件“/home/yzx/anaconda3/envs/pytorch/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py”,第 44 行,在 data = [self.dataset[idx] for idx in possible_batched_index] 文件“/home/yzx/lunwen/PseudoBinaural_CVPR2021-master/data/Augment_dataset.py”,第 66 行,在 __getitem__ data_ret, data_ret_sep = self._get_pseudo_item(index) 文件“/home/yzx/lunwen/PseudoBinaural_CVPR2021-master/data/Pseudo_dataset.py”,第 263 行,在 _get_pseudo_item 立体声 = self.construct_stereo_ambi(pst_sources) 文件“/home/yzx/lunwen/PseudoBinaural_CVPR2021-master/data/Pseudo_dataset.py”,第 120 行,在construct_stereo_ambi 信号 = np.stack([src.signal for src in pst_sources], axis=1) # 信号形状:[Len, n_signals] 文件“”,第 5 行,在堆栈中 堆栈中的文件“/home/yzx/anaconda3/envs/pytorch/lib/python3.8/site-packages/numpy/core/shape_base.py”,第 423 行 raise ValueError('需要至少一个数组来堆叠') ValueError:需要至少一个数组来堆叠
下面的list_samlple_file 就是train.txt 的内容。每一行与第一行“/home/yzx/lunwen/datasets/FAIR-PLAY/binaural_audios/000383.wav,/home/yzx/lunwen/datasets/FAIR-PLAY/frames/000383”的格式相同。前一个路径是每个音频的路径,后一个是每个视频的帧的文件夹。我在每10秒的视频中每秒提取10帧。它们在Psedo_dataset.py中拆分为'audio_file'和'img_folder' .我检查了这条路径是真的。但我不知道为什么会产生错误。我需要你的帮助。
clss Augment_dataset.py 的内容是:
class AugmentDataset(StereoDataset, PseudoDataset):
def __init__(self, opt, list_sample_file):
self.opt = opt
PseudoDataset.__init__(self, opt, list_sample_file)
StereoDataset.__init__(self, opt)
if "MUSIC" in self.opt.datalist:
dup_times = 1
print("dup_times:", dup_times)
else:
dup_times = 2
print("dup_times:", dup_times)
self.total_samples *= dup_times # in order to align with the length of FAIR-Play dataset
random.shuffle(self.audios)
random.shuffle(self.total_samples)
def __getitem__(self, index):
if random.uniform(0,1) < self.opt.pseudo_ratio:
data_ret, data_ret_sep = self._get_pseudo_item(index)
else:
data_ret = self._get_stereo_item(index)
_, data_ret_sep = self._get_pseudo_item(index)
data_ret.update(data_ret_sep)
return data_ret
def __len__(self):
return min(len(self.audios), len(self.total_samples))
def name(self):
return 'AugmentDataset'
Psedo_dataset.py 的内容是:
def audio_normalize(samples, desired_rms = 0.1, eps = 1e-4):
rms = np.maximum(eps, np.sqrt(np.mean(samples**2)))
samples = samples * (desired_rms / rms)
return rms / desired_rms, samples
def generate_spectrogram(audio):
spectro = librosa.core.stft(audio, n_fft=512, hop_length=160, win_length=400, center=True)
real = np.expand_dims(np.real(spectro), axis=0)
imag = np.expand_dims(np.imag(spectro), axis=0)
spectro_two_channel = np.concatenate((real, imag), axis=0)
return spectro_two_channel
def process_image(image, augment, square=False):
if square:
iH, iW = 240, 240
H, W = 224, 224
else:
iH, iW = 240, 480
H, W = 224, 448
image = mmcv.imresize(image, (iW,iH))
h,w,_ = image.shape
w_offset = w - W
h_offset = h - H
left = random.randrange(0, w_offset + 1)
upper = random.randrange(0, h_offset + 1)
image = mmcv.imcrop(image, np.array([left, upper, left+W-1, upper+H-1]))
if augment:
enhancer = ImageEnhance.Brightness(Image.fromarray(image))
image = enhancer.enhance(random.random()*0.6 + 0.7)
enhancer = ImageEnhance.Color(image)
image = enhancer.enhance(random.random()*0.6 + 0.7)
return image
class PseudoDataset(data.Dataset):
def __init__(self, opt, list_sample_file):
super().__init__()
self.opt = opt
self.total_samples = mmcv.list_from_file(list_sample_file)
normalize = transforms.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225]
)
vision_transform_list = [transforms.ToTensor(), normalize]
self.vision_transform = transforms.Compose(vision_transform_list)
random.seed(self.opt.seed)
# load background, just one large-sizeimage
self.bkg_img = mmcv.imread('./data/bkg.png')
# build binauralizer
hrtf_dir = "./data/subject03"
# binauralizer = SourceBinauralizer(use_hrtfs=True, cipic_dir=hrtf_dir)
self.hrir_db = CIPIC_HRIR(hrtf_dir)
# encode to ambisonics, and then stereo
speakers_phi = (2. * np.arange(2*4) / float(2*4) - 1.) * np.pi
self.speakers_pos = [Position(phi, 0, 1, 'polar') for phi in speakers_phi]
self.sph_mat = spherical_harmonics_matrix(self.speakers_pos, max_order=1) # shape: [N_array_speakers, 4]
# parameter of loading audio
self.exp_audio_len = int(self.opt.audio_length * self.opt.audio_sampling_rate)
#print("Now load {} box info".format(self.opt.mode))
#self.box_info = mmcv.load('./dataset/ASMR/scripts/results/{}_box_info.pkl'.format(self.opt.mode))
if opt.fov == '1/3':
self.fov = 1/3.
elif opt.fov == '1/2':
self.fov = 1/2.
elif opt.fov == '5/6':
self.fov = 5/6.
elif opt.fov == '1':
self.fov = 1.
else:
self.fov = 2/3.
#self.categories = ['acoustic_guitar', 'banjo', 'bass', 'cello', 'drum', 'harp', 'piano', 'trumpet', 'ukelele']
def construct_stereo_direct(self, pst_sources):
stereo = np.zeros((2, self.exp_audio_len))
for src in pst_sources:
left_hrir, right_hrir = self.hrir_db.get_closest(src.position)[1:]
left_signal = np.convolve(src.signal, np.flip(left_hrir, axis=0), 'valid')
right_signal = np.convolve(src.signal, np.flip(right_hrir, axis=0), 'valid')
n_valid, i_start = left_signal.shape[0], left_hrir.shape[0] - 1
stereo[0, i_start:(i_start + n_valid)] += left_signal
stereo[1, i_start:(i_start + n_valid)] += right_signal
return stereo
def construct_stereo_ambi(self, pst_sources):
# encode to ambisonics
Y = spherical_harmonics_matrix([src.position for src in pst_sources], max_order=1) # Y shape: [n_signals, 4]
signals = np.stack([src.signal for src in pst_sources], axis=1) # signals shape: [Len, n_signals]
ambisonic = np.dot(signals, Y) # shape: [Len, 4]
array_speakers_sound = np.dot(ambisonic, self.sph_mat.T)
#array_speakers_sound = np.dot(ambisonic, np.linalg.pinv(self.sph_mat))
array_sources = [PositionalSource(array_speakers_sound[:, i], speaker_pos, \
self.opt.audio_sampling_rate) for i, speaker_pos in enumerate(self.speakers_pos)]
return self.construct_stereo_direct(array_sources)
def construct_stereo_ambi_direct(self, pst_sources):
# encode to ambisonics
Y = spherical_harmonics_matrix([src.position for src in pst_sources], max_order=1)
signals = np.stack([src.signal for src in pst_sources], axis=1)
ambisonic = np.dot(signals, Y) # shape: [Len, 4]
stereo = np.stack((
ambisonic[:, 0] / 2 + ambisonic[:, 1] / 2,
ambisonic[:, 0] / 2 - ambisonic[:, 1] / 2
))
return stereo
def _get_pseudo_item(self, index):
# ensure the number of audios in a scene
N = np.random.choice([1,2,3], p=[0.4, 0.5, 0.1])
chosen_samples = [self.total_samples[index]]
# avoid repeat sample
for _ in range(1, N):
while True:
new_sample = random.choice(self.total_samples)
if new_sample not in chosen_samples:
chosen_samples.append(new_sample)
break
audio_margin = 0
init_H = 360
init_W = 640
pst_sources = []
if self.opt.not_use_background:
cur_bkg_img = np.zeros((init_H, init_W, 3)).astype(np.uint8)
else:
# crop background img, exp_shape: [init_H, init_W, 3]
bkg_start_x = np.random.randint(low=0, high=self.bkg_img.shape[1] - init_W)
bkg_start_y = np.random.randint(low=0, high=self.bkg_img.shape[0] - init_H)
cur_bkg_img = mmcv.imcrop(self.bkg_img.copy(),
np.array([bkg_start_x, bkg_start_y, bkg_start_x+init_W-1, bkg_start_y+init_H-1]))
#H_bkg, W_bkg, _ = cur_bkg_img.shape
corner_record = []
patch_size_record = []
center_x_record = []
audio_list = []
patch_list = []
actual_N = 0
#load audio
for idx, chosen_sample in enumerate(chosen_samples):
audio_file, img_folder = chosen_sample.split(',')
# audio part
audio, audio_rate = librosa.load(audio_file, sr=self.opt.audio_sampling_rate, mono=True)
#randomly get a start time for the audio segment from the original clip
audio_len = len(audio) / audio_rate
assert audio_len - self.opt.audio_length - audio_margin > audio_margin
audio_start_time = random.uniform(audio_margin, audio_len - self.opt.audio_length - audio_margin)
audio_end_time = audio_start_time + self.opt.audio_length
audio_start = int(audio_start_time * self.opt.audio_sampling_rate)
audio_end = audio_start + self.exp_audio_len
audio = audio[audio_start:audio_end]
if self.opt.audio_normal:
normalizer0, audio = audio_normalize(audio)
# video part
# load img **patches**, copy bkg_img and construct a new image
# load accurate frame
cur_img_list = natsort.natsorted(glob(osp.join(img_folder, '*.jpg')))
# get the closest frame to the audio segment
frame_idx = (audio_start_time + audio_end_time) / 2 * 10
frame_idx = int(np.clip(frame_idx, 0, len(cur_img_list) - 1))
img_file = cur_img_list[frame_idx]
img_patch = mmcv.imread(img_file)
if self.opt.patch_resize:
h_patch, w_patch, _= img_patch.shape
resize_ratio = min(1/normalizer0, init_H / h_patch, init_W / 2 / w_patch)
img_patch = mmcv.imrescale(img_patch, resize_ratio * random.uniform(0.8, 1))
H_new, W_new, _ = img_patch.shape
# just consider the overlap in the horizontal axis
occupy_matrix = np.ones((init_W))
# avoid cross border in x dim
occupy_matrix[:(-W_new + 1)] = 0
# avoid overlap
for last_corner_x, W_last in zip(corner_record, patch_size_record):
occupy_x = max(0, last_corner_x - W_new)
occupy_matrix[occupy_x : last_corner_x + W_last] = 1
# random sample position for this mono audio
free_x_positions = np.where(occupy_matrix == 0)[0]
if len(free_x_positions) < 2:
break
actual_N += 1
corner_x = random.choice(free_x_positions)
corner_record.append(corner_x)
patch_size_record.append(W_new)
corner_y = random.randint(0, init_H - H_new)
center_y = corner_y + H_new // 2
center_x = corner_x + W_new // 2
center_x_record.append(center_x)
azimuth = (init_W // 2 - center_x) / init_W * pi * self.fov
elevation = (init_H // 2 - center_y) / init_H * pi / 2
if self.opt.visualize_data:
output_dir = 'others/dataset_visual/{:d}_{:d}'.format(N, index)
if not osp.exists(output_dir):
os.mkdir(output_dir)
if librosa.__version__ >= '0.8.0':
import soundfile as sf
sf.write(osp.join(output_dir, '{:d}.wav'.format(idx)), audio.transpose(), audio_rate)
else:
librosa.output.write_wav(osp.join(output_dir, '{:d}.wav'.format(idx)), audio, sr=audio_rate)
audio_list.append(audio)
patch_list.append(img_file)
pst_sources.append(PositionalSource(audio, Position(azimuth, elevation, 3, 'polar'), audio_rate))
pdb.set_trace()
if self.opt.blending:
center = (center_x, center_y)
mask = 255 * np.ones(img_patch.shape, img_patch.dtype)
cur_bkg_img = cv2.seamlessClone(img_patch, cur_bkg_img, mask, center, cv2.NORMAL_CLONE)
else:
patch_in_start_x = corner_x
patch_in_start_y = corner_y
assert patch_in_start_x >= 0
assert patch_in_start_y >= 0
cur_bkg_img[patch_in_start_y : patch_in_start_y + H_new, patch_in_start_x : patch_in_start_x + W_new] = img_patch
if self.opt.stereo_mode == 'direct':
#print("use direct")
stereo = self.construct_stereo_direct(pst_sources)
elif self.opt.stereo_mode == 'ambisonic':
#print("use ambisonic")
stereo = self.construct_stereo_ambi(pst_sources)
elif self.opt.stereo_mode == 'ambidirect':
#print("use ambidirect")
stereo = self.construct_stereo_ambi_direct(pst_sources)
else:
raise ValueError("please choose right stereo mode")
normalizer, _ = audio_normalize(stereo[0] + stereo[1])
stereo = stereo / normalizer
audio_channel1, audio_channel2 = stereo
frame = cur_bkg_img
if self.opt.visualize_data:
output_dir = 'others/dataset_visual/{:d}_{:d}'.format(N, index)
if librosa.__version__ >= '0.8.0':
import soundfile as sf
sf.write(osp.join(output_dir, 'input_binaural.wav'), stereo.transpose(), audio_rate)
else:
librosa.output.write_wav(osp.join(output_dir, 'input_binaural.wav'), stereo, sr=audio_rate)
mmcv.imwrite(frame, osp.join(output_dir, 'reference.jpg'))
frame = process_image(frame, self.opt.enable_data_augmentation)
frame = self.vision_transform(frame)
#passing the spectrogram of the difference
audio_diff_spec = torch.FloatTensor(generate_spectrogram(audio_channel1 - audio_channel2))
audio_mix_spec = torch.FloatTensor(generate_spectrogram(audio_channel1 + audio_channel2))
data_ret = {'frame': frame, 'audio_diff_spec':audio_diff_spec, 'audio_mix_spec':audio_mix_spec}
# incorporate separation part
assert len(patch_list) == len(audio_list)
left_channel = np.zeros(self.exp_audio_len).astype(np.float32)
right_channel = np.zeros(self.exp_audio_len).astype(np.float32)
if len(audio_list) >= 2:
for cur_audio, center_x in zip(audio_list, center_x_record):
if center_x < init_W // 3:
left_channel += cur_audio
elif center_x > init_W // 3:
right_channel += cur_audio
else:
left_channel += cur_audio
right_channel += cur_audio
_, left_channel = audio_normalize(left_channel)
_, right_channel = audio_normalize(right_channel)
else:
left_channel = audio_channel1
right_channel = audio_channel2
if self.opt.visualize_data:
output_dir = 'others/dataset_visual/{:d}_{:d}'.format(N, index)
if librosa.__version__ >= '0.8.0':
import soundfile as sf
sf.write(osp.join(output_dir, 'gt_left.wav'), left_channel, audio_rate)
sf.write(osp.join(output_dir, 'gt_right.wav'), right_channel, audio_rate)
else:
librosa.output.write_wav(osp.join(output_dir, 'gt_left.wav'), left_channel, audio_rate)
librosa.output.write_wav(osp.join(output_dir, 'gt_right.wav'), right_channel, audio_rate)
sep_mix_spec = audio_mix_spec
sep_diff_spec = torch.FloatTensor(generate_spectrogram(left_channel - right_channel))
frame_sep_list = frame
if self.opt.mode == 'train':
data_ret_sep = {'frame_sep': frame_sep_list, 'sep_diff_spec': sep_diff_spec, 'sep_mix_spec': sep_mix_spec}
else:
data_ret_sep = {'frame_sep': frame_sep_list, 'sep_diff_spec': sep_diff_spec, 'sep_mix_spec': sep_mix_spec, 'left_audio': left_channel, 'right_audio': right_channel}
return data_ret, data_ret_sep
def __getitem__(self, index):
data_ret, data_ret_sep = self._get_pseudo_item(index)
if self.opt.dataset_mode == 'Pseudo_stereo':
return data_ret
elif self.opt.dataset_mode == 'Pseudo_sep':
return data_ret_sep
else:
data_ret.update(data_ret_sep)
return data_ret
def __len__(self):
return len(self.total_samples)
def name(self):
return 'PseudoDataset'
def initialize(self, opt):
pass
解决方案
你的问题在这里:
signals = np.stack([src.signal for src in pst_sources], axis=1) # signals shape: [Len, n_signals]
它看起来pst_sources
是空的,因此您正在尝试堆叠一个空列表。
推荐阅读
- azure - 为 Azure 资源编写脚本的最广泛的方法是什么?
- html - 简单的 CSS 动画在 Safari 中不流畅
- content-management-system - 在 Ghost CMS 中创建多语言博客
- java - 模拟间谍可以返回存根值吗?
- ruby - 为什么 `1i.to_i` 会引发 `RangeError` 而不是 `NoMethodError`?
- java - 如何在 java 中使用接受扩展接口类型的参数的函数创建接口?
- ios - PKPaymentAuthorizationController 委托未调用
- python - 正则表达式循环在 60 次迭代后停止
- angular - 角。数字证书
- acumatica - 页面加载后按条件重定向 Acumatica ERP