python - 使用 tensorflow.train.SequenceExample 保存可变二维数据
问题描述
我得到了时间序列数据样本,每个样本包含 3 个时间步长,并且在每个时间步长中,有一个向量包含标量值和可变长度列表。例如
sample_1 = [ [1, 2, [3, 4, 5] ],
[3, 4, [3, 2] ],
[1, 2, [4, 5, 6, 7]]
]
我的样本数据看起来像这样
sample_1 = [ [1, 2, [3, 4, 5] ],
[3, 4, [3, 2] ],
[1, 2, [4, 5, 6, 7]]
]
sample_2 = [ [1, 0, [3, 4] ],
[2, 0, [3, 2, 6]],
[0, 2, [4, 7] ]
]
sample_3 = [ [0, 2, [3, 4, 9, 0 ]],
[2, 3, [3, 2, 9, 1, 0]],
[1, 2, [4] ]
]
sample_data = [sample_1, sample_2, sample_3]
如您所见,每个样本都是一个二维向量,其中包含不同的数据类型(int 或变长列表)。我正在尝试使用 tf.train.SequenceExample 将其保存到 TFRecords,这是我的代码
import tensorflow as tf
# Question listing: tries to store three samples, each with 3 time steps of
# [scalar, scalar, variable-length list], as tf.train.SequenceExample records.
# The listing is kept as asked (it still fails at "dim_2"); only the block
# structure lost in formatting is restored.
sample_1 = [[1, 2, [3, 4, 5]],
            [3, 4, [3, 2]],
            [1, 2, [4, 5, 6, 7]]
            ]
sample_2 = [[1, 0, [3, 4]],
            [2, 0, [3, 2, 6]],
            [0, 2, [4, 7]]
            ]
sample_3 = [[0, 2, [3, 4, 9, 0]],
            [2, 3, [3, 2, 9, 1, 0]],
            [1, 2, [4]]
            ]
sample_data = [sample_1, sample_2, sample_3]
writer = tf.python_io.TFRecordWriter('test.tfrecord')
for i, sample in enumerate(sample_data):
    # Transpose the sample from rows (time steps) to columns (features).
    sample_column_data = []
    # only contains 3 columns
    for _c in range(len(sample[0])):
        _col_data = []
        for _r in range(len(sample)):
            _col_data.append(sample[_r][_c])
        sample_column_data.append(_col_data)
    # Columns 0 and 1 hold one scalar per time step: one Feature per step.
    feature_1 = [
        tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in sample_column_data[0]
    ]
    feature_2 = [
        tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in sample_column_data[1]
    ]
    # Column 2 holds one variable-length list per time step, so this builds a
    # list of lists of Feature rather than a flat list of Feature.
    feature_3 = [
        [
            tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in v_list
        ] for v_list in sample_column_data[2]
    ]
    example = tf.train.SequenceExample(
        context=tf.train.Features(feature={
            "index": tf.train.Feature(int64_list=tf.train.Int64List(value=[i]))
        }),
        # NOTE(review): the working answer passes
        # feature_lists=tf.train.FeatureLists(feature_list=...); this
        # `feature_list=` keyword probably needs the same change - confirm.
        feature_list={
            "dim_0": tf.train.FeatureList(feature=feature_1),
            "dim_1": tf.train.FeatureList(feature=feature_2),
            # The TypeError quoted after this listing is raised here:
            # feature_3 is nested, not a flat sequence of Feature messages.
            "dim_2": tf.train.FeatureList(feature=feature_3)
        }
    )
    serialied = example.SerializeToString()
    writer.write(serialied)
writer.close()
但我收到以下错误
File "/mnt/f/tf_SequenceExample.py", line 54, in <module>
"dim_2": tf.train.FeatureList(feature=feature_3)
TypeError: Parameter to MergeFrom() must be instance of same class: expected tensorflow.Feature got tensorflow.FeatureList.
那么如何保存和读取具有不同长度列表的二维样本呢?
解决方案
我想出了一个很容易理解的变通方法:虽然某些特征是可变长度的列表,但可以把该特征的每一行(每个时间步)分别保存到一个单独命名的 FeatureList 中(dim_2_0、dim_2_1、……),并把每一行的长度分别保存在 context 中(context_dim_2_0、context_dim_2_1、……)。
保存数据的代码
import tensorflow as tf
tf.enable_eager_execution()

# Each sample has 3 time steps (rows); each row is
# [scalar, scalar, variable-length list of ints].
sample_1 = [[1, 2, [3, 4, 5]],
            [3, 4, [3, 2]],
            [1, 2, [4, 5, 6, 7]]
            ]
sample_2 = [[1, 0, [3, 4]],
            [2, 0, [3, 2, 6]],
            [0, 2, [4, 7]]
            ]
sample_3 = [[0, 2, [3, 4, 9, 0]],
            [2, 3, [3, 2, 9, 1, 0]],
            [1, 2, [4]]
            ]
sample_data = [sample_1, sample_2, sample_3]
tfrecords_file = 'test.tfrecords'

# The `with` block closes the writer even if building a record fails.
with tf.io.TFRecordWriter(tfrecords_file) as writer:
    for i, sample in enumerate(sample_data):
        # Transpose rows (time steps) into columns (features); there are
        # only 3 columns.
        sample_column_data = [list(column) for column in zip(*sample)]

        # Scalar columns: one single-value Feature per time step.
        feature_1 = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
            for v in sample_column_data[0]
        ]
        feature_2 = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
            for v in sample_column_data[1]
        ]

        # Variable-length column: a FeatureList cannot hold nested lists, so
        # each row becomes its own flat list of Features, and its length is
        # recorded separately.
        feature_3 = []      # one list of Features per row
        feature_3_len = []  # the corresponding row lengths
        for row in sample_column_data[2]:
            feature_3_len.append(len(row))
            feature_3.append([
                tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))
                for v in row
            ])

        feature_list = {
            'dim_0': tf.train.FeatureList(feature=feature_1),
            'dim_1': tf.train.FeatureList(feature=feature_2)
        }
        context_feature = {
            'index': tf.train.Feature(int64_list=tf.train.Int64List(value=[i]))
        }
        # `step` (not `i`) so the sample index of the outer loop is not
        # shadowed. Row k is stored as 'dim_2_k', its length as
        # 'context_dim_2_k'.
        for step, (_feature, _len) in enumerate(zip(feature_3, feature_3_len)):
            feature_list['dim_2_{}'.format(step)] = tf.train.FeatureList(feature=_feature)
            context_feature['context_dim_2_{}'.format(step)] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[_len]))

        example = tf.train.SequenceExample(
            context=tf.train.Features(feature=context_feature),
            feature_lists=tf.train.FeatureLists(
                feature_list=feature_list)
        )
        serialized = example.SerializeToString()
        writer.write(serialized)
读取数据的代码
import tensorflow as tf
tf.enable_eager_execution()
tfrecords_file = 'test.tfrecords'

# Parsing specs for tf.parse_single_sequence_example. The keys must match the
# names used when the records were written.
context_dict = {'index': tf.FixedLenFeature([], dtype=tf.int64)}
features_dict = {'dim_0': tf.VarLenFeature(dtype=tf.int64),
                 'dim_1': tf.VarLenFeature(dtype=tf.int64)
                 }

# Number of time steps (rows) per sample. One 'dim_2_<k>' feature list and one
# 'context_dim_2_<k>' length entry is declared per time step.
_window_size = 3
for i in range(_window_size):
    features_dict['dim_2_{}'.format(i)] = tf.VarLenFeature(dtype=tf.int64)
    context_dict['context_dim_2_{}'.format(i)] = tf.FixedLenFeature([], dtype=tf.int64)
def parse_tfrecord(example):
    """Parse one serialized SequenceExample into a flat tuple of tensors.

    Uses the module-level ``features_dict``, ``context_dict`` and
    ``_window_size``.

    Args:
        example: a serialized record, as yielded by the TFRecordDataset.

    Returns:
        A tuple laid out as
        ``(index, context_dim_2_0 .. context_dim_2_{w-1},
        dim_0, dim_1, dim_2_0 .. dim_2_{w-1})`` where ``w`` is
        ``_window_size``. The sequence features are converted from sparse to
        dense tensors.
    """
    context, features = tf.parse_single_sequence_example(
        example, sequence_features=features_dict, context_features=context_dict)
    index = context['index']
    context_dim_2 = [
        context['context_dim_2_{}'.format(i)] for i in range(_window_size)
    ]
    # VarLenFeature entries come back as SparseTensors; densify them.
    dim_0 = tf.sparse_tensor_to_dense(features['dim_0'])
    dim_1 = tf.sparse_tensor_to_dense(features['dim_1'])
    dim_2 = [
        tf.sparse_tensor_to_dense(features['dim_2_{}'.format(i)])
        for i in range(_window_size)
    ]
    return (index, *context_dim_2, dim_0, dim_1, *dim_2)
# Build the input pipeline: read the serialized records and parse each one.
dataset = tf.data.TFRecordDataset(tfrecords_file)
dataset = dataset.map(parse_tfrecord)

# With eager execution enabled the dataset can be iterated directly, so every
# record in the file is read. The record count is unrelated to _window_size
# (the number of time steps per sample) and must not be derived from it.
tf_data = list(dataset)
tf.train.SequenceExample 中的 context 参数是 feature_lists 的元数据,用于描述 feature_lists 中的数据,例如每个特征的每一行的维度。
完毕。
推荐阅读
- python - Pandas 聚合 - 如何根据活动日期查找最近的活动
- c# - 使用 Fiddler 和 Web REST API
- node.js - 为什么 Node.js 事件循环在空闲时这么慢?
- python - TensorFlow Data 不适用于多输入 keras 模型
- json - 使用 JQ 处理嵌套对象映射中的空值
- java - 使用带有 JWT 的 Angular 和 Spring Boot 服务的 PUT 请求出现 403 错误
- amazon-web-services - 已验证的 AWS 简单电子邮件服务显示欺骗警告
- mongodb - 使用 SpringBoot 部署到 AWS 时,Mongo NoSuchMethodError insertOne
- python - 如何在 Python 中并行扫描多个超参数集?
- r - 当数据值在指定参数内时 ifelse 测试返回 NA