首页 > 解决方案 > 使用 tensorflow.train.SequenceExample 保存可变二维数据

问题描述

我得到了时间序列数据样本,每个样本包含 3 个时间步长,并且在每个时间步长中,有一个向量包含标量值和可变长度列表。例如

# One sample: three time steps, each row is [scalar, scalar, variable-length list].
sample_1 = [
    [1, 2, [3, 4, 5]],
    [3, 4, [3, 2]],
    [1, 2, [4, 5, 6, 7]],
]

我的样本数据看起来像这样

# Three samples. Every sample has three time steps, and every time step is
# [scalar, scalar, variable-length list].
sample_1 = [
    [1, 2, [3, 4, 5]],
    [3, 4, [3, 2]],
    [1, 2, [4, 5, 6, 7]],
]

sample_2 = [
    [1, 0, [3, 4]],
    [2, 0, [3, 2, 6]],
    [0, 2, [4, 7]],
]

sample_3 = [
    [0, 2, [3, 4, 9, 0]],
    [2, 3, [3, 2, 9, 1, 0]],
    [1, 2, [4]],
]

sample_data = [sample_1, sample_2, sample_3]

如您所见,每个样本都是一个二维向量,其中包含不同的数据类型(int 或变长列表)。我正在尝试使用 tf.train.SequenceExample 将其保存到 TFRecords,这是我的代码

import tensorflow as tf


# Toy input: three samples, each with three time steps of
# [scalar, scalar, variable-length list].
sample_1 = [
    [1, 2, [3, 4, 5]],
    [3, 4, [3, 2]],
    [1, 2, [4, 5, 6, 7]],
]

sample_2 = [
    [1, 0, [3, 4]],
    [2, 0, [3, 2, 6]],
    [0, 2, [4, 7]],
]

sample_3 = [
    [0, 2, [3, 4, 9, 0]],
    [2, 3, [3, 2, 9, 1, 0]],
    [1, 2, [4]],
]

sample_data = [sample_1, sample_2, sample_3]


# Attempt to serialize every sample as one tf.train.SequenceExample record in
# 'test.tfrecord'.  NOTE: this is the failing version of the script; it is kept
# as-is because the TypeError it raises is the subject of the question.
writer = tf.python_io.TFRecordWriter('test.tfrecord')

for i, sample in enumerate(sample_data):

    # Transpose the sample from rows (time steps) to columns, so that
    # sample_column_data[c] holds column c across all time steps.
    sample_column_data = []
    # Each row has exactly 3 columns: two scalars and one variable-length list.
    for _c in range(len(sample[0])):
        _col_data = []
        for _r in range(len(sample)):
            _col_data.append(sample[_r][_c])
        sample_column_data.append(_col_data)

    # Column 0: one single-value int64 Feature per time step.
    feature_1 = [
        tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in sample_column_data[0]
    ]

    # Column 1: one single-value int64 Feature per time step.
    feature_2 = [
        tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in sample_column_data[1]
    ]

    # Column 2: every time step holds a list, so this builds a list of lists
    # of Feature (one inner list per time step) rather than a flat list.
    feature_3 = [
        [
            tf.train.Feature(int64_list=tf.train.Int64List(value=[v])) for v in v_list
        ] for v_list in sample_column_data[2]
    ]
    # NOTE(review): the working version of this script passes
    # `feature_lists=tf.train.FeatureLists(feature_list=...)` rather than a
    # plain dict via `feature_list=` -- confirm which keyword is accepted.
    example = tf.train.SequenceExample(
        context=tf.train.Features(feature={
            # Position of the sample inside sample_data.
            "index": tf.train.Feature(int64_list=tf.train.Int64List(value=[i]))
        }),
        feature_list={
            "dim_0": tf.train.FeatureList(feature=feature_1),
            "dim_1": tf.train.FeatureList(feature=feature_2),
            # This is the line reported in the TypeError for this script:
            # feature_3 is nested, but `feature=` is given a list whose
            # elements are lists instead of tf.train.Feature messages.
            "dim_2": tf.train.FeatureList(feature=feature_3)
        }
    )
    serialied = example.SerializeToString()
    writer.write(serialied)

writer.close()

但我收到以下错误

File "/mnt/f/tf_SequenceExample.py", line 54, in <module>
    "dim_2": tf.train.FeatureList(feature=feature_3)
TypeError: Parameter to MergeFrom() must be instance of same class: expected tensorflow.Feature got tensorflow.FeatureList.

那么如何保存和读取具有不同长度列表的二维样本呢?

标签: python tensorflow

解决方案


我想出了一个很容易理解的变通办法:虽然某些特征是可变长度的列表,但可以把该特征的每一行(每个时间步)分别保存为一个单独命名的 FeatureList,并把每一行的长度分别保存在 context(上下文)中。

保存数据的代码

import tensorflow as tf
tf.enable_eager_execution()

# Toy input: three samples, each with three time steps of
# [scalar, scalar, variable-length list].
sample_1 = [
    [1, 2, [3, 4, 5]],
    [3, 4, [3, 2]],
    [1, 2, [4, 5, 6, 7]],
]
sample_2 = [
    [1, 0, [3, 4]],
    [2, 0, [3, 2, 6]],
    [0, 2, [4, 7]],
]
sample_3 = [
    [0, 2, [3, 4, 9, 0]],
    [2, 3, [3, 2, 9, 1, 0]],
    [1, 2, [4]],
]
sample_data = [sample_1, sample_2, sample_3]


tfrecords_file = 'test.tfrecords'


def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature holding an Int64List."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


# Write one tf.train.SequenceExample per sample.
#   * The two scalar columns go into the 'dim_0' and 'dim_1' feature lists.
#   * Each row of the variable-length column gets its own feature list
#     'dim_2_<row>', and that row's length is stored in the context under
#     'context_dim_2_<row>'.
# The `with` block closes the writer even if building or serializing an
# example raises.
with tf.io.TFRecordWriter(tfrecords_file) as writer:
    for i, sample in enumerate(sample_data):
        # Transpose rows (time steps) into columns; only columns 0-2 are used.
        sample_column_data = list(zip(*sample))

        feature_list = {
            'dim_0': tf.train.FeatureList(
                feature=[_int64_feature(v) for v in sample_column_data[0]]),
            'dim_1': tf.train.FeatureList(
                feature=[_int64_feature(v) for v in sample_column_data[1]]),
        }

        # `i` is the position of the sample inside sample_data.
        context_feature = {'index': _int64_feature(i)}

        # `row_idx` is kept distinct from the sample index `i` so the inner
        # loop does not clobber the outer loop variable.
        for row_idx, row in enumerate(sample_column_data[2]):
            feature_list['dim_2_{}'.format(row_idx)] = tf.train.FeatureList(
                feature=[_int64_feature(v) for v in row])
            context_feature['context_dim_2_{}'.format(row_idx)] = _int64_feature(len(row))

        example = tf.train.SequenceExample(
            context=tf.train.Features(feature=context_feature),
            feature_lists=tf.train.FeatureLists(feature_list=feature_list),
        )
        writer.write(example.SerializeToString())

读取数据的代码

import tensorflow as tf
tf.enable_eager_execution()

tfrecords_file = 'test.tfrecords'

# Number of time steps per sample; one 'dim_2_<k>' feature list and one
# 'context_dim_2_<k>' context entry are declared for each of them.
_window_size = 3

# Parsing spec for the context part of each SequenceExample.
context_dict = {}
context_dict['index'] = tf.FixedLenFeature([], dtype=tf.int64)

# Parsing spec for the feature lists of each SequenceExample.
features_dict = {}
features_dict['dim_0'] = tf.VarLenFeature(dtype=tf.int64)
features_dict['dim_1'] = tf.VarLenFeature(dtype=tf.int64)

for i in range(_window_size):
    context_dict['context_dim_2_{}'.format(str(i))] = tf.FixedLenFeature([], dtype=tf.int64)
    features_dict['dim_2_{}'.format(str(i))] = tf.VarLenFeature(dtype=tf.int64)

def parse_tfrecord(example):
    """Parse one serialized SequenceExample into a flat tuple of tensors.

    The record is parsed with the module-level `context_dict` and
    `features_dict` specs.  The returned tuple is laid out as
    (index, context_dim_2_0 .. context_dim_2_{w-1}, dim_0, dim_1,
    dim_2_0 .. dim_2_{w-1}) where w is `_window_size`; every sparse
    feature list is converted to a dense tensor.
    """
    context, features = tf.parse_single_sequence_example(
        example, sequence_features=features_dict, context_features=context_dict)

    index = context['index']

    # Per-row lengths of the variable-length column, as stored in the context.
    row_lengths = []
    for step in range(_window_size):
        row_lengths.append(context['context_dim_2_{}'.format(str(step))])

    dim_0 = tf.sparse_tensor_to_dense(features['dim_0'])
    dim_1 = tf.sparse_tensor_to_dense(features['dim_1'])

    # One dense tensor per row of the variable-length column.
    dense_rows = []
    for step in range(_window_size):
        sparse_row = features['dim_2_{}'.format(str(step))]
        dense_rows.append(tf.sparse_tensor_to_dense(sparse_row))

    return (index,) + tuple(row_lengths) + (dim_0, dim_1) + tuple(dense_rows)

# Build the input pipeline: read raw records from the TFRecord file and parse
# each one with parse_tfrecord.
Dataset = tf.data.TFRecordDataset(tfrecords_file)
Dataset = Dataset.map(parse_tfrecord)
iterator = Dataset.make_one_shot_iterator()

# Read every record until the iterator is exhausted.  The number of records is
# independent of `_window_size` (the number of time steps per sample), so it
# must not be used as the read count.  This relies on eager execution being
# enabled, which lets the one-shot iterator be consumed as a regular Python
# iterator.
tf_data = list(iterator)

tf.train.SequenceExample 中的 context 参数是 feature_lists 的元数据,用于描述 feature_lists 中的数据,例如每个特征的每一行的维度。

完毕。


推荐阅读