python - 如何为 tensorflow 服务的导出 tensorflow 2.0 keras 模型的输入层设计/预处理特征
问题描述
我使用 TensorFlow-2.0-beta1 创建了一个模型。该模型使用 Keras 函数式 API(Functional API)对输入数据执行回归。数据需要对分类特征进行独热编码(one-hot encoding),并对数值输入进行归一化。过去使用 TF1.11 的 Estimators API 时,可以通过使用特征列,并在 ServingInputReceiver 中对特征应用特征工程来解决这个问题。从 Keras 导出模型时,有没有办法做类似的事情?
import datetime
import os
import pickle

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
# Reset Keras' global state (for easy reset of notebook state).
tf.keras.backend.clear_session()

VERSION = tf.__version__
CWD = os.getcwd()
PARENT_DIR = os.path.split(CWD)[0]
DATETIME = datetime.datetime.utcnow()

# Input data lives in a `data` directory next to the current working directory.
DATA_DIR = os.path.join(PARENT_DIR, 'data')
train_file_path = os.path.join(DATA_DIR, 'traindf.csv')
test_file_path = os.path.join(DATA_DIR, 'testdf.csv')

# Pickled preprocessing artefacts. `preprocess` indexes CATEGORIES by feature
# name to get the category values, and NUMERICSTATS by feature name to get a
# mapping with 'mean' and 'std' entries.
# `with` guarantees the file handles are closed even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that come from a trusted source.
with open(os.path.join(DATA_DIR, "CATEGORIES"), 'rb') as categories_file:
    CATEGORIES = pickle.load(categories_file)

with open(os.path.join(DATA_DIR, "NUMERICSTATS"), 'rb') as stats_file:
    NUMERICSTATS = pickle.load(stats_file)
# Column names are taken from the header row of the training CSV.
with open(train_file_path, 'r') as f:
    names_row = f.readline()
CSV_COLUMNS = names_row.rstrip('\n').split(',')
print(CSV_COLUMNS)

# Columns that are neither features nor the label.
drop_columns = ['SubSilo', 'Year', 'StockID', 'QuickRef', 'sumUKQuantity', 'sumNonUKQuantity']
columns_to_use = [name for name in CSV_COLUMNS if name not in drop_columns]
# Bare expression: no effect in a script; presumably left over from a
# notebook cell where it displayed the list.
columns_to_use

LABEL_COLUMN = 'totalqty'
FEATURE_COLUMNS = [name for name in columns_to_use if name != LABEL_COLUMN]
# NOTE(review): `testdf` is not defined in the visible code -- presumably a
# DataFrame loaded from test_file_path; confirm it exists before this line.
test_labels = testdf[LABEL_COLUMN]
# Per-column dtypes handed to make_csv_dataset as `column_defaults`
# (see get_dataset). One entry per selected column; the trailing comment
# names the column each entry is meant for.
COLUMN_DEFAULTS = [
    tf.dtypes.string,   # ProductBrand
    tf.dtypes.string,   # Department
    tf.dtypes.string,   # ProductType
    tf.dtypes.string,   # ProductSubType
    tf.dtypes.string,   # Silo
    tf.dtypes.string,   # Level
    tf.dtypes.string,   # BaseColour
    tf.dtypes.string,   # Sport
    tf.dtypes.string,   # UKSize
    tf.dtypes.float32,  # UnitCostPrice
    tf.dtypes.float32,  # ExVatSalesValue
    tf.dtypes.float32,  # RRP_GBP
    tf.dtypes.string,   # Week
    tf.dtypes.int32,    # totalqty
]
def get_dataset(file_path):
    """Build a batched CSV dataset for the file at ``file_path``.

    Args:
        file_path: Path handed to ``tf.data.experimental.make_csv_dataset``.

    Returns:
        The dataset created by ``make_csv_dataset``: batches of 60 rows,
        one epoch, unshuffled, restricted to ``columns_to_use`` with
        ``LABEL_COLUMN`` split off as the label.
    """
    csv_options = dict(
        batch_size=60,  # Artificially small to make examples easier to show.
        label_name=LABEL_COLUMN,
        select_columns=columns_to_use,
        column_defaults=COLUMN_DEFAULTS,
        num_epochs=1,
        ignore_errors=True,
        shuffle=False,
    )
    return tf.data.experimental.make_csv_dataset(file_path, **csv_options)
# Datasets read straight from the CSV files, before any preprocessing.
raw_train_data, raw_test_data = map(get_dataset, (train_file_path, test_file_path))
def process_categorical_data(data, categories):
    """Return a one-hot encoded float tensor for a batch of string values.

    Args:
        data: String tensor of categorical values.
        categories: Category labels each value is compared against
            (assumed to be a 1-D sequence of strings -- TODO confirm).

    Returns:
        A ``tf.float32`` tensor holding 1.0 where a value equals a category
        label and 0.0 elsewhere.
    """
    # Drop one leading space, then one trailing period.
    cleaned = tf.strings.regex_replace(data, '^ ', '')
    cleaned = tf.strings.regex_replace(cleaned, r'\.$', '')
    # Turn the values into a column so the comparison below broadcasts each
    # value against every category label.
    as_column = tf.reshape(cleaned, [-1, 1])
    matches = tf.equal(categories, as_column)
    return tf.cast(matches, tf.float32)
def process_continuous_data(data, mean, std):
    """Normalize numeric values and return them as a single column.

    Args:
        data: Numeric tensor; cast to ``tf.float32`` before normalizing.
        mean: Value subtracted from every element.
        std: Value every centered element is divided by.

    Returns:
        The normalized values reshaped to ``[-1, 1]``.
    """
    as_float = tf.cast(data, tf.float32)
    normalized = (as_float - mean) / std
    return tf.reshape(normalized, [-1, 1])
def preprocess(features, labels):
    """Encode every feature and merge them into one tensor.

    Args:
        features: Mapping from feature name to tensor; entries are
            overwritten in place with their encoded form.
        labels: Label tensor, returned unchanged.

    Returns:
        A ``(features, labels)`` tuple where ``features`` is the
        concatenation, along axis 1, of the encoded columns in
        ``FEATURE_COLUMNS`` order.
    """
    # Categorical features become one-hot columns.
    for name, category_values in CATEGORIES.items():
        features[name] = process_categorical_data(features[name], category_values)

    # Continuous features are normalized with their stored mean and std.
    for name, stats in NUMERICSTATS.items():
        features[name] = process_continuous_data(
            features[name], stats['mean'], stats['std'])

    # Assemble all encoded columns into a single tensor.
    encoded_columns = [features[name] for name in FEATURE_COLUMNS]
    features = tf.concat(encoded_columns, 1)
    return features, labels
# NOTE(review): `traindf` is not defined in the visible code -- presumably a
# DataFrame loaded from train_file_path; confirm it exists before this line.
train_data = raw_train_data.map(preprocess)
train_data = train_data.shuffle(len(traindf))
test_data = raw_test_data.map(preprocess)
def get_model(input_dim):
    """Build the regression network with the Keras functional API.

    The network is four ReLU ``Dense`` layers (244, 200, 100 and 50 units,
    each with L2(0.001) kernel regularization and followed by
    ``Dropout(0.5)``) and a final single-unit ``Dense`` output layer.

    Args:
        input_dim: (int) The shape of an item in a batch.

    Returns:
        A Keras model.
    """
    inputs = tf.keras.Input(shape=(input_dim,))
    hidden = inputs
    for units in (244, 200, 100, 50):
        hidden = tf.keras.layers.Dense(
            units,
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.001),
        )(hidden)
        hidden = tf.keras.layers.Dropout(0.5)(hidden)
    outputs = tf.keras.layers.Dense(1)(hidden)
    return tf.keras.Model(inputs, outputs)
# The model's input width is read from the preprocessed dataset's shapes.
input_shape, output_shape = train_data.output_shapes
input_dimension = input_shape.dims[1]  # [0] is the batch size
model = get_model(input_dimension)

optimizer = tf.keras.optimizers.Adam(0.001)
model.compile(
    loss='mse',
    optimizer=optimizer,
    metrics=['mae', 'mse', tf.keras.metrics.RootMeanSquaredError()],
)
# The patience parameter is the amount of epochs to check for improvement
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
)
# Display training progress by printing a single dot for each completed epoch
class PrintDot(tf.keras.callbacks.Callback):
    """Keras callback that prints one '.' per completed epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print a progress dot at the end of ``epoch``.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metrics passed by Keras; unused. Defaults to ``None`` so
                the signature matches the base ``Callback`` hook.
        """
        # Start a new line every 100 epochs (including epoch 0).
        if epoch % 100 == 0:
            print('')
        print('.', end='')
# TensorBoard logs are written next to the data directory.
tensor_board = tf.keras.callbacks.TensorBoard(
    log_dir=os.path.join(PARENT_DIR, 'tensorBoardLogs'),
)
# Multiply the learning rate by 0.2 after 4 epochs without val_loss
# improvement, never going below 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=4,
    verbose=1,
    min_lr=0.00001,
)
# Train for up to 100 epochs, validating on the test dataset.
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=100,
    verbose=1,
    callbacks=[early_stop, PrintDot(), tensor_board, reduce_lr],
)

# Export the trained model as version `1` under PARENT_DIR/models.
# NOTE(review): this is an experimental API; confirm it is still available
# in the installed TensorFlow version.
tf.keras.experimental.export_saved_model(
    model,
    saved_model_path=os.path.join(PARENT_DIR, 'models/1'),
)
我想要的是一个可以直接用 TensorFlow Serving 提供服务的模型:它接收训练数据中的 13 个特征,并在模型内部对它们进行预处理。这样就不需要使用 Flask 之类的东西作为中间层。
解决方案
您可以考虑使用 Tensorflow Transform,它在 Serving 期间应用的转换与您在 Training 期间应用的转换相同。
您可以使用以下代码替换您的函数 process_categorical_data、process_continuous_data 和 preprocess:
def preprocessing_fn(inputs):
    """Map the raw input columns to their transformed columns.

    Args:
        inputs: Mapping that provides the raw columns ``'x'``, ``'y'``
            and ``'s'``.

    Returns:
        A dict with the mean-centered ``x``, ``y`` scaled to [0, 1], the
        vocabulary index of ``s``, and the product of the first two.
    """
    x, y, s = inputs['x'], inputs['y'], inputs['s']

    x_centered = x - tft.mean(x)
    y_normalized = tft.scale_to_0_1(y)
    s_integerized = tft.compute_and_apply_vocabulary(s)
    x_centered_times_y_normalized = x_centered * y_normalized

    return dict([
        ('x_centered', x_centered),
        ('y_normalized', y_normalized),
        ('s_integerized', s_integerized),
        ('x_centered_times_y_normalized', x_centered_times_y_normalized),
    ])
# Ignore the warnings
# Analyze and transform the raw dataset inside a Beam context that uses a
# fresh temporary directory.
with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
        (raw_data, raw_data_metadata)
        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    )

transformed_data, transformed_metadata = transformed_dataset

# Show the data before and after the transformation.
print('\nRaw data:\n{}\n'.format(pprint.pformat(raw_data)))
print('Transformed data:\n{}'.format(pprint.pformat(transformed_data)))
推荐阅读
- java - 无法阻止 BufferReader 在控制台中接受输入
- docker - 带有 Elixir CircleCI 配置的纱线:找不到 package.json 文件
- c# - 如何在 .NET Core 中裁剪图像?
- python - “TypeError:join() 参数必须是 str 或 bytes,而不是 'NoneType'”,使用 Magpie+Tensorflow/Python3
- python - 两台电脑之间通过 ssh 传输文件
- c# - C# 列表集合在方法之外失去其价值
- arrays - 从 Select-String 中取出每一行的最后一个单词
- powerbi - 如何在 Power BI 自定义视觉对象中设置 Bing 地图?
- scala - 单列分隔字符串 rdd 到正确列的数据框
- rest - REST API 创建虚拟存储库