python - 输入数据有问题?Tensorflow:UnimplementedError:不支持将字符串转换为浮点数
问题描述
- 除了“traffic_flow”、“uniques”和“total”之外,所有特征都是字符串。
- 我正在尝试训练模型来预测“实际”特征。
- 我使用 SparseCategoricalCrossEntropy 作为我的损失函数,因为“实际”特征的可能总数很大,并且会随着数据的增长而继续增长。
我假设这是我的输入数据的问题。更具体地说,我认为这可能是我的标签在数据集中的类型有问题,所以我也包括了数据集变量。
数据集变量的结构:
<PrefetchDataset 形状:(OrderedDict([(current, (None,)), (predicted, (None,)), (traffic_flow, (None,))), (uniques, (None,)), (total, (None, )), (os, (None,)), (browser, (None,)), (language, (None,)), (referrer_domain, (None,)), (referrer, (None,)), (country , (None,)), (region, (None,)), (city, (None,))]), (None,)), 类型:(OrderedDict([(current, tf.string), (predicted, tf.string), (traffic_flow, tf.float32), (uniques, tf.int32), (total, tf.int32), (os, tf.string), (browser, tf.string), (language, tf. string), (referrer_domain, tf.string), (referrer, tf.string), (country, tf.string), (region, tf.string), (city, tf.string)]), tf.string)>
CSV 样本:
"lc_BXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","lc_AXXXXXXXXXXXX","0.25","28","49","macOS","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Charlotte","2021-04-27 21:07:52"
"lc_NXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","0.10","38","66","Win10","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Raleigh","2021-04-28 15:41:19"
"lc_JXXXXXXXXXXXX","lc_LXXXXXXXXXXXX","lc_LXXXXXXXXXXXX","0.23","28","49","macOS","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Raleigh","2021-04-28 15:41:39"
完整代码:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental import preprocessing
all_columns = ["current", "predicted", "actual", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city", "created"]
feature_columns = ["current", "predicted", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city"]
dataset = tf.data.experimental.make_csv_dataset(
"stackoverflow.csv",
header=False,
batch_size=32,
column_names=all_columns,
select_columns=feature_columns + ['actual'],
label_name="actual",
num_epochs=1,
ignore_errors=False,)
#only using this dataframe to get the number of unique actual rows
dataframe = pd.read_csv("stackoverflow.csv", names=["current", "predicted", "actual", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city", "created"])
#get unique actuals (the target variable)
labels = dataframe.copy().pop("actual")
num_labels = pd.unique(labels).size
#apply featurewise normalization to numerical features
def encode_numerical_feature(feature, name, dataset):
# Create a Normalization layer for our feature
normalizer = preprocessing.Normalization()
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
#take column of shape (N) and make it (N, -1)
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the statistics of the data
normalizer.adapt(feature_ds)
# Normalize the input feature
encoded_feature = normalizer(feature)
return encoded_feature
def encode_categorical_feature(feature, name, dataset, is_string):
# Create a lookup layer which will turn strings into integer indices
lookup = preprocessing.StringLookup(output_mode="int")
# Prepare a Dataset that only yields our feature
feature_ds = dataset.map(lambda x, y: x[name])
feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))
# Learn the set of possible string values and assign them a fixed integer index
lookup.adapt(feature_ds)
# Turn the string input into integer indices
encoded_feature = lookup(feature)
#Encode the integer index into a float32 to match the numerical features
encoder = preprocessing.CategoryEncoding(num_tokens=lookup.vocabulary_size())
encoded_feature = encoder(encoded_feature)
return encoded_feature
train_ds = dataset
# Categorical features encoded as string
current = tf.keras.Input(shape=(1,), name="current", dtype="string")
predicted = tf.keras.Input(shape=(1,), name="predicted", dtype="string")
os = tf.keras.Input(shape=(1,), name="os", dtype="string")
browser = tf.keras.Input(shape=(1,), name="browser", dtype="string")
language = tf.keras.Input(shape=(1,), name="language", dtype="string")
referrer_domain = tf.keras.Input(shape=(1,), name="referrer_domain", dtype="string")
referrer = tf.keras.Input(shape=(1,), name="referrer", dtype="string")
country = tf.keras.Input(shape=(1,), name="country", dtype="string")
region = tf.keras.Input(shape=(1,), name="region", dtype="string")
city = tf.keras.Input(shape=(1,), name="city", dtype="string")
#numerical features
traffic_flow = tf.keras.Input(shape=(1,), name="traffic_flow")
uniques = tf.keras.Input(shape=(1,), name="uniques")
total = tf.keras.Input(shape=(1,), name="total")
all_inputs = [
current,
predicted,
os,
browser,
language,
referrer_domain,
referrer,
country,
region,
city,
traffic_flow,
uniques,
total,
]
# String categorical features
current_encoded = encode_categorical_feature(current, "current", train_ds, True)
predicted_encoded = encode_categorical_feature(predicted, "predicted", train_ds, True)
os_encoded = encode_categorical_feature(os, "os", train_ds, True)
browser_encoded = encode_categorical_feature(browser, "browser", train_ds, True)
language_encoded = encode_categorical_feature(language, "language", train_ds, True)
referrer_domain_encoded = encode_categorical_feature(referrer_domain, "referrer_domain", train_ds, True)
referrer_encoded = encode_categorical_feature(referrer, "referrer", train_ds, True)
country_encoded = encode_categorical_feature(country, "country", train_ds, True)
region_encoded = encode_categorical_feature(region, "region", train_ds, True)
city_encoded = encode_categorical_feature(city, "city", train_ds, True)
# Numerical features
traffic_flow_encoded = encode_numerical_feature(traffic_flow, "traffic_flow", train_ds)
uniques_encoded = encode_numerical_feature(uniques, "uniques", train_ds)
total_encoded = encode_numerical_feature(total, "total", train_ds)
all_features = layers.concatenate(
[
current_encoded,
predicted_encoded,
os_encoded,
browser_encoded,
language_encoded,
referrer_domain_encoded,
referrer_encoded,
country_encoded,
region_encoded,
city_encoded,
traffic_flow_encoded,
uniques_encoded,
total_encoded,
]
)
x = layers.Dense(32, activation="relu")(all_features)
output = layers.Dense(num_labels, activation="softmax")(x)
model = tf.keras.Model(all_inputs, output)
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["categorical_accuracy"])
model.fit(dataset, epochs=10)
解决方案
推荐阅读
- javascript - 请求 POST 的主体来表达
- argo-workflows - 如何为 argo 工作流程手动释放互斥锁?
- reactjs - × TypeError:无法解构'Object(...)(...)'的属性'xxx',因为它是未定义的
- docker - 如何在 Docker 上将 redis-cli 与 Redis 一起使用?
- java - java - 如何通过在java中具有条件值来重复循环语句?
- amazon-web-services - 如何授予开发人员访问我的 AWS 账户的权限?
- reactjs - React 前端 + WordPress API 使用前端注册表单创建/注册用户
- bash - shell 别名在主目录中执行
- php - 在小时数数组中查找最近的下一小时
- javascript - 如何通过 OAuth 客户端身份验证找到用户 YouTube 频道?