Problem with input data? Tensorflow: UnimplementedError: Cast string to float is not supported

Problem description

I assume this is a problem with my input data. More specifically, I think it may be an issue with the type of my labels in the dataset, so I have also included the dataset variable.

Structure of the dataset variable:

<PrefetchDataset shapes: (OrderedDict([(current, (None,)), (predicted, (None,)), (traffic_flow, (None,)), (uniques, (None,)), (total, (None,)), (os, (None,)), (browser, (None,)), (language, (None,)), (referrer_domain, (None,)), (referrer, (None,)), (country, (None,)), (region, (None,)), (city, (None,))]), (None,)), types: (OrderedDict([(current, tf.string), (predicted, tf.string), (traffic_flow, tf.float32), (uniques, tf.int32), (total, tf.int32), (os, tf.string), (browser, tf.string), (language, tf.string), (referrer_domain, tf.string), (referrer, tf.string), (country, tf.string), (region, tf.string), (city, tf.string)]), tf.string)>
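
The second element of this (features, label) structure is the label, and its type here is tf.string. A minimal way to confirm that programmatically (a sketch, using the same dataset variable defined in the code below):

features_spec, label_spec = dataset.element_spec
print(label_spec)   # e.g. TensorSpec(shape=(None,), dtype=tf.string, name=None)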

CSV sample:

"lc_BXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","lc_AXXXXXXXXXXXX","0.25","28","49","macOS","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Charlotte","2021-04-27 21:07:52"
"lc_NXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","lc_CXXXXXXXXXXXX","0.10","38","66","Win10","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Raleigh","2021-04-28 15:41:19"
"lc_JXXXXXXXXXXXX","lc_LXXXXXXXXXXXX","lc_LXXXXXXXXXXXX","0.23","28","49","macOS","Chrome 90.0","en","example.com","https://example.com/en/","United States","North Carolina","Raleigh","2021-04-28 15:41:39"

Full code:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental import preprocessing

all_columns = ["current", "predicted", "actual", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city", "created"]
feature_columns = ["current", "predicted", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city"]

dataset = tf.data.experimental.make_csv_dataset(
    "stackoverflow.csv",
    header=False,
    batch_size=32,
    column_names=all_columns,
    select_columns=feature_columns + ['actual'],
    label_name="actual",
    num_epochs=1,
    ignore_errors=False,)
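
# Note: make_csv_dataset infers column types from the file contents, so the
# "actual" label column (values like "lc_AXXXXXXXXXXXX") comes through as
# tf.string, matching the tf.string label type in the printed structure above.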

# only using this dataframe to get the number of unique "actual" values
dataframe = pd.read_csv("stackoverflow.csv", names=["current", "predicted", "actual", "traffic_flow", "uniques", "total", "os", "browser", "language", "referrer_domain", "referrer", "country", "region", "city", "created"])

#get unique actuals (the target variable)
labels = dataframe.copy().pop("actual")
num_labels = pd.unique(labels).size

#apply featurewise normalization to numerical features
def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = preprocessing.Normalization()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    # take a column of shape (N,) and make it (N, 1)
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    # Normalize the input feature
    encoded_feature = normalizer(feature)

    return encoded_feature

def encode_categorical_feature(feature, name, dataset, is_string):
    # Create a lookup layer which will turn strings into integer indices
    lookup = preprocessing.StringLookup(output_mode="int")

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the set of possible string values and assign them a fixed integer index
    lookup.adapt(feature_ds)

    # Turn the string input into integer indices
    encoded_feature = lookup(feature)

    #Encode the integer index into a float32 to match the numerical features
    encoder = preprocessing.CategoryEncoding(num_tokens=lookup.vocabulary_size())
    encoded_feature = encoder(encoded_feature)
    
    return encoded_feature

train_ds = dataset

# Categorical features encoded as string
current = tf.keras.Input(shape=(1,), name="current", dtype="string")
predicted = tf.keras.Input(shape=(1,), name="predicted", dtype="string")
os = tf.keras.Input(shape=(1,), name="os", dtype="string")
browser = tf.keras.Input(shape=(1,), name="browser", dtype="string")
language = tf.keras.Input(shape=(1,), name="language", dtype="string")
referrer_domain = tf.keras.Input(shape=(1,), name="referrer_domain", dtype="string")
referrer = tf.keras.Input(shape=(1,), name="referrer", dtype="string")
country = tf.keras.Input(shape=(1,), name="country", dtype="string")
region = tf.keras.Input(shape=(1,), name="region", dtype="string")
city = tf.keras.Input(shape=(1,), name="city", dtype="string")

#numerical features
traffic_flow = tf.keras.Input(shape=(1,), name="traffic_flow")
uniques = tf.keras.Input(shape=(1,), name="uniques")
total = tf.keras.Input(shape=(1,), name="total")

all_inputs = [
  current,
  predicted,
  os,
  browser,
  language,
  referrer_domain,
  referrer,
  country,
  region,
  city,
  traffic_flow,
  uniques,
  total,
]

# String categorical features
current_encoded = encode_categorical_feature(current, "current", train_ds, True)
predicted_encoded = encode_categorical_feature(predicted, "predicted", train_ds, True)
os_encoded = encode_categorical_feature(os, "os", train_ds, True)
browser_encoded = encode_categorical_feature(browser, "browser", train_ds, True)
language_encoded = encode_categorical_feature(language, "language", train_ds, True)
referrer_domain_encoded = encode_categorical_feature(referrer_domain, "referrer_domain", train_ds, True)
referrer_encoded = encode_categorical_feature(referrer, "referrer", train_ds, True)
country_encoded = encode_categorical_feature(country, "country", train_ds, True)
region_encoded = encode_categorical_feature(region, "region", train_ds, True)
city_encoded = encode_categorical_feature(city, "city", train_ds, True)

# Numerical features
traffic_flow_encoded = encode_numerical_feature(traffic_flow, "traffic_flow", train_ds)
uniques_encoded = encode_numerical_feature(uniques, "uniques", train_ds)
total_encoded = encode_numerical_feature(total, "total", train_ds)

all_features = layers.concatenate(
    [
      current_encoded,
      predicted_encoded,
      os_encoded,
      browser_encoded,
      language_encoded,
      referrer_domain_encoded,
      referrer_encoded,
      country_encoded,
      region_encoded,
      city_encoded,
      traffic_flow_encoded,
      uniques_encoded,
      total_encoded,
    ]
)

x = layers.Dense(32, activation="relu")(all_features)
output = layers.Dense(num_labels, activation="softmax")(x)
model = tf.keras.Model(all_inputs, output)

model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["categorical_accuracy"])

model.fit(dataset, epochs=10)

Tags: python, tensorflow, machine-learning

Solution
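
A likely cause: the label column actual holds strings (e.g. "lc_AXXXXXXXXXXXX"), so the dataset's label component has type tf.string, while sparse_categorical_crossentropy expects integer class indices. TensorFlow therefore tries to cast the string labels to float, which produces "Cast string to float is not supported". A minimal sketch of one possible fix, reusing the variables from the question's code (the label_lookup name is illustrative): encode the labels as integer indices with a StringLookup and train on the mapped dataset.

# Sketch only: map the string labels to integer class indices before fitting.
# Vocabulary taken from the unique "actual" values already collected via pandas.
label_lookup = preprocessing.StringLookup(vocabulary=sorted(labels.unique()))

# Replace the tf.string label with its integer index in every batch.
train_ds = dataset.map(lambda features, label: (features, label_lookup(label)))

# The output layer should cover the lookup's full vocabulary (including the
# OOV/mask slots), which may be slightly larger than the raw unique count.
num_labels = label_lookup.vocabulary_size()
output = layers.Dense(num_labels, activation="softmax")(x)
model = tf.keras.Model(all_inputs, output)

# With integer labels, sparse_categorical_accuracy is the matching metric.
model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd",
              metrics=["sparse_categorical_accuracy"])
model.fit(train_ds, epochs=10)

Alternatively, the string labels could be one-hot encoded and trained with categorical_crossentropy; the key point is that the label fed to the loss must not remain a tf.string tensor.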

