python - 带有 Scikit-Learn 的 Google Cloud ML 引发:“dict”对象没有“lower”属性
问题描述
我按照以下教程,在 Google Cloud 中部署我用 Scikit-learn 训练的情绪分析模型: https://cloud.google.com/ml-engine/docs/scikit/quickstart
我的模型定义如下:
import csv
import os
from collections import defaultdict
import sys
import re
import numpy as np
import random
import math
import sklearn.datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
def build_data_cv(datafile, trait_number):
    """Load the essays CSV and build a binary-labelled dataset for one trait.

    The first row is treated as a header and skipped. For every remaining
    row, the text in column 1 becomes a document and the cell in column
    ``trait_number + 2`` becomes its label: 1.0 when the cell equals ``y``
    (case-insensitive), 0.0 otherwise.

    Args:
        datafile: Path of the CSV file to read.
        trait_number: Zero-based trait index; the label is read from column
            ``trait_number + 2``.

    Returns:
        A ``Bunch`` with ``data`` (list of unicode documents), ``target``
        (float64 NumPy array of 0.0/1.0 labels) and ``target_names``.
    """
    # The label column is the same for every row, so compute it once.
    index_of_trait = trait_number + 2
    data = []
    labels = []
    # This script targets Python 2 (it relies on ``unicode``), whose csv
    # module reads from a file opened in binary mode.
    with open(datafile, "rb") as csvf:
        csvreader = csv.reader(csvf, delimiter=',', quotechar='"')
        # Skip the header row.
        next(csvreader, None)
        for line in csvreader:
            # Bytes that cannot be decoded are replaced instead of raising.
            data.append(unicode(line[1], errors='replace'))
            is_positive = line[index_of_trait].lower() == 'y'
            labels.append(1.0 if is_positive else 0.0)
    # Build the array once; calling np.append per row re-copied the whole
    # array on every iteration.
    target = np.array(labels, dtype=np.float64)
    dataset = sklearn.datasets.base.Bunch(data=data, target=target)
    # NOTE(review): label 1.0 marks a 'y' row, yet "positive" sits at index 0
    # of target_names - confirm the intended ordering before relying on it.
    dataset.target_names = ["positive", "negative"]
    return dataset
# main program
if __name__ == "__main__":
    # Train, cross-validate and export one grid-searched linear classifier
    # per trait. essays.csv is read from the current working directory.
    data_file = os.path.join(os.getcwd(), "essays.csv")
    class_labels = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']

    for index, selected_trait in enumerate(class_labels):
        # Single-argument parenthesized print behaves the same on Python 2
        # and also parses on Python 3.
        print(selected_trait)

        dataset = build_data_cv(data_file, index)
        # Only the training part of the 80/20 split is used below; the
        # held-out part is discarded.
        X_train, _, y_train, _ = train_test_split(
            dataset.data, dataset.target, test_size=0.2, random_state=0)

        # Bag-of-words counts -> TF-IDF weighting -> SGD classifier with
        # hinge loss and L2 penalty.
        clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                  alpha=1e-3, random_state=42,
                                  max_iter=5, tol=None)),
        ])

        # Hyper-parameter grid, keyed by "<pipeline step>__<parameter>".
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1e-2, 1e-3),
        }
        gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)

        # Fit the grid search on the training split.
        gs_clf.fit(X_train, y_train)

        # 10-fold cross-validation score over the full dataset; the grid
        # search object itself is passed as the estimator.
        scores = cross_val_score(gs_clf, dataset.data, dataset.target, cv=10)
        print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

        # Export the fitted grid-search object to a file, one per trait.
        joblib.dump(gs_clf, 'svm_gs_' + selected_trait + '.joblib')
        print("______________________")
输入文件可在此处获得: https://github.com/novinfard/profiler-sentiment-analysis/blob/master/model/input_dataset/essays.csv
当我用下面的 shell 命令,通过 gcloud ml-engine local predict 在本地测试我的模型时:
# Location of the exported model (here a Cloud Storage bucket).
MODEL_DIR="gs://MY_BUCKET/"
# File holding the prediction instances.
INPUT_FILE="input.json"
FRAMEWORK="SCIKIT_LEARN"
# Quote every expansion so a value containing spaces or glob characters is
# passed to gcloud as a single argument.
gcloud ml-engine local predict --model-dir="$MODEL_DIR" \
    --json-instances "$INPUT_FILE" \
    --framework "$FRAMEWORK"
它引发了以下错误:
File "/Users/XXX/google-cloud-sdk/lib/third_party/ml_sdk/cloud/ml/prediction/frameworks/sk_xg_prediction_lib.py", line 57, in predict
"Exception during sklearn prediction: " + str(e))
cloud.ml.prediction.prediction_utils.PredictionError: Failed to run the provided model: Exception during sklearn prediction: 'dict' object has no attribute 'lower' (Error code: 2)
input.json 定义如下:
{"instances": [["the quick brown fox"],["another test"]]}
问题是什么以及如何解决?
解决方案
正如评论中所讨论的,通过 --json-instances 传入的文件(即 JSON_INSTANCES)
需要每一行各包含一个独立的 JSON 实例。
在这种情况下,它将是
"the quick brown fox"
"another test"
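这可能看起来很奇怪,但每一行本身都是一个有效的 JSON。原来的 input.json 整个文件只有一行,内容是一个 JSON 对象(dict),因此它被当作单个实例传给了 CountVectorizer;而 CountVectorizer 会对每个文档调用 .lower(),于是就出现了“'dict' object has no attribute 'lower'”这个错误。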
推荐阅读
- android - 资源链接失败是什么意思?
- sendgrid - 有没有办法通过 SendGrid API 调用检索生成的动态模板的 HTML 电子邮件正文?
- ruby-on-rails - 如何在文本字段中输入代码列表,通过“,”并获得相同数量的单独对象Rails
- node.js - 为 PayPal 课程创建订阅系统
- r - 绘图右侧的线标签不重叠
- rest - 我如何使用 wso2 使用 rest api 调用 uipath 机器人?
- events - Windows 应用程序事件日志和 XFF
- python - 为什么在 Windows 上安装 MXNET 的 pip 命令无法正常工作/安装不正确?
- node.js - 我正在尝试测试 node.js
- sql - CTE 多连接