python - Python 'ascii' 编解码器无法对位置 151 中的字符 u'\u2013' 进行编码:序数不在范围内(128)AWS Glue
问题描述
我有一个在 AWS Glue 中运行的 Python 脚本。它一直工作正常,直到上个月我开始遇到以下错误。
我在 SO 上看到过几种不同的临时解决方案(hack),但在尝试弄清楚如何在我的脚本中正确实现时遇到了困难:UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 13: ordinal not in range(128)
'ascii' codec can't encode character u'\u2013' in position 151: ordinal not in range(128)
看起来这个字符是一个破折号(en dash,U+2013)——很可能是用户输入的,而我当初编写脚本时没有考虑到这一点。我已经在 SO 上查找过与此相关的其他问题,但其中很多都采用了我不想实施的丑陋临时方案。我的 Python 水平很有限(大部分样板代码是 Glue 生成的)。如何修改我的脚本以消除此编码错误?
import boto3
import sys
from datetime import datetime
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.types import *
from awsglue.context import GlueContext
from awsglue.job import Job
# @params: [JOB_NAME, database, path, company_id, user_id]
# Resolve the job parameters passed in by the Glue trigger/console.
args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'database', 'path', 'company_id', 'user_id'])
# Standard Glue boilerplate: Spark context, Glue context, session, and job
# (job.init enables bookmarks/metrics for this run).
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Read the employee_data table from the Glue Data Catalog.
datasource = glueContext.create_dynamic_frame.from_catalog(
    database=args['database'], table_name="employee_data", transformation_ctx="datasource0")
# Narrow to the requested company, then to the requested user.
# NOTE(review): user_id is matched against the "email" column — presumably
# the job is invoked with an email address as user_id; confirm with callers.
filtered_data = Filter.apply(
    frame=datasource, f=lambda x: x["company_id"] == args['company_id'])
filtered_data = Filter.apply(
    frame=filtered_data, f=lambda x: x["email"] == args['user_id'])
def get_diffs(rec):
    """Unpivot one audit-log record into per-field change sub-records.

    ``rec`` arrives as a flat dict with bookkeeping columns (timestamp,
    action, source, requester, email, company_id, payload, partition) and
    an optional ``diffs`` mapping of ``{field_name: [old, new]}``.  Each
    non-empty diff entry is expanded into its own nested dict keyed by the
    field name; records without diffs get a single placeholder entry keyed
    ``"0"``.  The original flat columns are removed before returning.

    NOTE(review): values copied out of ``diffs`` may contain non-ASCII
    characters such as u'\\u2013' (user-entered dashes); downstream CSV
    writing must handle Unicode — presumably the source of the reported
    encode error.
    """
    def _entry(old, new, field):
        # Build one unpivoted change entry (shared by both branches below).
        return {
            "old": old,
            "new": new,
            "timestamp": rec["timestamp"],
            "action": rec["action"],
            "source": rec["source"],
            # Coalesce a missing requester to "" so the column is never null.
            "requester": rec["requester"] if rec["requester"] is not None else "",
            "employee": rec["email"],
            "field": field,
        }

    # A record may lack "diffs" entirely or carry a null there; treat both
    # as "no diffs" instead of swallowing every exception with a bare except.
    try:
        has_diffs = len(rec["diffs"]) > 0
    except (KeyError, TypeError):
        has_diffs = False

    if has_diffs:
        for key, value in rec["diffs"].items():
            # Each value is expected to be an [old, new] pair; skip empties.
            if len(value) > 0:
                rec[key] = _entry(value[0], value[1], key)
        del rec["diffs"]
    else:
        # No changes recorded: emit a placeholder so the record still appears.
        rec["0"] = _entry("", "", "")

    # Drop the original flat columns; only the nested per-field dicts remain.
    for column in ("payload", "partition", "timestamp", "source", "action",
                   "requester", "email", "company_id"):
        del rec[column]
    return rec
# Unpivot every record into nested {field: {old, new, ...}} dicts.
filtered_data = Map.apply(
    frame=filtered_data, f=get_diffs)
new_data_rdd = filtered_data.toDF().rdd
# Flatten each Row into its individual column values — a Row is iterable,
# so returning it from flatMap emits one element per column (each element
# being one of the per-field dicts, or null for absent columns).
new_data = new_data_rdd.flatMap(
    lambda row: (
        (
            row
        )
    )
)
# Discard the null/empty elements left by rows that lacked a given field.
new_data = new_data.filter(lambda x: x)
# Explicit schema for the unpivoted records: every column is a string and
# declared non-nullable.
schema = StructType([StructField('action', StringType(), False), StructField('field', StringType(), False), StructField('old', StringType(), False), StructField('employee', StringType(
), False), StructField('source', StringType(), False), StructField('timestamp', StringType(), False), StructField('requester', StringType(), False), StructField('new', StringType(), False)])
datasource0 = glueContext.create_dynamic_frame.from_rdd(
    new_data, name='unpivoted', schema=schema)
# Rename columns to their CSV output names (e.g. old -> previous_value).
applymapping1 = ApplyMapping.apply(frame=datasource0, mappings=[("timestamp", "string", "date", "string"), ("employee", "string", "employee", "string"), ("action", "string", "action", "string"), ("field", "string", "employee_field_changed", "string"), (
    "old", "string", "previous_value", "string"), ("new", "string", "new_value", "string"), ("source", "string", "source", "string"), ("requester", "string", "changed_by", "string")], transformation_ctx="applymapping1")
# Collapse to a single partition so exactly one CSV part file is written;
# the S3 cleanup step below depends on this.
repartition = applymapping1.repartition(1)
file_path = "s3://"+args["path"]+"/audit_log/" + \
    args["company_id"]+"/"+args["user_id"]
# Write the CSV to S3. NOTE(review): this is where non-ASCII field values
# (e.g. u'\u2013') must survive encoding — confirm the job runs under a
# Python/Glue version that writes UTF-8 rather than ASCII.
datasink2 = glueContext.write_dynamic_frame.from_options(frame=repartition, connection_type="s3", connection_options={
    "path": file_path}, format="csv", transformation_ctx="datasink2")
# Post-process in S3: replace the previous audit.csv with the freshly
# written Spark part file, then remove the temporary part file.
# NOTE(review): args["path"] is used both as the bucket name here and as
# the s3:// path root above — so "path" is the bucket.
client = boto3.client('s3')
prefix = "audit_log/"+args["company_id"]+"/"+args["user_id"]
audit_key = prefix+"/audit.csv"
# Delete the old audit.csv first so the listing below sees only new output.
client.delete_object(Bucket=args["path"], Key=audit_key)
response = client.list_objects(Bucket=args["path"], Prefix=prefix)
# NOTE(review): assumes exactly one object remains under the prefix (the
# single part file from repartition(1)); "Contents" is absent when the
# prefix is empty — verify the job cannot reach here with no output.
name = response["Contents"][0]["Key"]
# Promote the part file to the canonical audit.csv name, then remove it.
client.copy_object(Bucket=args["path"],
                   CopySource=args["path"] + "/" + name, Key=audit_key)
client.delete_object(Bucket=args["path"], Key=name)
job.commit()
这是回溯
Log Contents:
Traceback (most recent call last):
File "runscript.py", line 63, in <module>
default_error_msg = "
{}
:
{}
".format(e_type.__name__, str(e_value).split("\n")[0])
UnicodeEncodeError: 'ascii' codec can't encode character u'\u2013' in position 151: ordinal not in range(128)
解决方案
推荐阅读
- javascript - onpropertychange 在 cognos 新版本中不起作用
- javascript - 表中的列值未更新
- php - 使用自定义 ssp.file 的 DataTable 服务器端
- r - 从 Github 安装 R 包时出错
- python-3.x - 在以下 Python 代码中未选择文件
- python - 如何验证来自 python 模块的传出 HTTP 请求?
- asp.net-core - 刷新后旧的刷新令牌保留在 [PersistedGrants] 表中
- android - AndroidX 前后 DialogPreference 的区别
- python - 如果熊猫中的值是浮点数(不是字符串),则将列除以其他列?
- sql-server - 从 SQL Server 表的第一行获取列名