json - 在 python 中使用 json 模式验证 csv
问题描述
我想对数据执行验证。我已经使用 pandas_schema 编写了代码。如何不用 pandas_schema 来定义规则,而是传递一个包含所有验证规则的 json 文件,然后将其应用于 csv 文件?
也就是说,对哪一列应用哪条规则必须从 json 文件中读取(而不是写死在 pandas_schema 代码里),并生成错误文件。
def check_decimal(dec):
    """Return True if *dec* can be converted to a ``Decimal``, else False.

    ``Decimal()`` signals bad input in more than one way: unparsable
    strings raise ``InvalidOperation``, unsupported types such as ``None``
    raise ``TypeError``, and malformed tuples raise ``ValueError``.  All of
    them mean "not a decimal", so none may escape and abort a validation run.
    """
    try:
        Decimal(dec)
    except (InvalidOperation, TypeError, ValueError):
        return False
    return True
def check_int(num):
    """Return True if ``int(num)`` succeeds, else False.

    Besides ``ValueError`` (unparsable strings, NaN), ``int()`` raises
    ``TypeError`` for unsupported types such as ``None`` and
    ``OverflowError`` for infinite floats.  All of them mean "not an
    integer" and are reported as False instead of propagating.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        return False
    return True
def do_validation():
    """Validate ``data.csv`` against a hard-coded schema and save the errors.

    Reads ``data.csv``, checks the ``dec1``..``dec7`` columns with
    ``check_decimal`` and the ``*_id`` columns with ``check_int``, and
    writes every reported validation error to ``errors55.csv``.
    """
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    decimal_validation = [CustomElementValidation(check_decimal, 'is not decimal')]
    int_validation = [CustomElementValidation(check_int, 'is not integer')]
    # The predicate must be True for VALID cells.  The previous ``d is None``
    # was inverted (it accepted only nulls), and pandas represents missing
    # CSV cells as NaN rather than None, so test with pd.notna instead.
    null_validation = [CustomElementValidation(lambda d: pd.notna(d), 'this field cannot be null')]

    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation)])

    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    # rows without any reported error; computed here but not written out
    data_clean = data.drop(index=errors_index_rows)

    # save data
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')
解决方案
所以,我对 pandas_schema 真的一无所知,但是如果您在这样的 json 中有列及其验证器:
{
  "dec1": ["decimal", "null"],
  "dec2": ["decimal"],
  "dec3": ["decimal"],
  "dec4": ["decimal"],
  "dec5": ["decimal"],
  "dec6": ["decimal"],
  "dec7": ["decimal"],
  "company_id": ["int", "null"],
  "currency_id": ["int", "null"],
  "country_id": ["int", "null"]
}
然后,您可以使用验证器的字典和列表推导来生成 Schema 所需的 Column 对象:
def check_decimal(dec):
    """Return True if *dec* can be converted to a ``Decimal``, else False.

    Unparsable strings raise ``InvalidOperation``, unsupported types such
    as ``None`` raise ``TypeError``, and malformed tuples raise
    ``ValueError``; each of these is treated as "not a decimal" so that a
    single odd cell cannot abort the whole validation.
    """
    try:
        Decimal(dec)
    except (InvalidOperation, TypeError, ValueError):
        return False
    return True
def check_int(num):
    """Return True if ``int(num)`` succeeds, else False.

    ``int()`` raises ``ValueError`` for unparsable strings and NaN,
    ``TypeError`` for unsupported types such as ``None``, and
    ``OverflowError`` for infinite floats; each is reported as False.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        return False
    return True
# Maps the rule names used in the JSON schema file to ready-made
# pandas_schema validation objects.
VALIDATORS = {
    'decimal': CustomElementValidation(check_decimal, 'is not decimal'),
    'int': CustomElementValidation(check_int, 'is not integer'),
    # The predicate must be True for VALID cells.  The previous ``d is None``
    # was inverted (it accepted only nulls), and pandas represents missing
    # CSV cells as NaN rather than None, so test with pd.notna instead.
    'null': CustomElementValidation(lambda d: pd.notna(d), 'this field cannot be null'),
}
def do_validation():
    """Validate ``data.csv`` using the rules listed in ``my_json_schema.json``.

    The JSON file maps each column name to a list of rule names; every rule
    name is looked up in ``VALIDATORS``.  All reported validation errors are
    written to ``errors55.csv``.
    """
    # load the CSV first, then the rule definitions
    data = pd.read_csv('data.csv')
    with open('my_json_schema.json', 'r') as schema_file:
        rules_by_column = json.load(schema_file)

    # one Column per JSON key, carrying the validators named for it
    columns = []
    for column_name, rule_names in rules_by_column.items():
        column_validators = [VALIDATORS[rule_name] for rule_name in rule_names]
        columns.append(Column(column_name, column_validators))
    schema = pandas_schema.Schema(columns)

    # run the schema and collect the row labels that produced errors
    errors = schema.validate(data)
    failed_rows = [error.row for error in errors]
    # rows without any reported error; computed here but not written out
    data_clean = data.drop(index=failed_rows)

    # persist the error report
    error_report = pd.DataFrame({'col': errors})
    error_report.to_csv('errors55.csv')
编辑:
要使用带有 JSON 中定义的参数的验证器,您需要稍微更改 JSON 格式和代码。以下应该可以工作,但我自己无法测试。
{
  "dec1": [["decimal"], ["null"]],
  "dec2": [["decimal"], ["range", 0, 10]],
  "dec3": [["decimal"]],
  "dec4": [["decimal"]],
  "dec5": [["decimal"]],
  "dec6": [["decimal"]],
  "dec7": [["decimal"]],
  "company_id": [["int"], ["null"]],
  "currency_id": [["int"], ["null"]],
  "country_id": [["int"], ["null"]]
}
def get_validator(opts):
    """Build one validation object from a JSON rule specification.

    ``opts`` is a list whose first item is the rule name (``'decimal'``,
    ``'int'``, ``'null'`` or ``'range'``).  Any remaining items are passed
    as extra positional arguments to the validation class, e.g.
    ``['range', 0, 10]`` becomes ``InRangeValidation(0, 10)``.

    Raises:
        KeyError: if the rule name is not one of the known validators.
    """
    validators = {
        'decimal': (CustomElementValidation, [check_decimal, 'is not decimal']),
        'int': (CustomElementValidation, [check_int, 'is not integer']),
        # The predicate must be True for VALID cells.  The previous
        # ``d is None`` was inverted, and pandas represents missing CSV
        # cells as NaN rather than None, so test with pd.notna instead.
        'null': (CustomElementValidation, [lambda d: pd.notna(d), 'this field cannot be null']),
        'range': (InRangeValidation, []),
    }
    name = opts[0]
    try:
        factory, base_args = validators[name]
    except KeyError:
        raise KeyError(
            'unknown validator {!r}; expected one of {}'.format(name, sorted(validators))
        ) from None
    # Build a fresh argument list instead of extending the template in place.
    args = base_args + list(opts[1:])
    return factory(*args)
def do_validation():
    """Validate ``data.csv`` using parameterised rules from ``my_json_schema.json``.

    The JSON file maps each column name to a list of rule specifications;
    every specification is turned into a validation object by
    ``get_validator``.  All reported validation errors are written to
    ``errors55.csv``.
    """
    # load the CSV first, then the rule definitions
    data = pd.read_csv('data.csv')
    with open('my_json_schema.json', 'r') as schema_file:
        specs_by_column = json.load(schema_file)

    # one Column per JSON key, with one validator per rule specification
    columns = []
    for column_name, rule_specs in specs_by_column.items():
        column_validators = [get_validator(spec) for spec in rule_specs]
        columns.append(Column(column_name, column_validators))
    schema = pandas_schema.Schema(columns)

    # run the schema and collect the row labels that produced errors
    errors = schema.validate(data)
    failed_rows = [error.row for error in errors]
    # rows without any reported error; computed here but not written out
    data_clean = data.drop(index=failed_rows)

    # persist the error report
    error_report = pd.DataFrame({'col': errors})
    error_report.to_csv('errors55.csv')
推荐阅读
- ruby-on-rails - 为 Amazon S3 设置凭证时出现 KeyError
- python - 如何使用python自动将基于CSV的表创建到postgres中
- java - 当我在android studio中使用Volley将图片上传到远程服务器时得到空响应 - java
- spring-data-jpa - Spring boot JPA多对多关联问题
- postgresql - AfterUpdate 中的查询在 TypeORM 中没有按预期工作
- javascript - TypeError:无法读取 null 的属性“getRange”(第 14 行,文件“代码”)
- sql-server - 用整数值替换字符串,保持字符串长度固定
- javascript - 如何在打字稿中创建接口适配器?
- css - 数据表 - 将 css 添加到特定单元格
- excel - Excel:如何在计算中获取一系列值的单元格的所有结果