首页 > 解决方案 > 在 python 中使用 json 模式验证 csv

问题描述

我想对数据执行验证。我已经使用 pandas_schema 编写了代码;现在我希望不再在代码中硬编码 pandas_schema 的规则,而是传入一个包含所有验证规则的 JSON 文件,然后将其应用于 CSV 文件,应该怎么做?

也就是说,对哪一列应用哪条规则必须从 JSON 文件中读取,而不是写在 pandas_schema 的定义里,并且最终要生成错误文件。

def check_decimal(dec):
    """Return True if *dec* can be converted to a ``Decimal``, else False.

    Args:
        dec: The cell value to test (string, number, or anything else).

    Returns:
        bool: True when ``Decimal(dec)`` succeeds, False otherwise.
    """
    try:
        Decimal(dec)
    except (InvalidOperation, TypeError, ValueError):
        # InvalidOperation: malformed numeric string such as 'abc'.
        # TypeError/ValueError: unsupported input such as None or a
        # malformed tuple; a validator should report these as invalid
        # instead of aborting the whole validation run.
        return False
    return True


def check_int(num):
    """Return True if *num* can be converted with ``int()``, else False.

    Note that ``int()`` truncates floats, so a value such as ``3.7`` is
    accepted by this check.

    Args:
        num: The cell value to test.

    Returns:
        bool: True when ``int(num)`` succeeds, False otherwise.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric string or NaN.
        # TypeError: unsupported input such as None.
        # OverflowError: infinite float.
        return False
    return True


def do_validation():
    """Validate ``data.csv`` against a hard-coded schema and save the errors.

    Reads ``data.csv``, checks that the ``dec1``..``dec7`` columns hold
    decimal values and that the ``*_id`` columns hold integers, requires
    ``dec1`` and the ``*_id`` columns to be non-null, and writes all
    validation errors to ``errors55.csv``.

    A copy of the data without the failing rows is computed as
    ``data_clean`` but is neither saved nor returned.
    """
    # read the data
    data = pd.read_csv('data.csv')

    # define validation elements
    decimal_validation = [CustomElementValidation(check_decimal, 'is not decimal')]
    int_validation = [CustomElementValidation(check_int, 'is not integer')]
    # A validation callable must return True for *valid* cells, so the null
    # check has to accept present values and reject missing ones. pd.notna
    # covers both None and the NaN that read_csv produces for empty cells.
    null_validation = [CustomElementValidation(pd.notna, 'this field cannot be null')]

    # define validation schema
    schema = pandas_schema.Schema(
        [Column('dec1', decimal_validation + null_validation)]
        + [Column(f'dec{i}', decimal_validation) for i in range(2, 8)]
        + [
            Column(name, int_validation + null_validation)
            for name in ('company_id', 'currency_id', 'country_id')
        ]
    )

    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)

    # save data
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')

标签: json, python-3.x, validation, schema, jsonschema

解决方案


我对 pandas_schema 其实并不了解,但是如果您把各列及其验证器写在类似下面这样的 JSON 中:

{
    "dec1": ["decimal", "null"],
    "dec2": ["decimal"],
    "dec3": ["decimal"],
    "dec4": ["decimal"],
    "dec5": ["decimal"],
    "dec6": ["decimal"],
    "dec7": ["decimal"],
    "company_id": ["int", "null"],
    "currency_id": ["int", "null"],
    "country_id": ["int", "null"]
}

然后,您可以使用一个验证器字典和列表推导式来生成 Schema 所需的 Column 对象:

def check_decimal(dec):
    """Tell whether *dec* is acceptable as a ``Decimal`` value.

    Args:
        dec: The cell value to test.

    Returns:
        bool: True when ``Decimal(dec)`` can be constructed, False when the
        value is malformed or of an unsupported type (e.g. None).
    """
    try:
        Decimal(dec)
    except (InvalidOperation, TypeError, ValueError):
        # Unsupported types raise TypeError/ValueError rather than
        # InvalidOperation; treat them as "not a decimal" too so one bad
        # cell cannot abort the whole validation.
        return False
    return True


def check_int(num):
    """Tell whether *num* is acceptable to ``int()``.

    Floats are truncated by ``int()``, so fractional floats pass this check.

    Args:
        num: The cell value to test.

    Returns:
        bool: True when ``int(num)`` succeeds, False when the value is
        non-numeric, NaN, infinite, or of an unsupported type (e.g. None).
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # TypeError (None, containers) and OverflowError (infinity) are
        # reported as invalid instead of propagating out of the validator.
        return False
    return True


# Maps the validator names used in the JSON schema to ready-made
# pandas_schema validations. Each callable must return True for a VALID cell.
VALIDATORS = {
    'decimal': CustomElementValidation(lambda d: check_decimal(d), 'is not decimal'),
    'int': CustomElementValidation(lambda i: check_int(i), 'is not integer'),
    # Valid means "not null": pd.notna rejects both None and the NaN that
    # read_csv yields for empty cells (an `is None` test would be inverted).
    'null': CustomElementValidation(lambda d: pd.notna(d), 'this field cannot be null'),
}

def do_validation():
    """Validate ``data.csv`` with the column rules from ``my_json_schema.json``.

    The JSON file maps each column name to a list of keys into
    ``VALIDATORS``. All validation errors are written to ``errors55.csv``.
    A copy of the data without the failing rows is computed as
    ``data_clean`` but is neither saved nor returned.
    """
    frame = pd.read_csv('data.csv')

    with open('my_json_schema.json', 'r') as schema_file:
        rules_by_column = json.load(schema_file)

    # Turn every "column -> validator keys" entry into a pandas_schema Column.
    columns = []
    for column_name, validator_keys in rules_by_column.items():
        validations = [VALIDATORS[key] for key in validator_keys]
        columns.append(Column(column_name, validations))
    schema = pandas_schema.Schema(columns)

    # Run the schema and collect the row labels that produced errors.
    errors = schema.validate(frame)
    failing_rows = [warning.row for warning in errors]
    data_clean = frame.drop(index=failing_rows)

    # Persist the error report.
    error_report = pd.DataFrame({'col': errors})
    error_report.to_csv('errors55.csv')

编辑:

要使用带有 JSON 中定义的参数的验证器,您需要稍微更改 JSON 格式和代码。以下应该可以工作,但我自己无法测试。

{
    "dec1": [["decimal"], ["null"]],
    "dec2": [["decimal"], ["range", 0, 10]],
    "dec3": [["decimal"]],
    "dec4": [["decimal"]],
    "dec5": [["decimal"]],
    "dec6": [["decimal"]],
    "dec7": [["decimal"]],
    "company_id": [["int"], ["null"]],
    "currency_id": [["int"], ["null"]],
    "country_id": [["int"], ["null"]]
}


def get_validator(opts):
    """Build a pandas_schema validation from one JSON rule specification.

    Args:
        opts: A sequence whose first item is the validator name
            (``'decimal'``, ``'int'``, ``'null'`` or ``'range'``) and whose
            remaining items are extra positional arguments for that
            validator, e.g. ``['range', 0, 10]``.

    Returns:
        The validation object created by calling the selected class with its
        preset arguments followed by the extra arguments from *opts*.

    Raises:
        KeyError: If the validator name is not known.
        IndexError: If *opts* is empty.
    """
    # name -> (validation class, preset leading arguments).
    # Each callable must return True for a VALID cell.
    validators = {
        'decimal': (CustomElementValidation, [lambda d: check_decimal(d), 'is not decimal']),
        'int': (CustomElementValidation, [lambda i: check_int(i), 'is not integer']),
        # Valid means "not null": pd.notna rejects None as well as the NaN
        # that read_csv yields for empty cells.
        'null': (CustomElementValidation, [lambda d: pd.notna(d), 'this field cannot be null']),
        'range': (InRangeValidation, []),
    }
    name, *extra_args = opts
    func, preset_args = validators[name]
    return func(*preset_args, *extra_args)


def do_validation():
    """Validate ``data.csv`` with parameterised rules from ``my_json_schema.json``.

    The JSON file maps each column name to a list of rule specifications;
    every specification is turned into a validation by ``get_validator``.
    All validation errors are written to ``errors55.csv``. A copy of the data
    without the failing rows is computed as ``data_clean`` but is neither
    saved nor returned.
    """
    frame = pd.read_csv('data.csv')

    with open('my_json_schema.json', 'r') as schema_file:
        rules_by_column = json.load(schema_file)

    # Turn every "column -> rule specs" entry into a pandas_schema Column.
    columns = []
    for column_name, rule_specs in rules_by_column.items():
        validations = [get_validator(spec) for spec in rule_specs]
        columns.append(Column(column_name, validations))
    schema = pandas_schema.Schema(columns)

    # Run the schema and collect the row labels that produced errors.
    errors = schema.validate(frame)
    failing_rows = [warning.row for warning in errors]
    data_clean = frame.drop(index=failing_rows)

    # Persist the error report.
    error_report = pd.DataFrame({'col': errors})
    error_report.to_csv('errors55.csv')

推荐阅读