python - UnicodeDecodeError:“utf-8”编解码器无法解码位置 14 中的字节 0xa1:无效的起始字节
问题描述
我想为 input_path 目录中的所有 .csv 文件创建一个数据框。我的代码引发了 UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 14: invalid start byte 错误。我也试过在调用 read_csv 时使用 encoding='latin1'、encoding='iso-8859-1' 或 encoding='cp1252'。
import os
import pandas as pd
input_path = "../input_data/"

# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        # NOTE(review): the file is opened in text mode without an explicit
        # encoding, so open() decodes the bytes itself using the platform
        # default. The encoding= given to read_csv presumably has no effect
        # on an already-decoded handle -- confirm against the pandas docs.
        # This snippet is kept as the failing reproduction of the question.
        with open(os.path.join(root, file), "r") as data:
            df = pd.read_csv(data, encoding='utf_8')
回溯(Traceback):
> --------------------------------------------------------------------------- UnicodeDecodeError Traceback (most recent call
> last) /tmp/ipykernel_136/3748812978.py in <module>
> 3 for file in files:
> 4 with open(os.path.join(root, file), "r") as data:
> ----> 5 df = pd.read_csv(data, encoding='utf_8')
>
> ~/.local/lib/python3.8/site-packages/pandas/util/_decorators.py in
> wrapper(*args, **kwargs)
> 309 stacklevel=stacklevel,
> 310 )
> --> 311 return func(*args, **kwargs)
> 312
> 313 return wrapper
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,
> usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters,
> true_values, false_values, skipinitialspace, skiprows, skipfooter,
> nrows, na_values, keep_default_na, na_filter, verbose,
> skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col,
> date_parser, dayfirst, cache_dates, iterator, chunksize, compression,
> thousands, decimal, lineterminator, quotechar, quoting, doublequote,
> escapechar, comment, encoding, encoding_errors, dialect,
> error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace,
> low_memory, memory_map, float_precision, storage_options)
> 584 kwds.update(kwds_defaults)
> 585
> --> 586 return _read(filepath_or_buffer, kwds)
> 587
> 588
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _read(filepath_or_buffer, kwds)
> 480
> 481 # Create the parser.
> --> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
> 483
> 484 if chunksize or iterator:
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> __init__(self, f, engine, **kwds)
> 809 self.options["has_index_names"] = kwds["has_index_names"]
> 810
> --> 811 self._engine = self._make_engine(self.engine)
> 812
> 813 def close(self):
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/readers.py in
> _make_engine(self, engine) 1038 ) 1039 # error: Too many arguments for "ParserBase"
> -> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg] 1041 1042 def
> _failover_to_python(self):
>
> ~/.local/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py
> in __init__(self, src, **kwds)
> 67 kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
> 68 try:
> ---> 69 self._reader = parsers.TextReader(self.handles.handle, **kwds)
> 70 except Exception:
> 71 self.handles.close()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader.__cinit__()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._get_header()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.TextReader._tokenize_rows()
>
> ~/.local/lib/python3.8/site-packages/pandas/_libs/parsers.pyx in
> pandas._libs.parsers.raise_parser_error()
>
> UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position
> 14: invalid start byte
解决方案
尝试使用 chardet 包来确定文件的编码。
演示:
# Python env: pip install chardet
# Anaconda env: conda install chardet
import chardet
import pathlib
input_path = "../input_data/"

# One detector instance is reused for every file; it is reset per file below.
detector = chardet.UniversalDetector()

# Report the guessed encoding of each .csv file directly inside input_path.
for filename in pathlib.Path(input_path).glob('*.csv'):
    # Clear state left over from the previous file.
    detector.reset()
    print(f"Filename: {filename}")
    # Read raw bytes: the detector must see the data before any decoding.
    # The with-block closes the handle even when we stop reading early.
    with open(filename, 'rb') as raw_file:
        for line in raw_file:
            detector.feed(line)
            # Stop as soon as the detector reports it has seen enough.
            if detector.done:
                break
    # Finalize detection so that detector.result is populated.
    detector.close()
    print(f"Encoding: {detector.result['encoding']} (confidence: {detector.result['confidence']})\n")
输出:
Filename: ../input_data/file1.csv
Encoding: Windows-1252 (confidence: 0.7299263369321677)
Filename: ../input_data/file2.csv
Encoding: ascii (confidence: 1.0)
Filename: ../input_data/file3.csv
Encoding: ISO-8859-1 (confidence: 0.73)
Filename: ../input_data/file4.csv
Encoding: utf-8 (confidence: 0.99)
Filename: ../input_data/file5.csv
Encoding: ISO-8859-1 (confidence: 0.73)
推荐阅读
- javascript - Javascript随机倒计时触发函数
- python - 如何让接收器在 Django 中监听用户特定的信号?
- ruby-on-rails - 我如何在 Ruby 中使用三元运算符
- c - 无法使用 Intel Intrinsics 将值转换为向量
- c++ - 如何从 unordered_set 中删除元素而不释放元素
- javascript - 制作一个检查字符串是否包含 url 的小应用程序
- python - Django 搜索栏和大写字母
- sql - 在树中查找孩子的父母 -Oracle
- flutter - 按下按钮时出现“参数不匹配的闭包调用:函数 '_MyAppState.build.<anonymous closure>'”
- android - 通过 android 中的意图打开谷歌地图的特定部分