首页 > 解决方案 > KeyError:“存档中没有名为 'dataset.csv' 的项目”

问题描述

raw_file_processing函数中,我将原始数据处理为可用格式,然后dataset.csvdataset_csv函数中创建一个。然后我想在zip_files函数中压缩这个文件。我的代码提出了KeyError( KeyError: "There is no item named 'dataset.csv' in the archive"

from zipfile import ZipFile
import pandas as pd
import warnings
import numpy as np
import os
import re
import shutil


class DataProcesser:

    def __init__(self, raw_path, col_id='ID', col_class='class', col_classname='class_name', col_set='set',
                 read_on_init=True, **kwargs):

        self.archive = ZipFile('data.zip', 'w')
        self.raw_path = raw_path
        self.col_id = col_id
        self.col_class = col_class
        self.col_classname = col_classname
        self.col_set = col_set
        self.dataset = None
        self.dataset_cropped = None
        self.id_set = None
        self.classes = None
        self.train_set = None
        self.validation_set = None
        self.test_set = None
        self.logs = []
        self.stats = None
        self.flag_subset = False
        self.flag_process = False
        self.flag_split = False
        self.measurement_df = None
        if read_on_init:
            self.read_archive(**kwargs)

    def raw_file_processing(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.raw_path):

            self.col_classname = self.raw_path.iloc[2]

            # Dummy-code the classes
            self.col_class = pd.get_dummies(self.col_classname)

            # Create the ID series by concatenating columns 1-3
            self.col_id = self.raw_path.assign(
                ID=self.raw_path[['cell_id:cell_id', 'region:region', 'tile_num:tile_num']].apply(
                    lambda row: '_'.join([str(each) for each in row]), axis=1))
            self.col_id = self.raw_path.drop(columns=['cell_id:cell_id', 'region:region', 'tile_num:tile_num'])

            # Obtain measurement info
            # Normalize data against blank/empty columns
            # log-transform the data
            for col in self.raw_path[9:]:
                if re.findall(r"Blank|Empty", col):
                    background = col
                else:
                    for index, row in col:
                        norm_data = row / background
                        self.measurement_df = np.log2(norm_data)

        return self.raw_path, self.col_id, self.col_class, self.measurement_df

    def dataset_csv(self):

        # If the path contains HTAN CODEX data, perform the following processing steps
        if os.path.isdir(self.raw_path):
            """Col 1: ID
            Col 2: class
            Col 3-n: measurements"""
            id_col = self.col_id

            self.col_class = self.col_class.to_frame()

            frames = [id_col, self.col_class, self.measurement_df]
            self.dataset = pd.concat(frames)
            data_csv = self.dataset.to_csv("../input_data/data/dataset.csv")

        return data_csv

    def zip_files(self):

        # Create a ZipFile object for dataset.csv, classes.csv, and id_set.csv
        self.archive = shutil.make_archive(self.archive_path, "zip", "../input_data/data/")
        return self.archive

    def read_archive(self, datatable=True, **kwargs):
        if datatable:
            try:
                from datatable import fread
                self.dataset = fread(self.archive.open('dataset.csv'), **kwargs).to_pandas()
                self.id_set = fread(self.archive.open('id_set.csv'), **kwargs).to_pandas()
                self.classes = fread(self.archive.open('classes.csv'), **kwargs).to_pandas()
            except ModuleNotFoundError:
                warnings.warn('datatable module not found, using pandas instead. To prevent this message from appearing'
                              ' use "datatable = False" when reading the archive.')
                self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
                self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
                self.classes = pd.read_csv(self.archive.open('classes.csv'))
        else:
            self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
            self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
            self.classes = pd.read_csv(self.archive.open('classes.csv'))
        self.check_datasets()
        return None


input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            raw_files = pd.read_csv(data)


input_path = "//wsl$/Ubuntu-20.04/home/melissachua/CODEX/input_data"
# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            data_file = pd.read_csv(data)
            data = DataProcesser(data_file, datatable=False)

meas_var = None
start_time = None
end_time = None

# Open all the subfolders within path
for root, dirs, files in os.walk(input_path):
    for file in files:
        with open(os.path.join(root, file), "r") as data:
            raw_files = pd.read_csv(data)
            data_file = "data.zip"

            # The data object is used to automatically derive some parameters (e.g. number of classes)
            data = DataProcesser(data_file, datatable=False)

追溯:

> --------------------------------------------------------------------------- KeyError                                  Traceback (most recent call
> last) /tmp/ipykernel_17522/1589195238.py in <module>
>       1 # The data object is used to automatically derive some parameters (e.g. number of classes)
> ----> 2 data = DataProcesser(raw_files, datatable=False)
>       3 
>       4 # Update default for the data
>       5 meas_var = data.detect_groups_times()['groups'] if meas_var is None else meas_var
> 
> ~/CODEX/Notebooks/../source/load_data.py in __init__(self, raw_path,
> col_id, col_class, col_classname, col_set, read_on_init, **kwargs)
>      74         self.measurement_df = None
>      75         if read_on_init:
> ---> 76             self.read_archive(**kwargs)
>      77 
>      78     def raw_file_processing(self):
> 
> ~/CODEX/Notebooks/../source/load_data.py in read_archive(self,
> datatable, **kwargs)
>     194                 self.classes = pd.read_csv(self.archive.open('classes.csv'))
>     195         else:
> --> 196             self.dataset = pd.read_csv(self.archive.open('dataset.csv'))
>     197             self.id_set = pd.read_csv(self.archive.open('id_set.csv'))
>     198             self.classes = pd.read_csv(self.archive.open('classes.csv'))
> 
> /usr/lib/python3.8/zipfile.py in open(self, name, mode, pwd,
> force_zip64)    1512         else:    1513             # Get info
> object for name
> -> 1514             zinfo = self.getinfo(name)    1515     1516         if mode == 'w':
> 
> /usr/lib/python3.8/zipfile.py in getinfo(self, name)    1439        
> info = self.NameToInfo.get(name)    1440         if info is None:
> -> 1441             raise KeyError(    1442                 'There is no item named %r in the archive' % name)    1443 
> 
> KeyError: "There is no item named 'dataset.csv' in the archive"

标签: pythondataframe

解决方案


推荐阅读