首页 > 解决方案 > 已使用 Python 的 xlrd 包编写了代码，需要改用 Python 的 openpyxl 包实现相同的功能

问题描述

import pandas as pd

import sys

!{sys.executable} -m pip install xlrd

# Open the workbook once; every sheet below is parsed from this same handle.
excel_file = pd.ExcelFile(
    r'C:\Users\user\Desktop\Documents\Census Sample.xlsx'
)
sheets_available = excel_file.sheet_names

# Raw DataFrame for every sheet, keyed by sheet name. header=None keeps the
# first spreadsheet row as data instead of promoting it to column labels.
sheet_to_df_map = {}
for sheet_name in sheets_available:
    index_val = 0  # assigned on every pass but never read in the visible code
    current_df = pd.read_excel(excel_file, sheet_name=sheet_name, header=None)
    sheet_to_df_map[sheet_name] = current_df
# After the loop, `sheet_name` and `current_df` refer to the last sheet read.

# Split the sheet held in `current_df` into separate tables. Rows are scanned
# top to bottom; each run of rows that ends at an all-NaN row (or at the last
# row of the sheet) is treated as one candidate table. Every table is cleaned,
# given a header taken from its first row, merged with an earlier table that
# has the same column labels, and written to its own .xlsx file.
#
# NOTE(review): this section is not nested inside the per-sheet loop, so it
# only processes whatever `current_df` / `sheet_name` were left holding by the
# preceding code -- confirm whether every sheet was meant to be processed.
list_of_dataframes = []  # distinct tables so far; list position == file number
initial_rows = 0
end_rows = 0
file_count = 0  # number of distinct tables written == next unused file number
while (end_rows + 1) < current_df.shape[0]:
    # remove_empty_rows is defined outside this excerpt; presumably it returns
    # the index of the first non-empty row at or after `initial_rows` -- TODO
    # confirm against its definition.
    initial_rows = remove_empty_rows(current_df.iloc[0:, :], initial_rows)
    end_rows = initial_rows

    # Advance end_rows until it sits on an all-NaN row or on the last row.
    current_series = current_df.iloc[end_rows]
    while (end_rows + 1) < current_df.shape[0] and not current_series.isna().all():
        end_rows += 1
        current_series = current_df.iloc[end_rows]

    modified_df = current_df.iloc[initial_rows:end_rows + 1, :]
    initial_rows = end_rows  # the next scan resumes from the terminating row
    modified_df = modified_df.dropna(axis=1, how='all')
    modified_df = modified_df.dropna(axis=0, how='all')

    # Blocks that collapse to exactly one row or one column are not exported.
    if modified_df.shape[0] == 1 or modified_df.shape[1] == 1:
        continue

    # remove_junk_rows is defined outside this excerpt; presumably it returns
    # the positional index of the first row worth keeping -- TODO confirm.
    curr_index = remove_junk_rows(modified_df.iloc[0:, :], 0)
    modified_df = modified_df.iloc[curr_index:, :]

    # Transpose when the first row contains duplicate values or a
    # numeric-looking string.
    # NOTE(review): `.str` raises AttributeError when this row is not of
    # string/object dtype (e.g. an all-float block) -- decide how such blocks
    # should be treated.
    current_series = modified_df.iloc[0]
    if not current_series.is_unique or current_series.str.isnumeric().any():
        modified_df = modified_df.T

    # Promote the first row to column labels, then drop it from the data.
    modified_df.columns = modified_df.iloc[0]
    modified_df = modified_df.iloc[1:, :]

    # Look for an earlier table with the same column labels. Stop at the first
    # match and replace that entry in place (below): removing list items while
    # indexing over the list's original length raises IndexError, and moving
    # the merged frame would break the "list position == file number" mapping.
    target_index = None
    for i, known_df in enumerate(list_of_dataframes):
        same_columns = (
            len(known_df.columns.difference(modified_df.columns)) == 0
            and known_df.shape[1] == modified_df.shape[1]
        )
        if same_columns:
            modified_df = pd.concat([known_df, modified_df])
            target_index = i
            break

    # Reassign instead of inplace=True: modified_df may be a slice of another
    # frame at this point.
    modified_df = modified_df.dropna(how='all')

    if target_index is None:
        # New table: it gets the next unused file number.
        target_index = file_count
        list_of_dataframes.append(modified_df)
        file_count += 1
    else:
        # Merged table: it overwrites the file of the table it was merged
        # into; file_count is untouched so later tables keep unique numbers.
        list_of_dataframes[target_index] = modified_df

    modified_df.to_excel("C:\\Users\\user\\Desktop\\Documents\\" + sheet_name + "_" + str(target_index) + ".xlsx", index=False, engine='openpyxl')
    

标签: python pandas

解决方案


推荐阅读