问题描述
import subprocess
import sys

import pandas as pd
# Split every sheet of the workbook into its individual tables (runs of rows
# separated by fully empty rows) and write each table to its own .xlsx file.
# Tables on the same sheet that share the same set of column labels are
# merged into a single output file.
#
# Relies on two helpers that are not defined in this section:
#   remove_empty_rows(df, start) and remove_junk_rows(df, start)
# NOTE(review): both are presumed to return a positional row index -- confirm
# against their definitions.

# Install xlrd for the interpreter that is running this script. The original
# used the IPython-only ``!{sys.executable} -m pip ...`` shell escape, which is
# a SyntaxError outside a notebook; this form works in both.
subprocess.check_call([sys.executable, "-m", "pip", "install", "xlrd"])

# Directory prefix for the generated workbooks (trailing separator included so
# it can be prepended directly to the file name).
OUTPUT_DIR = "C:\\Users\\user\\Desktop\\Documents\\"

excel_file = pd.ExcelFile(r'C:\Users\user\Desktop\Documents\Census Sample.xlsx')
sheets_available = excel_file.sheet_names
sheet_to_df_map = {}

for sheet_name in sheets_available:
    # NOTE(review): index_val is not read anywhere in this section; it is kept
    # only in case code outside this view depends on it.
    index_val = 0

    # Read the sheet without a header row; headers are detected per table.
    current_df = pd.read_excel(excel_file, sheet_name, header=None)
    sheet_to_df_map[sheet_name] = current_df

    # Tables found so far on this sheet. A table's position in this list is
    # also the numeric suffix of the workbook it is written to, so entries
    # are replaced in place (never popped) when a later table is merged in.
    list_of_dataframes = []
    initial_rows = 0
    end_rows = 0
    row_count = current_df.shape[0]

    while end_rows + 1 < row_count:
        # Advance to the start of the next table, then walk down until a
        # fully empty row or the last row of the sheet is reached.
        initial_rows = remove_empty_rows(current_df.iloc[0:, :], initial_rows)
        end_rows = initial_rows
        current_series = pd.Series(current_df.iloc[end_rows])
        while end_rows + 1 < row_count and not current_series.isna().all():
            end_rows += 1
            current_series = pd.Series(current_df.iloc[end_rows])

        # The slice may include the terminating empty row; the row-wise
        # dropna below removes it together with any all-empty columns.
        modified_df = current_df.iloc[initial_rows:end_rows + 1, :]
        initial_rows = end_rows
        modified_df = modified_df.dropna(axis=1, how='all')
        modified_df = modified_df.dropna(axis=0, how='all')

        # Fragments that are a single row or a single column are not treated
        # as tables and are skipped.
        if modified_df.shape[0] == 1 or modified_df.shape[1] == 1:
            continue

        curr_index = remove_junk_rows(modified_df.iloc[0:, :], 0)
        modified_df = modified_df.iloc[curr_index:, :]

        # If the first row has duplicate values or contains numeric strings,
        # the table is transposed before its first row is used as the header.
        # NOTE(review): ``.str`` raises AttributeError when the first row is
        # not string/object typed -- confirm the expected cell types.
        current_series = pd.Series(modified_df.iloc[0])
        if not current_series.is_unique or current_series.str.isnumeric().any():
            modified_df = modified_df.T

        # Promote the first row to column labels. (The original guarded this
        # with ``len(columns) == shape[1]``, which is always true.)
        modified_df.columns = modified_df.iloc[0]
        modified_df = modified_df.iloc[1:, :]

        # Look for an earlier table with the same set of column labels. The
        # original popped from the list while iterating over its indices,
        # which raises IndexError and lets file numbers drift so that later
        # tables overwrite unrelated workbooks. Here the first match is
        # remembered and the list entry is replaced in place.
        file_index = len(list_of_dataframes)
        for i, existing_df in enumerate(list_of_dataframes):
            same_columns = (
                existing_df.shape[1] == modified_df.shape[1]
                and len(existing_df.columns.difference(modified_df.columns)) == 0
            )
            if same_columns:
                modified_df = pd.concat([existing_df, modified_df])
                file_index = i
                break

        # Reassign rather than dropna(inplace=True): modified_df may be a
        # slice of another frame at this point.
        modified_df = modified_df.dropna(how='all')

        if file_index < len(list_of_dataframes):
            list_of_dataframes[file_index] = modified_df
        else:
            list_of_dataframes.append(modified_df)

        # A merged table rewrites the workbook of the table it was merged
        # into; a new table gets the next unused number.
        modified_df.to_excel(
            f"{OUTPUT_DIR}{sheet_name}_{file_index}.xlsx",
            index=False,
            engine='openpyxl',
        )
标签: python, pandas
解决方案
推荐阅读