python - Code for exploding a list inside a dataframe cell into rows fails when pandas dataframe is loaded from csv.
问题描述
I have been trying to explode a list inside a DataFrame cell into separate (expanded/exploded) rows, as explained in the Stack Overflow question "How to explode a list inside a Dataframe cell into separate rows".
I have been having trouble.
I got the Stack Overflow code to work with the example data, but I couldn't get it to work for my own data, which is in a CSV file that I load with `pd.read_csv`.
I then simplified the code to get rid of one unnecessary column. I got the simplified example code working. Then I tried loading my .csv again. Again, it failed.
So, I took my file out of the equation by writing the data frame from the simplified example to a new CSV file, loading that file with `pd.read_csv`, and re-running the simplified "explode" code. I got the same error. The code and error output are below.
import pandas as pd
# Build the example DataFrame: every opponent gets the same list of four
# names, and 'opponent' is made the index.
df = (pd.DataFrame({
'opponent': ['76ers', 'blazers', 'bobcats'],
'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3})
.set_index([ 'opponent']))
# Explode the lists into rows: apply(pd.Series) spreads each list across
# columns, reset_index() turns the 'opponent' index back into a column, and
# melt() stacks the list columns into a single 'nearest_neighbors' column.
# The helper 'variable' column and any rows with missing values are dropped,
# and the result is sorted by 'opponent'.
(pd.melt(df.nearest_neighbors.apply(pd.Series).reset_index(),
id_vars=[ 'opponent'],
value_name='nearest_neighbors')
.set_index([ 'opponent'])
.drop('variable', axis=1)
.dropna()
.sort_index()
)
# Save the DataFrame to a CSV file.
df.to_csv("Baskets.CSV")
# Load the DataFrame back from the CSV file.
# NOTE(review): no index_col is passed, so 'opponent' is presumably not
# restored as the index here -- confirm against the KeyError in the traceback.
# NOTE(review): the file name's case differs from the one written above
# ("Baskets.CSV"); on a case-sensitive filesystem this would likely not find
# the file -- confirm.
df = pd.read_csv("Baskets.csv")
# Re-run the same explode code on the DataFrame reloaded from CSV; this is the
# call that raises KeyError: 'opponent'.
(pd.melt(df.nearest_neighbors.apply(pd.Series).reset_index(),
id_vars=[ 'opponent'],
value_name='nearest_neighbors')
.set_index([ 'opponent'])
.drop('variable', axis=1)
.dropna()
.sort_index()
)
KeyError Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2441 try:
-> 2442 return self._engine.get_loc(key)
2443 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'opponent'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-25-6ffdc50e8f2c> in <module>()
1 (pd.melt(df.nearest_neighbors.apply(pd.Series).reset_index(),
2 id_vars=[ 'opponent'],
----> 3 value_name='nearest_neighbors')
4 .set_index([ 'opponent'])
5 .drop('variable', axis=1)
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/reshape.py in melt(frame, id_vars, value_vars, var_name, value_name, col_level)
761 mdata = {}
762 for col in id_vars:
--> 763 mdata[col] = np.tile(frame.pop(col).values, K)
764
765 mcolumns = id_vars + var_name + [value_name]
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in pop(self, item)
548 Return item and drop from frame. Raise KeyError if not found.
549 """
--> 550 result = self[item]
551 del self[item]
552 try:
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
1962 return self._getitem_multilevel(key)
1963 else:
-> 1964 return self._getitem_column(key)
1965
1966 def _getitem_column(self, key):
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
1969 # get column
1970 if self.columns.is_unique:
-> 1971 return self._get_item_cache(key)
1972
1973 # duplicate columns & possible reduce dimensionality
~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
1643 res = cache.get(item)
1644 if res is None:
-> 1645 values = self._data.get(item)
1646 res = self._box_item_values(item, values)
1647 cache[item] = res
~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
3588
3589 if not isnull(item):
-> 3590 loc = self.items.get_loc(item)
3591 else:
3592 indexer = np.arange(len(self.items))[isnull(self.items)]
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2442 return self._engine.get_loc(key)
2443 except KeyError:
-> 2444 return self._engine.get_loc(self._maybe_cast_indexer(key))
2445
2446 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'opponent'
解决方案
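Your initial index is being reset to the default index when you save to and then read from the CSV file: the `opponent` index is written out as an ordinary column, and `pd.read_csv` does not restore it as the index unless told to. To fix it, you'll need to read the CSV with `index_col` set to `opponent`.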
Instead of:
# Load the DataFrame from the CSV file without index_col (the call to replace)
df = pd.read_csv("Baskets.csv")
Try using:
# Load the DataFrame from the CSV file, using the 'opponent' column as the index
df = pd.read_csv("Baskets.csv", index_col='opponent')
The lists in the `nearest_neighbors` column are read back from the CSV as plain strings, so in order to convert that column back to lists you will also need to do this:
from ast import literal_eval

# After the CSV round trip each cell holds the string form of a list; parse
# every cell back into a real Python list so it can be expanded below.
df['nearest_neighbors'] = df['nearest_neighbors'].apply(literal_eval)
After that, I was able to get the melting to work:
# Spread each list across positional columns (0, 1, 2, ...) and turn the
# 'opponent' index back into a regular column so melt() can use it as an id.
wide_frame = df['nearest_neighbors'].apply(pd.Series).reset_index()

# Stack the positional columns into one 'nearest_neighbors' value column.
long_frame = pd.melt(
    wide_frame,
    id_vars=['opponent'],
    value_name='nearest_neighbors',
)

# Restore 'opponent' as the index, discard melt's helper 'variable' column,
# drop rows with missing values, and sort the rows by opponent. The bare
# expression is left last so that a notebook displays the result.
(
    long_frame
    .set_index(['opponent'])
    .drop('variable', axis=1)
    .dropna()
    .sort_index()
)
Output:
nearest_neighbors
opponent
76ers Zach LaVine
76ers Jeremy Lin
76ers Nate Robinson
76ers Isaia
blazers Zach LaVine
blazers Jeremy Lin
blazers Nate Robinson
blazers Isaia
bobcats Zach LaVine
bobcats Jeremy Lin
bobcats Nate Robinson
bobcats Isaia