首页 > 解决方案 > 使用 For 循环生成 Pandas 数据帧

问题描述

我正在尝试生成 500 个 DataFrame,每个学生一个。每个 DataFrame 都将是他们的出勤记录。

(奖励)如何将每个 DataFrame 的名称设置为attendance_i. 其中 i 是学生的 ID。因此,学生 1 的数据帧将是出勤_1,学生 2 的数据帧将是出勤_2,依此类推。

这是代码:

import pandas as pd
import numpy as np
from numpy import random
from datetime import datetime
from datetime import timedelta
from dateutil.rrule import DAILY, rrule, MO, TU, WE, TH, FR, SA, SU
import calendar

# DEFINE GLOBAL VARIABLES
student_count = 500

# DECLARE START AND END DATES
start = '2013-03-01'
end = '2015-07-31'
# GET NUMBER OF WORKDAYS
weekmask = 'Wed Thu Fri Sat Sun'
customworkweek = pd.offsets.CustomBusinessDay(weekmask=weekmask)
workdates = pd.bdate_range(start = start, end = end, freq = customworkweek).to_pydatetime().tolist()
# GET NUMBER OF WEEKS AND MONTHS 
x = pd.to_datetime(end) - pd.to_datetime(start)
workweeks_int = int(x / np.timedelta64(1, 'W'))
workmonths_int = int(x / np.timedelta64(1, 'M'))
# MORE GLOBAL VARAIBLES
wed = pd.offsets.CustomBusinessDay(weekmask='Wed')
thu = pd.offsets.CustomBusinessDay(weekmask='Thu')
fri = pd.offsets.CustomBusinessDay(weekmask='Fri')
sat = pd.offsets.CustomBusinessDay(weekmask='Sat')
sun = pd.offsets.CustomBusinessDay(weekmask='Sun')

wed_classes = pd.bdate_range(start = start, end = end, freq = wed).to_pydatetime().tolist()
thu_classes = pd.bdate_range(start = start, end = end, freq = thu).to_pydatetime().tolist()
fri_classes = pd.bdate_range(start = start, end = end, freq = fri).to_pydatetime().tolist()
sat_classes = pd.bdate_range(start = start, end = end, freq = sat).to_pydatetime().tolist()
sun_classes = pd.bdate_range(start = start, end = end, freq = sun).to_pydatetime().tolist()

# SET UP STUDENT DATAFRAME
student_age = random.randint(4, 13, size = (student_count))
student_sex = random.choice(["male", "female"], size = (student_count))
student_id = list(range(0,student_count))
student_name_local = pd.read_csv('Database\student_name_local.csv')
student_name_english = pd.read_csv('Database\student_name_english.csv')
class_id = random.randint(1, 27, size = (student_count))

column_names = ['STUDENT_ID', 'STUDENT_NAME_LOCAL', 'STUDENT_NAME_ENGLISH', 'STUDENT_AGE', 'STUDENT_SEX']
students = pd.DataFrame(columns = column_names)

students['STUDENT_ID'] = student_id
students['STUDENT_NAME_LOCAL'] = student_name_local
students['STUDENT_NAME_ENGLISH'] = student_name_english
students['STUDENT_AGE'] = student_age
students['STUDENT_SEX'] = student_sex
students['CLASS_ID'] = class_id

# SET UP CLASSES DATAFRAME
data = [['0', 'Wednesday', '16:30:00'], ['1', 'Wednesday', '17:30:00'], ['2', 'Wednesday', '18:30:00'], ['3', 'Thursday', '16:30:00'], ['4', 'Thursday', '17:30:00'], ['5', 'Thursday', '18:30:00'], ['6', 'Friday', '16:30:00'], ['7', 'Friday', '17:30:00'], ['8', 'Friday', '18:30:00'], ['9', 'Saturday', '8:30:00'], ['10', 'Saturday', '9:30:00'], ['11', 'Saturday', '10:30:00'], ['12', 'Saturday', '11:30:00'], ['13', 'Saturday', '13:30:00'], ['14', 'Saturday', '14:30:00'], ['15', 'Saturday', '15:30:00'], ['16', 'Saturday', '16:30:00'], ['17', 'Saturday', '17:30:00'], ['18', 'Sunday', '8:30:00'], ['19', 'Sunday', '9:30:00'], ['20', 'Sunday', '10:30:00'], ['21', 'Sunday', '11:30:00'], ['22', 'Sunday', '13:30:00'], ['23', 'Sunday', '14:30:00'], ['24', 'Sunday', '15:30:00'], ['25', 'Sunday', '16:30:00'], ['26', 'Sunday', '17:30:00']]

column_names = ['CLASS_ID', 'CLASS_DAY', 'CLASS_TIME']
classes = pd.DataFrame(data = data, columns = column_names)
classes['CLASS_ID'] = classes['CLASS_ID'].astype(int)

# FOR LOOP VARIABLES
attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)

# THE FOR LOOP
for i in student_id:
    ids = list([i])*workweeks_int
    column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
    i = pd.DataFrame(columns = column_names)

    i['STUDENT_ID'] = ids
    i['ATTENDANCE_STATUS'] = attendance_status

    x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
    x = x[-1]
    c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
    c = c[-1]

    if c == 'Wednesday':
        i['CLASS_DATE'] = wed_classes
    elif c == 'Thursday':
        i['CLASS_DATE'] = thu_classes
    elif c == 'Friday':
        i['CLASS_DATE'] = fri_classes
    elif c == 'Saturday':
        i['CLASS_DATE'] = sat_classes
    elif c == 'Sunday':
        i['CLASS_DATE'] = sun_classes

这是for循环期间发生的错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-c04451cf4f9b> in <module>
     69     i['ATTENDANCE_STATUS'] = attendance_status
     70 
---> 71     x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
     72     x = x[-1]
     73     c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1766 
   1767             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768             return self._getitem_axis(maybe_callable, axis=axis)
   1769 
   1770     def _is_scalar_access(self, key: Tuple):

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1950 
   1951                 if hasattr(key, "ndim") and key.ndim > 1:
-> 1952                     raise ValueError("Cannot index with multidimensional key")
   1953 
   1954                 return self._getitem_iterable(key, axis=axis)

ValueError: Cannot index with multidimensional key

有趣的是,以下代码在 for 循环之外工作:

attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)
id = 326
i = list([id])*workweeks_int

column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
att1 = pd.DataFrame(columns = column_names)

att1['STUDENT_ID'] = i
att1['ATTENDANCE_STATUS'] = attendance_status

x = students['CLASS_ID'].loc[students['STUDENT_ID'] == id].to_list()
x = x[-1]
c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
c = c[-1]

if c == 'Wednesday':
    att1['CLASS_DATE'] = wed_classes
elif c == 'Thursday':
    att1['CLASS_DATE'] = thu_classes
elif c == 'Friday':
    att1['CLASS_DATE'] = fri_classes
elif c == 'Saturday':
    att1['CLASS_DATE'] = sat_classes
elif c == 'Sunday':
    att1['CLASS_DATE'] = sun_classes

att1

标签: pythonpython-3.xpandasdataframefor-loop

解决方案


推荐阅读