python - 使用 For 循环生成 Pandas 数据帧
问题描述
我正在尝试生成 500 个 DataFrame,每个学生一个。每个 DataFrame 都将是他们的出勤记录。
(奖励)如何将每个 DataFrame 的名称设置为attendance_i
. 其中 i 是学生的 ID。因此,学生 1 的数据帧将是出勤_1,学生 2 的数据帧将是出勤_2,依此类推。
这是代码:
import pandas as pd
import numpy as np
from numpy import random
from datetime import datetime
from datetime import timedelta
from dateutil.rrule import DAILY, rrule, MO, TU, WE, TH, FR, SA, SU
import calendar
# DEFINE GLOBAL VARIABLES
student_count = 500
# DECLARE START AND END DATES
start = '2013-03-01'
end = '2015-07-31'
# GET NUMBER OF WORKDAYS
weekmask = 'Wed Thu Fri Sat Sun'
customworkweek = pd.offsets.CustomBusinessDay(weekmask=weekmask)
workdates = pd.bdate_range(start = start, end = end, freq = customworkweek).to_pydatetime().tolist()
# GET NUMBER OF WEEKS AND MONTHS
x = pd.to_datetime(end) - pd.to_datetime(start)
workweeks_int = int(x / np.timedelta64(1, 'W'))
workmonths_int = int(x / np.timedelta64(1, 'M'))
# MORE GLOBAL VARAIBLES
wed = pd.offsets.CustomBusinessDay(weekmask='Wed')
thu = pd.offsets.CustomBusinessDay(weekmask='Thu')
fri = pd.offsets.CustomBusinessDay(weekmask='Fri')
sat = pd.offsets.CustomBusinessDay(weekmask='Sat')
sun = pd.offsets.CustomBusinessDay(weekmask='Sun')
wed_classes = pd.bdate_range(start = start, end = end, freq = wed).to_pydatetime().tolist()
thu_classes = pd.bdate_range(start = start, end = end, freq = thu).to_pydatetime().tolist()
fri_classes = pd.bdate_range(start = start, end = end, freq = fri).to_pydatetime().tolist()
sat_classes = pd.bdate_range(start = start, end = end, freq = sat).to_pydatetime().tolist()
sun_classes = pd.bdate_range(start = start, end = end, freq = sun).to_pydatetime().tolist()
# SET UP STUDENT DATAFRAME
student_age = random.randint(4, 13, size = (student_count))
student_sex = random.choice(["male", "female"], size = (student_count))
student_id = list(range(0,student_count))
student_name_local = pd.read_csv('Database\student_name_local.csv')
student_name_english = pd.read_csv('Database\student_name_english.csv')
class_id = random.randint(1, 27, size = (student_count))
column_names = ['STUDENT_ID', 'STUDENT_NAME_LOCAL', 'STUDENT_NAME_ENGLISH', 'STUDENT_AGE', 'STUDENT_SEX']
students = pd.DataFrame(columns = column_names)
students['STUDENT_ID'] = student_id
students['STUDENT_NAME_LOCAL'] = student_name_local
students['STUDENT_NAME_ENGLISH'] = student_name_english
students['STUDENT_AGE'] = student_age
students['STUDENT_SEX'] = student_sex
students['CLASS_ID'] = class_id
# SET UP CLASSES DATAFRAME
data = [['0', 'Wednesday', '16:30:00'], ['1', 'Wednesday', '17:30:00'], ['2', 'Wednesday', '18:30:00'], ['3', 'Thursday', '16:30:00'], ['4', 'Thursday', '17:30:00'], ['5', 'Thursday', '18:30:00'], ['6', 'Friday', '16:30:00'], ['7', 'Friday', '17:30:00'], ['8', 'Friday', '18:30:00'], ['9', 'Saturday', '8:30:00'], ['10', 'Saturday', '9:30:00'], ['11', 'Saturday', '10:30:00'], ['12', 'Saturday', '11:30:00'], ['13', 'Saturday', '13:30:00'], ['14', 'Saturday', '14:30:00'], ['15', 'Saturday', '15:30:00'], ['16', 'Saturday', '16:30:00'], ['17', 'Saturday', '17:30:00'], ['18', 'Sunday', '8:30:00'], ['19', 'Sunday', '9:30:00'], ['20', 'Sunday', '10:30:00'], ['21', 'Sunday', '11:30:00'], ['22', 'Sunday', '13:30:00'], ['23', 'Sunday', '14:30:00'], ['24', 'Sunday', '15:30:00'], ['25', 'Sunday', '16:30:00'], ['26', 'Sunday', '17:30:00']]
column_names = ['CLASS_ID', 'CLASS_DAY', 'CLASS_TIME']
classes = pd.DataFrame(data = data, columns = column_names)
classes['CLASS_ID'] = classes['CLASS_ID'].astype(int)
# FOR LOOP VARIABLES
attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)
# THE FOR LOOP
for i in student_id:
ids = list([i])*workweeks_int
column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
i = pd.DataFrame(columns = column_names)
i['STUDENT_ID'] = ids
i['ATTENDANCE_STATUS'] = attendance_status
x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
x = x[-1]
c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
c = c[-1]
if c == 'Wednesday':
i['CLASS_DATE'] = wed_classes
elif c == 'Thursday':
i['CLASS_DATE'] = thu_classes
elif c == 'Friday':
i['CLASS_DATE'] = fri_classes
elif c == 'Saturday':
i['CLASS_DATE'] = sat_classes
elif c == 'Sunday':
i['CLASS_DATE'] = sun_classes
这是for循环期间发生的错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-4-c04451cf4f9b> in <module>
69 i['ATTENDANCE_STATUS'] = attendance_status
70
---> 71 x = students['CLASS_ID'].loc[students['STUDENT_ID'] == i].to_list()
72 x = x[-1]
73 c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
~\anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1766
1767 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768 return self._getitem_axis(maybe_callable, axis=axis)
1769
1770 def _is_scalar_access(self, key: Tuple):
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1950
1951 if hasattr(key, "ndim") and key.ndim > 1:
-> 1952 raise ValueError("Cannot index with multidimensional key")
1953
1954 return self._getitem_iterable(key, axis=axis)
ValueError: Cannot index with multidimensional key
有趣的是,以下代码在 for 循环之外工作:
attendance_status = random.choice(["present", "absent"], p = [0.95, 0.05], size = workweeks_int)
id = 326
i = list([id])*workweeks_int
column_names = ['STUDENT_ID', 'CLASS_DATE', 'ATTENDANCE_STATUS']
att1 = pd.DataFrame(columns = column_names)
att1['STUDENT_ID'] = i
att1['ATTENDANCE_STATUS'] = attendance_status
x = students['CLASS_ID'].loc[students['STUDENT_ID'] == id].to_list()
x = x[-1]
c = classes['CLASS_DAY'].loc[classes['CLASS_ID'] == x].to_list()
c = c[-1]
if c == 'Wednesday':
att1['CLASS_DATE'] = wed_classes
elif c == 'Thursday':
att1['CLASS_DATE'] = thu_classes
elif c == 'Friday':
att1['CLASS_DATE'] = fri_classes
elif c == 'Saturday':
att1['CLASS_DATE'] = sat_classes
elif c == 'Sunday':
att1['CLASS_DATE'] = sun_classes
att1
解决方案
推荐阅读
- javascript - 使用 AJV 要求某些元素存在于数组中
- python - 暂停递归深度优先搜索
- javascript - 简单频率排序 - Codewars 挑战 - JavaScript
- javascript - javascript减少返回未定义
- git - http.sslVerify=false 不会在 git config 中禁用 ssl 验证
- sql - Oracle Sql groupBy 分别在 2 列上
- python - Tkinter:选中时用于背景颜色的单选按钮
- apache-spark - 在 DBT 管道中使用外部 parquet 表
- javascript - TinyMCE 无法读取视频标签
- prisma - 如何使用从 prisma1 到 prisma2 的分页,使用 Connection