python - 分析 Python 的 For 循环中包含的数据帧
问题描述
现在的情况:
我有一个函数,它按二分类目标变量的取值(“1”和“0”)拆分数据,然后遍历所有自变量。对于每个自变量,该函数分别为两个类别(“1”和“0”)估计核密度(KDE),然后计算两条密度曲线的相交面积:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print the overlap area of the class-0 and class-1 KDEs for each feature.

    For every column other than the target, a Gaussian KDE is fitted
    separately to the rows whose target equals 0 and to the rows whose target
    equals 1. Both densities are evaluated on a shared 500-point grid and the
    area under their pointwise minimum is printed, one value per feature.

    Args:
        data: pandas DataFrame holding the feature columns and the target
            column. Rows containing NaN are dropped before processing.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the grid span added on both sides of the grid,
            since a KDE is wider than the data it was fitted on.
        target_variable_name: Column name of the response variable.

    Returns:
        None. The intersection areas are written to stdout.

    Raises:
        KeyError: If the target column is not present in ``data``.
    """
    target = str(target_variable_name)
    data = data.dropna()
    # Every non-target column is analysed; the earlier ``names[:-1]`` slice
    # silently skipped the last feature.
    feature_names = list(data.drop(columns=[target]).columns)
    # ``np.trapz`` was renamed ``np.trapezoid`` in NumPy 2.0; use whichever
    # the installed NumPy provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    for column_name in feature_names:
        x0 = data.loc[data[target] == 0, column_name]
        x1 = data.loc[data[target] == 1, column_name]
        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)

        # Grid bounds: lowest class minimum and lowest class maximum, each
        # widened by the margin below.
        x_min = min(x0.min(), x1.min())
        x_max = min(x0.max(), x1.max())
        dx = margin * (x_max - x_min)
        x = np.linspace(x_min - dx, x_max + dx, 500)

        # The pointwise minimum of the two densities is their overlap.
        inters_x = np.minimum(kde0(x), kde1(x))
        area_inters_x = integrate(inters_x, x)
        print(area_inters_x)
问题: 如果目标变量有 4 个类别(n_classes = 4),函数将如下所示:
def intersection_area(data, bandwidth, margin, target_variable_name):
    """Print the overlap area of the four per-class KDEs for each feature.

    This is the four-class variant: the target is expected to take the values
    0, 1, 2 and 3. For every column other than the target, one Gaussian KDE is
    fitted per class, all four are evaluated on a shared 500-point grid, and
    the area under their pointwise minimum is printed, one value per feature.

    Args:
        data: pandas DataFrame holding the feature columns and the target
            column. Rows containing NaN are dropped before processing.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the grid span added on both sides of the grid,
            since a KDE is wider than the data it was fitted on.
        target_variable_name: Column name of the response variable.

    Returns:
        None. The intersection areas are written to stdout.

    Raises:
        KeyError: If the target column is not present in ``data``.
    """
    class_labels = (0, 1, 2, 3)
    target = str(target_variable_name)
    data = data.dropna()
    # Every non-target column is analysed; the earlier ``names[:-1]`` slice
    # silently skipped the last feature.
    feature_names = list(data.drop(columns=[target]).columns)
    # ``np.trapz`` was renamed ``np.trapezoid`` in NumPy 2.0; use whichever
    # the installed NumPy provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    for column_name in feature_names:
        samples = [
            data.loc[data[target] == label, column_name]
            for label in class_labels
        ]
        kdes = [gaussian_kde(s, bw_method=bandwidth) for s in samples]

        # Grid bounds: lowest class minimum and lowest class maximum. The
        # upper bound now really uses each class's max(); the copy-pasted
        # version mixed in x2.min() and x3.min().
        x_min = min(s.min() for s in samples)
        x_max = min(s.max() for s in samples)
        dx = margin * (x_max - x_min)
        x = np.linspace(x_min - dx, x_max + dx, 500)

        # Each class is evaluated with its own KDE. np.minimum only accepts
        # two input arrays, so the four densities are folded with reduce().
        densities = [kde(x) for kde in kdes]
        inters_x = np.minimum.reduce(densities)
        area_inters_x = integrate(inters_x, x)
        print(area_inters_x)
现在,如果数据集的类别数 n 事先未知怎么办?我正在尝试改进旧代码,使其能够稳健地处理多类数据集:针对每个类别估计各自变量的 KDE,并计算它们的相交面积。但是我卡在了 x = data.loc[data[str(target_name)] == i,str(column_name)]
这一部分:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
def intersection_area(data, bandwidth, margin, target_variable_name):
    """Unfinished n-class attempt: walk every (class, feature) pair.

    For each unique value of the target column and each feature column, the
    class label and the feature name are printed and the feature values
    belonging to that class are selected. The KDE and intersection steps are
    not implemented here; ``bandwidth`` and ``margin`` are accepted but not
    used yet.

    Args:
        data: pandas DataFrame holding the feature columns and the target
            column. Rows containing NaN are dropped before processing.
        bandwidth: Unused in this unfinished version.
        margin: Unused in this unfinished version.
        target_variable_name: Column name of the response variable.

    Returns:
        None. Class labels and feature names are written to stdout.

    Raises:
        KeyError: If the target column is not present in ``data``.
    """
    target = str(target_variable_name)
    # Collect the names of the independent variables (every non-target
    # column; the earlier ``names[:-1]`` slice skipped the last one).
    data = data.dropna()
    feature_names = list(data.drop(columns=[target]).columns)
    # Unique classes of the target. The column is looked up by the name that
    # was passed in, not by an attribute literally called
    # ``target_variable_name``.
    classes = list(data[target].unique())

    # For each unique class, run through the different independent variables.
    for i in classes:
        for column_name in feature_names:
            print(i)  # the class (target value: 0, 1, ..., n)
            print(column_name)  # the variable name to be analysed
            # This is the part where the question's author got stuck: the
            # selection referenced an undefined ``target_name``.
            x = data.loc[data[target] == i, column_name]
供有兴趣复现该问题的人使用的模拟数据集:
from sklearn.datasets import make_classification
# Build a synthetic multi-class dataset for reproducing the problem.
# NOTE: for a binary target, change the `n_classes` argument to 2.
X, y = make_classification(
    n_samples=50000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=4,
    n_clusters_per_class=3,
    class_sep=0.95,
    flip_y=0.2,
    weights=[0.7, 0.2, 0.1],
    shuffle=True,
    random_state=93,
)
# One column per generated feature, named var1 .. var6.
dataset_x = pd.DataFrame({f'var{k + 1}': X[:, k] for k in range(6)})
dataset_y = pd.DataFrame({'target': y})
# Features first, target as the last column.
sample_dataset = pd.concat([dataset_x, dataset_y], axis=1)
print(sample_dataset)
解决方案
可以考虑使用列表推导式,针对目标变量的每个类别构建 x 和 kde 的列表。另外,与其在每次迭代中打印结果,不如将结果汇总到一个数据框中:
def intersection_area_new(data, bandwidth, margin, target_variable_name):
    """Return the overlap area of the per-class KDEs for every feature.

    For each column other than the target, one Gaussian KDE is fitted per
    unique target value, all of them are evaluated on a shared 500-point
    grid, and the area under their pointwise minimum is computed.

    Args:
        data: pandas DataFrame holding the feature columns and the target
            column (in any position). Rows containing NaN are dropped.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the grid span added on both sides of the grid,
            since a KDE is wider than the data it was fitted on.
        target_variable_name: Column name of the response variable.

    Returns:
        A pandas DataFrame with one row per feature and the columns
        ``target`` (the target column name), ``column`` (the feature name)
        and ``intersection`` (the overlap area).

    Raises:
        KeyError: If the target column is not present in ``data``.
    """
    data = data.dropna()
    # Unique classes of the target. The caller-supplied column name is used
    # instead of a hard-coded 'target'.
    classes = data[target_variable_name].unique()
    # Every non-target column is a feature, wherever the target sits; the
    # earlier ``data.columns[:-1]`` assumed the target was the last column.
    feature_names = [
        name for name in data.columns if name != target_variable_name
    ]
    # ``np.trapz`` was renamed ``np.trapezoid`` in NumPy 2.0; use whichever
    # the installed NumPy provides.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    kde_dicts = []
    for column_name in feature_names:
        # Build the per-class samples and their KDEs.
        x_s = [
            data.loc[data[target_variable_name] == i, column_name]
            for i in classes
        ]
        kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

        # Grid bounds: lowest class minimum and lowest class maximum, each
        # widened by the margin below.
        x_min = min(x.min() for x in x_s)
        x_max = min(x.max() for x in x_s)
        dx = margin * (x_max - x_min)
        x_array = np.linspace(x_min - dx, x_max + dx, 500)

        # The pointwise minimum across all class densities is their overlap.
        kde_x_s = [kde(x_array) for kde in kde_s]
        inters_x = np.array(kde_x_s).min(axis=0)
        area_inters_x = integrate(inters_x, x_array)

        kde_dicts.append({'target': target_variable_name,
                          'column': column_name,
                          'intersection': area_inters_x})

    # Explicit columns keep the schema stable even when there are no features.
    return pd.DataFrame(kde_dicts,
                        columns=['target', 'column', 'intersection'])
输出
# Run the generalized function on the mock dataset with the default KDE
# bandwidth (bw_method=None) and a 50% grid margin, then show the first rows.
output = intersection_area_new(
    data=sample_dataset,
    bandwidth=None,
    margin=0.5,
    target_variable_name="target",
)
print(output.head(10))
# target column intersection
# 0 target var1 0.842256
# 1 target var2 0.757190
# 2 target var3 0.676021
# 3 target var4 0.873074
# 4 target var5 0.763626
# 5 target var6 0.868560
推荐阅读
- javascript - 映射对象内部存在的数组仅在条件语句中给出时才起作用
- package - 有什么方法可以从另一个仓库导入一个特定的 lerna 包
- powershell - 使用 .CSV 文件添加 DNS 服务器条目
- amazon-ec2 - EC2 使用 Centos 8 AMI 官方无法扩展 EBS 卷
- python - 将字节数组转换为 Ctype 结构值
- javascript - 在 Laravel 上使用 Jquery 和 Ajax 检查单选按钮时填充的下拉列表
- javascript - 如何解决正则表达式中的易受攻击问题
- python - seaborn:我怎么能告诉它一个分类变量有一个特定的颜色?
- amazon-web-services - Terraform .12 是否可以将标签添加到现有 RDS
- java - JAVA:合并多个属性文件