首页 > 解决方案 > Python KNN 回归

问题描述

我正在尝试使用 KNN 算法来预测汽车的 MPG(每加仑英里数)。我首先清理了数据,划分出测试集和训练集,然后分别编写了未归一化和归一化两个版本的 KNN 函数。现在我想把测试数据逐条传入 KNN 算法,得到所有预测值的列表,再用均方误差来评估这些预测。目前,我没能写出用于传入测试数据的函数。任何指导都将不胜感激!

import pandas as pd
import numpy as np
import math
from google.colab import drive
# Mount Google Drive so the CSV stored under MyDrive can be read.
drive.mount('/content/drive')
pd.set_option('display.max_columns', 100)

# Load the vehicle table and keep only the rows whose fuel type is 'Regular'.
vehicles = pd.read_csv('/content/drive/MyDrive/CS_167/vehicles (2).csv')
subset_cars = vehicles.loc[vehicles["fuelType"] == 'Regular']

# Target column (comb08) plus the three feature columns used by the KNN code.
final_sub = subset_cars.loc[:, ["comb08", "year", "cylinders", "displ"]]

# Per-column flag: does the column contain any missing value?
column_nulls = final_sub.isnull().any()
Cylinder_no_null = final_sub["cylinders"].dropna()
displ_no_null = final_sub["displ"].dropna()

# Rows with a missing value in any of the selected columns are discarded.
pure_data = final_sub.dropna(axis=0)

# Shuffle with a fixed seed so the split is reproducible, then hold out the
# first 500 shuffled rows for testing and train on the remainder.
shuffled_data = pure_data.sample(frac=1, random_state=41)
test_data = shuffled_data.iloc[:500]
train_data = shuffled_data.iloc[500:]

# Independent copies for the distance code, which writes a scratch column.
train_data_euc = train_data.copy()
test_data_euc = test_data.copy()

def Regression_KNN(MPG, train_data_euc, k):
    """Predict a car's MPG as the mean ``comb08`` of its k nearest neighbours.

    Distance is Euclidean over the ``year``, ``cylinders`` and ``displ``
    values.

    Args:
        MPG: Mapping (e.g. a dict) providing ``year``, ``cylinders`` and
            ``displ`` for the car to predict.
        train_data_euc: Training DataFrame with ``year``, ``cylinders``,
            ``displ`` and ``comb08`` columns.
        k: Number of closest training rows to average.

    Returns:
        The mean ``comb08`` of the k closest training rows.

    Side effects:
        Writes (or overwrites) an ``euc_dis`` column on ``train_data_euc``
        holding each row's distance to the query.
    """
    year_gap = MPG['year'] - train_data_euc['year']
    cylinders_gap = MPG['cylinders'] - train_data_euc['cylinders']
    displ_gap = MPG['displ'] - train_data_euc['displ']

    # Stored on the frame itself so the rows can be ordered by distance.
    train_data_euc['euc_dis'] = np.sqrt(
        year_gap ** 2 + cylinders_gap ** 2 + displ_gap ** 2
    )

    nearest_first = train_data_euc.sort_values(by=['euc_dis'])
    k_nearest = nearest_first.head(k)
    return k_nearest['comb08'].mean()

# Feature values of the car whose MPG should be predicted.
MPG = {'year': 2020, 'cylinders': 4, 'displ': 5.2}

# %d prints only the integer part of the predicted mean.
print("The average MPG for this car is: %d" % Regression_KNN(MPG, train_data_euc, 5))

# Work on a separate frame so the raw training data stays untouched. Any
# leftover 'euc_dis' scratch column is dropped instead of being z-scored: it
# only holds distances to one earlier query, it is not a feature, and the KNN
# functions recompute it for every new query.
z_train_copy = train_data_euc.drop(columns=['euc_dis'], errors='ignore')

# Mean and standard deviation of each feature, taken from the training rows.
# They are kept as module-level names so that query points can later be put
# on the same scale.
z_train_year_std = z_train_copy['year'].std()
z_train_year_mean = z_train_copy['year'].mean()

z_train_cylinders_std = z_train_copy['cylinders'].std()
z_train_cylinders_mean = z_train_copy['cylinders'].mean()

z_train_displ_std = z_train_copy['displ'].std()
z_train_displ_mean = z_train_copy['displ'].mean()

# Replace each feature by its z-score: (value - mean) / std.
z_train_copy['year'] = (z_train_copy['year'] - z_train_year_mean) / z_train_year_std
z_train_copy['cylinders'] = (z_train_copy['cylinders'] - z_train_cylinders_mean) / z_train_cylinders_std
z_train_copy['displ'] = (z_train_copy['displ'] - z_train_displ_mean) / z_train_displ_std

def Z_TRAIN_KNN(MPG, z_train_copy, k):
    """Predict MPG as the mean ``comb08`` of the k nearest training rows.

    Distance is Euclidean over the ``year``, ``cylinders`` and ``displ``
    values. The function itself applies no scaling, so the query and the
    training frame must already be on the same scale.

    Args:
        MPG: Mapping (e.g. a dict) providing ``year``, ``cylinders`` and
            ``displ`` for the car to predict.
        z_train_copy: Training DataFrame with ``year``, ``cylinders``,
            ``displ`` and ``comb08`` columns. It is not modified.
        k: Number of closest training rows to average.

    Returns:
        The mean ``comb08`` of the k closest training rows.
    """
    distances = np.sqrt(
        (MPG['year'] - z_train_copy['year']) ** 2
        + (MPG['cylinders'] - z_train_copy['cylinders']) ** 2
        + (MPG['displ'] - z_train_copy['displ']) ** 2
    )

    # assign() returns a new frame, so the caller's DataFrame does not pick
    # up a scratch 'euc_dis' column as a side effect of asking for a
    # prediction.
    ranked = z_train_copy.assign(euc_dis=distances).sort_values(['euc_dis'])
    return ranked.iloc[0:k]['comb08'].mean()

# Feature values of the car to predict, in raw (un-normalised) units.
MPG = {'year': 2020, 'cylinders': 4, 'displ': 5.2}

# z_train_copy holds z-scored features, so the query has to be standardised
# with the same training means and standard deviations. Comparing raw values
# such as year=2020 against z-scores would let that one feature dominate every
# distance.
MPG_z = {
    'year': (MPG['year'] - z_train_year_mean) / z_train_year_std,
    'cylinders': (MPG['cylinders'] - z_train_cylinders_mean) / z_train_cylinders_std,
    'displ': (MPG['displ'] - z_train_displ_mean) / z_train_displ_std,
}

# %d prints only the integer part of the predicted mean.
print("The average MPG for this car is: %d" % Z_TRAIN_KNN(MPG_z, z_train_copy, 5))

def regression_all_kNN(test_data_euc, z_train_data, k):
    """Predict ``comb08`` for every row of the test data with k-nearest-neighbours.

    Each test row is compared with every training row using Euclidean
    distance over ``year``, ``cylinders`` and ``displ``; its prediction is the
    mean ``comb08`` of the k closest training rows. No scaling is applied
    here, so both frames must already be on the same scale.

    Args:
        test_data_euc: DataFrame of rows to predict; needs the ``year``,
            ``cylinders`` and ``displ`` columns.
        z_train_data: Training DataFrame with those three columns plus
            ``comb08``. It is not modified.
        k: Number of neighbours to average; must be at least 1.

    Returns:
        A float Series of predictions carrying the index of
        ``test_data_euc``, one value per test row.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    feature_cols = ['year', 'cylinders', 'displ']
    # Plain arrays avoid pandas index alignment: the test and training rows
    # have different index labels, so Series arithmetic between them would
    # only produce NaN.
    train_features = z_train_data[feature_cols].to_numpy(dtype=float)
    train_targets = z_train_data['comb08'].to_numpy(dtype=float)
    test_features = test_data_euc[feature_cols].to_numpy(dtype=float)

    def _predict_one(query):
        # Mean target of the k training rows closest to one query row.
        distances = np.sqrt(((train_features - query) ** 2).sum(axis=1))
        # A stable sort makes ties resolve to the earlier training row.
        nearest = np.argsort(distances, kind='stable')[:k]
        return train_targets[nearest].mean()

    predictions = [_predict_one(query) for query in test_features]
    return pd.Series(predictions, index=test_data_euc.index, dtype=float)

# Run the 5-nearest-neighbour predictor over the held-out rows, using the raw
# (un-normalised) training split as the neighbour pool.
predictions5NN = regression_all_kNN(
    test_data_euc=test_data,
    z_train_data=train_data,
    k=5,
)

标签: python machine-learning regression knn

解决方案


推荐阅读