python - 在进行预测时为未来的数据框提供什么价值?
问题描述
我有一个数据集“group_by_df”,它有一个“day”列和“o3”。我想在未来做一些预测,例如未来 5 天。我已经使用以下代码在接下来的 5 天内完成了“day”列中的数据框:
days_predicted = 5
rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
df = pd.DataFrame({'day': rng})
df[sensor_name] = group_by_df["o3"] #the current existent values for o3
df[sensor_name][len(group_by_df):] = "" #the future values
问题是我不能用这种格式对“o3”列进行线性回归,因为我要么将其保留为 Nan,要么将其保留为“”。但是线性回归会抛出这个错误:ValueError: could not convert string to float:
我应该给“o3”列什么值,以便我可以做未来的预测?这是更新的代码,我用已知天数的平均值替换了未知的“o3”值。这是一个好方法吗?
import datetime
import datetime as dt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from distributed.deploy.ssh import bcolors
from flask_babel import _
from pandas.plotting import register_matplotlib_converters
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None # default='warn'
register_matplotlib_converters()
def create_columns(data):
data["readable time"] = ""
data["day"] = ""
for i in range(0, len(data)):
data.loc[i, ['readable time']] = datetime.datetime.fromtimestamp(
data["time"][i]).strftime(
'%d/%m/%Y %H:%M:%S')
data.loc[i, ['day']] = datetime.datetime.fromtimestamp(data["time"][i]).strftime(
'%d/%m/%Y')
def calculate_linear_regression(data, sensor_name):
create_columns(data) # from timeseries
data['day'] = pd.to_datetime(data['day'], dayfirst=True)
data = data.sort_values(by=['readable time'])
group_by_df = pd.DataFrame([name, group.mean()[sensor_name]] for name, group in data.groupby('day'))
group_by_df.columns = ['day', sensor_name]
print("group by df ", group_by_df)
group_by_df['day'] = pd.to_datetime(group_by_df['day'])
# initial length of dataframe(before future prediction)
initial_len_df = len(group_by_df)
days_predicted = 5
rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
df = pd.DataFrame({'day': rng})
df[sensor_name] = group_by_df[sensor_name]
df[sensor_name][len(group_by_df):] = group_by_df[sensor_name].mean() # ""
print("after... \n", df)
group_by_df = df
print("GROUP BY DF\n", group_by_df)
group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)
def split(group_by_df):
X = group_by_df[['day']].values
y = group_by_df[[sensor_name]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = split(group_by_df)
def analyse_forecast():
print("MSE linear regression(mean squared error)",
mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
print("r2 score ", r2_score(group_by_df[sensor_name], group_by_df['predicted']))
rmse = np.sqrt(mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
print("RMSE for linear regression=", rmse)
print( "MSE TEST ", mean_squared_error(y_test, group_by_df['predicted'][len(X_train):]))
print("MSE TRAIN ", mean_squared_error(y_train, group_by_df['predicted'][:len(X_train)]))
print("r2 score TEST", r2_score(y_test, group_by_df['predicted'][len(X_train):]))
return mean_squared_error(group_by_df[sensor_name], group_by_df['predicted'])
def calculate_linear_reg():
group_by_df.reset_index(inplace=True)
mse_list = []
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(group_by_df[['day']])
group_by_df['predicted'] = y_pred
mse_list.append(analyse_forecast())
print(group_by_df) # print predicted values
calculate_linear_reg()
return group_by_df, X_train, sensor_name, initial_len_df
def create_figure(group_by_df, X_train, sensor_name, initial_len_df):
print("INITIAL LEN DF IS", initial_len_df)
linear_regression_fig = go.Figure()
# plot predicted values
linear_regression_fig.add_trace(go.Scatter(
x=group_by_df['day'].map(dt.datetime.fromordinal),
y=group_by_df['predicted'],
name=_("Linear Regression"),
mode='lines+markers',
marker=dict(
color=np.where(group_by_df['day'].index < len(X_train), 'red', 'green'))))
# plot actual values
linear_regression_fig.add_trace(go.Scatter(
x=group_by_df['day'].map(dt.datetime.fromordinal)[:initial_len_df],
y=group_by_df[sensor_name][:initial_len_df],
name=_('Actual values'),
mode='lines+markers'))
linear_regression_fig.update_layout(
height=700,
font=dict(color="grey"),
paper_bgcolor='rgba(0,0,0,0)',
title=_('Linear Regression for ') + _(sensor_name),
yaxis_title=_(sensor_name),
xaxis_title=_('Day'),
showlegend=True)
linear_regression_fig.show()
data="https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv"
data = pd.read_csv(data)
group_by_df, X_train, sensor_name, initial_len_df = calculate_linear_regression(data, "o3")
linear_reg_fig = create_figure(group_by_df, X_train, sensor_name, initial_len_df)
``
解决方案
推荐阅读
- php - WordPress add_rewrite_url 用于多个查询变量
- python - lambda 表达式相对于常规函数的优势是什么?
- oracle - 如何有条件地仅连接多个表中的一些?
- python-2.7 - 为什么在 Python 2.7 中写入输出文件时将 ^M 添加到文件的最后一行?
- java - Discord JDA - 如何检查用户邀请了多少用户
- javascript - 获取数据时有没有办法将变量添加到构造函数?
- google-cloud-storage - How to cancel resumable UploadChunk with google-cloud-cpp
- mrtg - 新的 MRTG 安装副本正在制作丑陋的重新缩放图表
- apache-spark-sql - SparkSQL查询计划中的HashAggregate
- android - didCrashOnPreviousExecution() 未清除