首页 > 解决方案 > 在进行预测时为未来的数据框提供什么价值?

问题描述

我有一个数据集“group_by_df”,它有一个“day”列和“o3”。我想在未来做一些预测,例如未来 5 天。我已经使用以下代码在接下来的 5 天内完成了“day”列中的数据框:

days_predicted = 5
    rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df["o3"] #the current existent values for o3
    df[sensor_name][len(group_by_df):] = "" #the future values

问题是我不能用这种格式对“o3”列进行线性回归,因为我要么将其保留为 Nan,要么将其保留为“”。但是线性回归会抛出这个错误:ValueError: could not convert string to float: 我应该给“o3”列什么值,以便我可以做未来的预测?这是更新的代码,我用已知天数的平均值替换了未知的“o3”值。这是一个好方法吗?

import datetime
import datetime as dt

import numpy as np
import pandas as pd
import plotly.graph_objs as go
from distributed.deploy.ssh import bcolors
from flask_babel import _
from pandas.plotting import register_matplotlib_converters
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None  # default='warn'

register_matplotlib_converters()


def create_columns(data):
    data["readable time"] = ""
    data["day"] = ""
    for i in range(0, len(data)):
        data.loc[i, ['readable time']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime(
            '%d/%m/%Y %H:%M:%S')

        data.loc[i, ['day']] = datetime.datetime.fromtimestamp(data["time"][i]).strftime(
            '%d/%m/%Y')


def calculate_linear_regression(data, sensor_name):
    create_columns(data)  # from timeseries
    data['day'] = pd.to_datetime(data['day'], dayfirst=True)
    data = data.sort_values(by=['readable time'])

    group_by_df = pd.DataFrame([name, group.mean()[sensor_name]] for name, group in data.groupby('day'))

    group_by_df.columns = ['day', sensor_name]
    print("group by df ", group_by_df)
    group_by_df['day'] = pd.to_datetime(group_by_df['day'])

    # initial length of dataframe(before future prediction)
    initial_len_df = len(group_by_df)
    days_predicted = 5
    rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')

    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df[sensor_name]
    df[sensor_name][len(group_by_df):] = group_by_df[sensor_name].mean()  # ""

    print("after... \n", df)
    group_by_df = df
    print("GROUP BY DF\n", group_by_df)
    group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)

    def split(group_by_df):
        X = group_by_df[['day']].values
        y = group_by_df[[sensor_name]].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
        return X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = split(group_by_df)

    def analyse_forecast():
        print("MSE linear regression(mean squared error)",
              mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("r2 score ", r2_score(group_by_df[sensor_name], group_by_df['predicted']))
        rmse = np.sqrt(mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("RMSE for linear regression=", rmse)
        print( "MSE TEST ", mean_squared_error(y_test, group_by_df['predicted'][len(X_train):]))
        print("MSE TRAIN ", mean_squared_error(y_train, group_by_df['predicted'][:len(X_train)]))
        print("r2 score TEST", r2_score(y_test, group_by_df['predicted'][len(X_train):]))
        return mean_squared_error(group_by_df[sensor_name], group_by_df['predicted'])

    def calculate_linear_reg():
        group_by_df.reset_index(inplace=True)
        mse_list = []

        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred = model.predict(group_by_df[['day']])
        group_by_df['predicted'] = y_pred
        mse_list.append(analyse_forecast())
        print(group_by_df)  # print predicted values

    calculate_linear_reg()

    return group_by_df, X_train, sensor_name, initial_len_df


def create_figure(group_by_df, X_train, sensor_name, initial_len_df):
    print("INITIAL LEN DF IS", initial_len_df)
    linear_regression_fig = go.Figure()
    # plot predicted values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal),
        y=group_by_df['predicted'],
        name=_("Linear Regression"),
        mode='lines+markers',
        marker=dict(
            color=np.where(group_by_df['day'].index < len(X_train), 'red', 'green'))))
    # plot actual values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal)[:initial_len_df],
        y=group_by_df[sensor_name][:initial_len_df],
        name=_('Actual values'),
        mode='lines+markers'))

    linear_regression_fig.update_layout(
        height=700,
        font=dict(color="grey"),
        paper_bgcolor='rgba(0,0,0,0)',
        title=_('Linear Regression for ') + _(sensor_name),
        yaxis_title=_(sensor_name),
        xaxis_title=_('Day'),
        showlegend=True)
    linear_regression_fig.show()

data="https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv"
data = pd.read_csv(data)


group_by_df, X_train, sensor_name, initial_len_df = calculate_linear_regression(data, "o3")
linear_reg_fig = create_figure(group_by_df, X_train, sensor_name, initial_len_df)

``

标签: pythonpandasdataframeregressionprediction

解决方案


推荐阅读