首页 > 解决方案 > 如何获得回归以优化系数和数据数组?Python

问题描述

编辑:31/7/19

我有一个数据集,其中包含 4 个站点的 PGR(牧场增长率)和 Foo(牧场数量)读数,一年中大约 6 周。PGR 和 Foo 之间的关系是反指数的。

我想做的是将几周分成3个批次。PGR 和 Foo 之间具有相似关系的几周将在一起。

组大小不必相同。但是周必须是连续的,即
第一组 - 第 1 周,第 2 周,第 3 周。
第 2 组 - 第 4 周。第 3 组
- 第 5 周,第 6 周。

我想做的是创建 3 个回归来优化以减少平方和,同时优化周选择。

上面的例子表明第 1 - 3 周相似,第 4 周与第 3 周不同,第 5 周和第 6 周彼此相似但与第 4 周不同。(我希望这个分组根据回归自动发生)

下面的代码是我的尝试,但它不起作用(我将其包含在内以帮助更好地解释我正在尝试做的事情)。

data = {'Week':[1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6],
'PGR':[10,29,34.93,32,10,29,34.93,35,31,36,34.93,37,40,46,50,52,40,60,65,68,42,62,65,68],
'Foo': [20,45,102.28,66.79,25,50,90,75,50,75,90,130,50,75,90,130,30,60,105,150,35,60,110,140]}

df = pd.DataFrame(data)


def group(x):
    a, b, c, e, f, g, h, i, j, w, z, y = x

    #below defines the groups, I want z, y & w to be optimised when this function is solverd
    #This determines which weeks are in which groups
    group1 = df.loc[(df['Week'] == range(1,z))]
    group2 = df.loc[(df['Week'] == range(z,y))]
    group3 = df.loc[(df['Week'] == range(y,w))]

    #Once the groups are defined this will extract Foo and PGR values for regressions
    xm1 = group1['Foo'].to_numpy()
    ym1 = group1['PGR'].to_numpy()

    xm2 = group2['Foo'].to_numpy()
    ym2 = group2['PGR'].to_numpy()

    xm3 = group3['Foo'].to_numpy()
    ym3 = group3['PGR'].to_numpy()

    #These are the 3 regressions
    y1 = a + b / xm1 + c * np.log(xm1)
    SSE1 = (y1 - ym1)**2
    y2 = e + f / xm2 + g * np.log(xm2)
    SSE2 = (y2 - ym2) ** 2
    y3 = h + i / xm3 + j * np.log(xm3)
    SSE3 = (y3 - ym3) ** 2
    return SSE1, SSE2, SSE3

#I now have the sum of squares for all the regressions, which I want to minimise
#Minimising can happen by selecting groups that are more similar or by changing the regression coefficients 
def objective(x):
    return np.sum(group(x))

x0 = np.zeros(12)

# bounds for a, b, c,  e, f, g, h, i, j, w, z, y
bndspositive = (0,52)
bnds100 = (-100.0, 100.0)
no_bnds = (-1.0e10, 1.0e10)
bnds = (no_bnds, no_bnds, bnds100, no_bnds, no_bnds, bnds100, no_bnds, no_bnds, bnds100, bndspositive, bndspositive, bndspositive)

# optimise groups and regressions for best fit
solution = minimize(objective, x0, method=None, bounds=bnds)

# solution
x = solution.x

希望这是有道理的,谢谢

标签: pythondataframeoptimizationregression

解决方案


另一种方法是将数据拟合为 3D 表面。我的方程搜索出现“Foo = a * PGR + b * week^2 + Offset”作为可能的候选方程,参数 a = 2.90940013、b = -2.33138779 和 Offset = -10.04234205 产生 RMSE = 22.02 和 R 平方= 0.6338。这是一个带有您的数据和这个等式的图形 Python 拟合器。

散点图

曲面图

等高线图

import numpy, scipy, scipy.optimize
import matplotlib
from mpl_toolkits.mplot3d import  Axes3D
from matplotlib import cm # to colormap 3D surfaces from blue to red
import matplotlib.pyplot as plt

graphWidth = 800 # units are pixels
graphHeight = 600 # units are pixels

# 3D contour plot lines
numberOfContourLines = 16


def SurfacePlot(func, data, fittedParameters):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)

    matplotlib.pyplot.grid(True)
    axes = Axes3D(f)

    x_data = data[0]
    y_data = data[1]
    z_data = data[2]

    xModel = numpy.linspace(min(x_data), max(x_data), 20)
    yModel = numpy.linspace(min(y_data), max(y_data), 20)
    X, Y = numpy.meshgrid(xModel, yModel)

    Z = func(numpy.array([X, Y]), *fittedParameters)

    axes.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm, linewidth=1, antialiased=True)

    axes.scatter(x_data, y_data, z_data) # show data along with plotted surface

    axes.set_title('Surface Plot (click-drag with mouse)') # add a title for surface plot
    axes.set_xlabel('Week') # X axis data label
    axes.set_ylabel('PGR') # Y axis data label
    axes.set_zlabel('Foo') # Z axis data label

    plt.show()
    plt.close('all') # clean up after using pyplot or else there can be memory and process problems


def ContourPlot(func, data, fittedParameters):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
    axes = f.add_subplot(111)

    x_data = data[0]
    y_data = data[1]
    z_data = data[2]

    xModel = numpy.linspace(min(x_data), max(x_data), 20)
    yModel = numpy.linspace(min(y_data), max(y_data), 20)
    X, Y = numpy.meshgrid(xModel, yModel)

    Z = func(numpy.array([X, Y]), *fittedParameters)

    axes.plot(x_data, y_data, 'o')

    axes.set_title('Contour Plot') # add a title for contour plot
    axes.set_xlabel('Week') # X axis data label
    axes.set_ylabel('PGR') # Y axis data label

    CS = matplotlib.pyplot.contour(X, Y, Z, numberOfContourLines, colors='k')
    matplotlib.pyplot.clabel(CS, inline=1, fontsize=10) # labels for contours

    plt.show()
    plt.close('all') # clean up after using pyplot or else there can be memory and process problems


def ScatterPlot(data):
    f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)

    matplotlib.pyplot.grid(True)
    axes = Axes3D(f)
    x_data = data[0]
    y_data = data[1]
    z_data = data[2]

    axes.scatter(x_data, y_data, z_data)

    axes.set_title('Scatter Plot (click-drag with mouse)')
    axes.set_xlabel('Week')
    axes.set_ylabel('PGR')
    axes.set_zlabel('Z Foo')

    plt.show()
    plt.close('all') # clean up after using pyplot or else there can be memory and process problems


def func(data, a, b, Offset):
    x = data[0]
    y = data[1]
    return a*y + b*numpy.square(x) + Offset


if __name__ == "__main__":

    week = [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6]
    PGR = [10,29,34.93,32,10,29,34.93,35,31,36,34.93,37,40,46,50,52,40,60,65,68,42,62,65,68]
    Foo = [20,45,102.28,66.79,25,50,90,75,50,75,90,130,50,75,90,130,30,60,105,150,35,60,110,140]

    xData = numpy.array(week, dtype=numpy.float32)
    yData = numpy.array(PGR, dtype=numpy.float32)
    zData = numpy.array(Foo, dtype=numpy.float32)

    data = [xData, yData, zData]

    initialParameters = [1.0, 1.0, 1.0] # these are the same as scipy default values in this example

    # here a non-linear surface fit is made with scipy's curve_fit()
    fittedParameters, pcov = scipy.optimize.curve_fit(func, [xData, yData], zData, p0 = initialParameters)

    ScatterPlot(data)
    SurfacePlot(func, data, fittedParameters)
    ContourPlot(func, data, fittedParameters)

    print('fitted prameters', fittedParameters)

    modelPredictions = func(data, *fittedParameters) 

    absError = modelPredictions - zData

    SE = numpy.square(absError) # squared errors
    MSE = numpy.mean(SE) # mean squared errors
    RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
    Rsquared = 1.0 - (numpy.var(absError) / numpy.var(zData))
    print('RMSE:', RMSE)
    print('R-squared:', Rsquared)

推荐阅读