How to correct my over fitted spline curve in python

Question

I have a data set and I've been asked to fit a smooth curve over its bar graph. However, the curve I created looks very overfitted. They asked me for some sort of curve that joins some of the adjacent minimums and maximums, and I don't know how to do that. Any help would be appreciated.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import spline



# Create a single figure/axes pair and resize the figure to 13 x 7 inches.
fig, ax = plt.subplots()
fig.set_size_inches(13,7, forward=True)

# Bar width handed to plt.bar further down.
width=1.0



data=pd.DataFrame({'x':[-29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
                   'y':[0.002383790226460071, 0.002383790226460071, 0.005164878823996822, 0.004370282081843465, 0.005164878823996822, 0.002383790226460071, 0.003178386968613429, 0.005959475566150178, 0.005959475566150178, 0.006754072308303536, 0.007945967421533572, 0.003575685339690107, 0.001986491855383393, 0.003575685339690107, 0.003972983710766786, 0.00278108859753675, 0.005959475566150178, 0.007151370679380214, 0.004767580452920143, 0.005164878823996822, 0.003575685339690107, 0.004370282081843465, 0.007945967421533572, 0.01311084624553039, 0.01072705601907032, 0.01430274135876043, 0.01231624950337704, 0.01589193484306714, 0.02264600715137068, 0.09654350417163289, 0.05164878823996821, 0.0166865315852205, 0.01549463647199046, 0.01350814461660707, 0.01191895113230036, 0.01191895113230036, 0.00874056416368693, 0.01152165276122368, 0.007151370679380214, 0.009137862534763607, 0.006356773937226857, 0.007151370679380214, 0.00834326579261025, 0.006356773937226857, 0.005562177195073501, 0.006754072308303536, 0.005164878823996822, 0.005164878823996822, 0.005959475566150178, 0.004767580452920143, 0.00278108859753675, 0.007945967421533572, 0.001589193484306714, 0.00278108859753675, 0.003178386968613429, 0.003575685339690107, 0.003178386968613429, 0.004370282081843465, 0.005562177195073501, 0.004370282081843465]})

# Draw the raw values as red bars.
plt.bar(data['x'],data['y'],width, color='r',  alpha=0.95)
x=data['x']
y=data['y']

# 300 evenly spaced sample positions spanning the x range, used to draw the curve.
x_new = np.linspace(x.min(),x.max(),300)

# Evaluate a cubic (order=3) spline of the data at the sample positions.
# NOTE(review): scipy.interpolate.spline was deprecated in SciPy 0.19 and
# removed in SciPy 1.3, so this line only runs on older SciPy releases.
# It interpolates the points rather than smoothing them, which is presumably
# why the resulting curve looks overfitted - confirm against the plot.
y_smooth =spline(x,y,x_new, order=3,kind='smoothest')

# Overlay the spline curve in blue on top of the bars.
plt.plot(x_new,y_smooth, color='b')

Answer 1

Here is my fitting and plotting code for a variation of the Lorentzian peak equation, found through an equation search over more than 80 peak equations. This code uses scipy's differential_evolution genetic algorithm to estimate initial parameters for the non-linear solver in curve_fit(). That scipy genetic algorithm module uses the Latin Hypercube algorithm to ensure a thorough search of parameter space and requires bounds within which to search; here those bounds are taken from the data max and min values.

import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import differential_evolution
import warnings

xData = numpy.array([-29.0, -28.0, -27.0, -26.0, -25.0, -24.0, -23.0, -22.0, -21.0, -20.0, -19.0, -18.0, -17.0, -16.0, -15.0, -14.0, -13.0, -12.0, -11.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0])
yData = numpy.array([0.002383790226460071, 0.002383790226460071, 0.005164878823996822, 0.004370282081843465, 0.005164878823996822, 0.002383790226460071, 0.003178386968613429, 0.005959475566150178, 0.005959475566150178, 0.006754072308303536, 0.007945967421533572, 0.003575685339690107, 0.001986491855383393, 0.003575685339690107, 0.003972983710766786, 0.00278108859753675, 0.005959475566150178, 0.007151370679380214, 0.004767580452920143, 0.005164878823996822, 0.003575685339690107, 0.004370282081843465, 0.007945967421533572, 0.01311084624553039, 0.01072705601907032, 0.01430274135876043, 0.01231624950337704, 0.01589193484306714, 0.02264600715137068, 0.09654350417163289, 0.05164878823996821, 0.0166865315852205, 0.01549463647199046, 0.01350814461660707, 0.01191895113230036, 0.01191895113230036, 0.00874056416368693, 0.01152165276122368, 0.007151370679380214, 0.009137862534763607, 0.006356773937226857, 0.007151370679380214, 0.00834326579261025, 0.006356773937226857, 0.005562177195073501, 0.006754072308303536, 0.005164878823996822, 0.005164878823996822, 0.005959475566150178, 0.004767580452920143, 0.00278108859753675, 0.007945967421533572, 0.001589193484306714, 0.00278108859753675, 0.003178386968613429, 0.003575685339690107, 0.003178386968613429, 0.004370282081843465, 0.005562177195073501, 0.004370282081843465])


def LorentzianPeakG_Offset(x_in, a, b, c, Offset): # from zunzun.com peak equation search
    """Evaluate a Lorentzian peak with a constant vertical offset.

    Computes ``a / (1 + ((x_in - b) / c) ** 2) + Offset``.

    x_in   -- position(s) at which to evaluate; a scalar or a numpy array
    a      -- numerator of the peak term (the peak height above Offset at x_in == b)
    b      -- x position of the peak centre
    c      -- divisor applied to the distance from the centre (sets the peak width)
    Offset -- constant added to every value
    """
    scaled_distance = (x_in - b) / c
    peak = a / (1.0 + numpy.power(scaled_distance, 2.0))
    return peak + Offset

# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    """Return the sum of squared residuals of the Lorentzian model on the data.

    This is the objective that the genetic algorithm minimizes.

    parameterTuple -- sequence unpacked positionally into the model's
                      (a, b, c, Offset) parameters

    Returns numpy.sum((yData - model(xData)) ** 2).
    """
    # Trial parameter sets may trigger numerical warnings while the genetic
    # algorithm explores the search box. Silence them for this evaluation
    # only: a bare warnings.filterwarnings("ignore") here would install a
    # process-wide filter and hide unrelated warnings for the rest of the run.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        val = LorentzianPeakG_Offset(xData, *parameterTuple)
        return numpy.sum((yData - val) ** 2.0)


def generate_Initial_Parameters():
    """Estimate starting parameters for the fit with differential evolution.

    The search box is derived from the extents of xData and yData; the
    returned value is the best parameter vector found (``result.x``), in
    the order (a, b, c, Offset).
    """
    # data extents used to build the search box
    x_high, x_low = max(xData), min(xData)
    y_high = max(yData)

    # NOTE(review): a, b and c all share the x-range as their bounds, exactly
    # as in the original code - confirm this is intended for the amplitude a.
    search_box = [
        [x_low, x_high],  # search bounds for a
        [x_low, x_high],  # search bounds for b
        [x_low, x_high],  # search bounds for c
        [0.0, y_high],    # search bounds for Offset
    ]

    # a fixed seed keeps the genetic algorithm's result repeatable
    return differential_evolution(sumOfSquaredError, search_box, seed=3).x

# starting point for the non-linear solver, found by the genetic algorithm
geneticParameters = generate_Initial_Parameters()

# refine the starting point with a least-squares fit of the test data
fittedParameters, pcov = curve_fit(LorentzianPeakG_Offset, xData, yData, p0=geneticParameters)

print('Parameters', fittedParameters)

# model values and residuals at the measured x positions
modelPredictions = LorentzianPeakG_Offset(xData, *fittedParameters)
absError = modelPredictions - yData

# goodness-of-fit statistics
SE = absError ** 2 # squared errors
MSE = SE.mean() # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (absError.var() / yData.var())

print('RMSE:', RMSE)
print('R-squared:', Rsquared)

print()


##########################################################
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
    """Plot the raw data and the fitted model curve, then show the figure.

    graphWidth  -- figure width; divided by 100 to get inches at dpi=100
    graphHeight -- figure height; divided by 100 to get inches at dpi=100
    """
    size_in_inches = (graphWidth/100.0, graphHeight/100.0)
    figure = plt.figure(figsize=size_in_inches, dpi=100)
    axes = figure.add_subplot(111)

    # raw data first, drawn with diamond markers
    axes.plot(xData, yData,  'D')

    # fitted equation sampled at 250 points across the data's x range
    xModel = numpy.linspace(min(xData), max(xData), 250)
    axes.plot(xModel, LorentzianPeakG_Offset(xModel, *fittedParameters))

    # axis labels
    axes.set_xlabel('X Data')
    axes.set_ylabel('Y Data')

    plt.show()
    plt.close('all') # release pyplot's figures once the window is closed

# figure size handed to the plotting function (800 x 600)
graphWidth, graphHeight = 800, 600
ModelAndScatterPlot(graphWidth, graphHeight)

This is the result:

Answer 2

This is the answer that I ended up with:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline



# Create a single figure/axes pair and resize the figure to 13 x 7 inches.
fig, ax = plt.subplots()
fig.set_size_inches(13,7, forward=True)

# Bar width used for the bar chart.
width=1.0

#data=pd.read_excel("h:/projects/psc/output/data.xlsx")

data=pd.DataFrame({'x':[-29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
                   'y':[0.002383790226460071, 0.002383790226460071, 0.005164878823996822, 0.004370282081843465, 0.005164878823996822, 0.002383790226460071, 0.003178386968613429, 0.005959475566150178, 0.005959475566150178, 0.006754072308303536, 0.007945967421533572, 0.003575685339690107, 0.001986491855383393, 0.003575685339690107, 0.003972983710766786, 0.00278108859753675, 0.005959475566150178, 0.007151370679380214, 0.004767580452920143, 0.005164878823996822, 0.003575685339690107, 0.004370282081843465, 0.007945967421533572, 0.01311084624553039, 0.01072705601907032, 0.01430274135876043, 0.01231624950337704, 0.01589193484306714, 0.02264600715137068, 0.09654350417163289, 0.05164878823996821, 0.0166865315852205, 0.01549463647199046, 0.01350814461660707, 0.01191895113230036, 0.01191895113230036, 0.00874056416368693, 0.01152165276122368, 0.007151370679380214, 0.009137862534763607, 0.006356773937226857, 0.007151370679380214, 0.00834326579261025, 0.006356773937226857, 0.005562177195073501, 0.006754072308303536, 0.005164878823996822, 0.005164878823996822, 0.005959475566150178, 0.004767580452920143, 0.00278108859753675, 0.007945967421533572, 0.001589193484306714, 0.00278108859753675, 0.003178386968613429, 0.003575685339690107, 0.003178386968613429, 0.004370282081843465, 0.005562177195073501, 0.004370282081843465]})


# Draw the raw values as red bars.
plt.bar(data['x'],data['y'],width, color='r',  alpha=0.95)

x=data['x']
y=data['y']
# 300 evenly spaced sample positions spanning the x range, used to draw the curve.
x_new = np.linspace(x.min(),x.max(),300)
# Fit a smoothing spline instead of an interpolating one: the smoothing
# factor bounds the sum of squared residuals, so the curve no longer has to
# pass through every bar. Raising the factor gives a smoother curve.
spl=UnivariateSpline(x,y)
spl.set_smoothing_factor(0.001)
y_smooth =spl(x_new)
# Overlay the smoothed curve in blue on top of the bars.
plt.plot(x_new,y_smooth, color='b', alpha=0.95)

enter image description here enter image description here

How to correct my over fitted spline curve in python

Question

2 answers

solution1
0 2018-09-26 23:04:25

solution2
0 ACCEPTED 2018-10-02 16:59:11

How to correct my over fitted spline curve in python

Question

2 answers

solution1 0 2018-09-26 23:04:25

solution2 0 ACCEPTED 2018-10-02 16:59:11

solution1
0 2018-09-26 23:04:25

solution2
0 ACCEPTED 2018-10-02 16:59:11