I get there are many solutions on here, but I have literally gone through them all and still can't fix my problem. Im trying to print the rSquared for each graph but I am getting the error 'the local variable "m" referenced before assignment. Please help! I know the spacing is off on here. I have all of that right in my runner. Thanks!
def readData(fileName):
hsGPA = [] #High School GPA
mathSAT = [] #Math SAT scores
crSAT = [] #Verbal SAT scores
collegeGPA = [] #College GPA
compGPA=[] #ComputerScience GPA
FullList=[]
inputFile = open(fileName, 'r', encoding = 'utf-8')
for line in inputFile:
FullList=line.split(',')
hsGPA.append(float(FullList[0]))
mathSAT.append(int(FullList[1]))
crSAT.append(int(FullList[2]))
collegeGPA.append(float(FullList[3]))
compGPA.append(float(FullList[4]))
return hsGPA, mathSAT, crSAT, collegeGPA, compGPA
def plotData(hsGPA, mathSAT, crSAT, collegeGPA,compGPA):
GPA1 = [] #High School GPA
Score1 = [] #Math SAT scores
Score2= [] #Verbal SAT scores
GPA2 = [] #College GPA
GPA3=[] #ComputerScience GPA
hsGPA, mathGPA, crSAT, collegeGPA,compGPA = readData('satFINAL.txt')
pyplot.figure(1)
pyplot.subplot(5,1,1)
for line in range(len(hsGPA)):
GPA1.append(line)
pyplot.plot(GPA1,hsGPA)
pyplot.subplot(5,1,2)
for line in range(len(mathSAT)):
Score1.append(line)
pyplot.plot(Score1,mathSAT)
pyplot.subplot(5,1,3)
for line in range(len(crSAT)):
Score2.append(line)
pyplot.plot(Score2,crSAT)
pyplot.subplot(5,1,4)
for line in range(len(collegeGPA)):
GPA2.append(line)
pyplot.plot(GPA2,collegeGPA)
pyplot.subplot(5,1,5)
for line in range(len(compGPA)):
GPA3.append(line)
pyplot.plot(GPA3,compGPA)
pyplot.show()
def LinearRegression(xList, yList):
'''
This function finds the constants in the y = mx+b, or linear regression
forumula
xList - a list of the x values
yList - a list of the y values
m - the slope f the line
b - where the line intercepts the y axis
'''
n = len(xList)
sumX = 0
sumXX = 0
sumXY = 0
sumY = 0
for index in range(n):
sumX += xList[index]
sumXY += xList[index] * yList[index]
sumXX += xList[index]**2
sumY += yList[index]
#the components needed to find m and b
m = (n*(sumXY - (sumX*sumY)))/(n*(sumXX - (sumX**2)))
b = (sumY - (m*sumX))/n
#actually implements formula
return m, b
def plotRegression(x,y, xLabel, yLabel):
pyplot.scatter(x,y)
m,b = LinearRegression(x,y)
minX = min(x)
maxX = max(x)
pyplot.plot([minX, maxX], [m * minX + b, m * maxX + b], color ='red')
pyplot.xlabel(xLabel)
pyplot.ylabel(yLabel)
pyplot.show()
def rSquared(x,y):
n = len(x)
R=0
sumS=0
sumT=0
sumY=0
for index in range(n):
a=(y[index]-((m*x[index])+b))**2
sumS = sumS+a
for index in range(len(y)):
sumY = sumY= y[index]
MeanY= sumY/(len(y))
e=(y[index]-MeanY)**2
sumT = sumT+e
m,b= LinearRegression(x, y)
RG=1-(sumS/sumT)
def main():
data = readData('satFINAL.txt')
print(data)
plotData(*data)
hsGPA, mathSAT, crSAT, collegeGPA,compGPA = data
# added ScoreT calculation here
ScoreT = [sum(x) for x in zip(mathSAT, crSAT)]
plotRegression(hsGPA,collegeGPA, 'highGPA', 'collegeGPA')
plotRegression(mathSAT,collegeGPA, 'mathSAT' , 'collegeGPA')
plotRegression(crSAT,collegeGPA, 'crSAT' , 'collegeGPA')
plotRegression(ScoreT,collegeGPA, 'Math and CR SAT' , 'collegeGPA')
plotRegression(mathSAT,crSAT, 'mathSAT', 'CR SAT')
plotRegression(mathSAT,compGPA, 'mathSAT', 'CompGPA')
plotRegression(hsGPA,compGPA, 'HsGPA', 'CompGPA')
plotRegression(ScoreT,compGPA, 'SATscore ', 'CompGPA')
print(rSquared(hsGPA,collegeGPA))
main()
It's very hard to tell - your indentation is messed up, and you've got an awful lot of code, and you haven't actually given the error trace (which would actually identify the line the error is on!) - but it looks like, in the definition of rSquared
, you call a=(y[index]-((m*x[index])+b))**2
before assigning a value to m
.
Edit : I went through and refactored a lot of your repeated code into loops; it is hopefully more readable now. I also cross-checked the linear_regression
function against scipy.stats.linregress
and got identical results; I have not verified r_squared
, so you should check that.
import matplotlib.pyplot as plt
# column indices
HS, MATH, VERBAL, COLLEGE, COMPSCI = range(5)
# column labels
LABELS = ["High school GPA", "Math SAT", "Verbal SAT", "College GPA", "CompSci GPA"]
# column data types
DTYPES = [ float, int, int, float, float ]
def read_columns(fname, encoding="utf-8", separator=",", dtypes=None):
"""
Return columns of data from a file
If dtypes is specified, convert each column to the given data type
"""
# read rows from data file
with open(fname, encoding=encoding) as inf:
rows = [line.split(separator) for line in inf]
# transpose to columns
cols = zip(*rows)
# apply data types
if dtypes is not None:
cols = [[dtype(cell) for cell in col] for dtype,col in zip(dtypes,cols)]
return cols
def linear_regression(xs, ys):
"""
Return the linear regression constants m,b
in the least-squares best fit to y = m*x+b
"""
# if you have SciPy you can use scipy.stats.linregress instead
n = len(xs)
xsum = sum(xs)
ysum = sum(ys)
xxsum = sum(x*x for x in xs)
xysum = sum(x*y for x,y in zip(xs, ys))
m = (n * xysum - xsum * ysum) / (n * xxsum - xsum * xsum)
b = (ysum - m * xsum) / n
return m, b
def r_squared(xs, ys):
m, b = linear_regression(xs, ys)
ysum, n = sum(ys), len(ys)
ymean = ysum / n
ssum = sum((y - (m * x + b))**2 for x,y in zip(xs, ys))
tsum = sum((y - ymean)**2 for y in ys)
return 1 - ssum / tsum
def plot_regression(xs, xlabel, ys, ylabel):
m, b = linear_regression(xs, ys)
min_, max_ = min(xs), max(xs)
plt.scatter(xs, ys)
plt.plot([min_, max_], [m * min_ + b, m * max_ + b], "r")
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.show()
def main():
# read data
scores = read_columns("satFINAL.txt", dtypes=DTYPES)
# add composite math-and-verbal score
MATH_VERBAL = 5
LABELS.append("Math+Verbal SAT")
DTYPES.append(int)
scores.append([math+verbal for math,verbal in zip(scores[MATH], scores[VERBAL])])
# do raw score plots
plt.figure(1)
num_figs = len(LABELS)
# draw subplots
for fig, column, nums in zip(range(num_figs), LABELS, scores):
plt.subplot(num_figs, 1, fig+1)
plt.plot(range(len(nums)), nums)
plt.xlabel(LABELS[fig])
# display results
plt.show()
# do regression plots
regressions = [
(HS, COLLEGE),
(MATH, COLLEGE),
(VERBAL, COLLEGE),
(MATH_VERBAL, COLLEGE),
(MATH, VERBAL),
(MATH, COMPSCI),
(HS, COMPSCI),
(MATH_VERBAL, COMPSCI)
]
for x,y in regressions:
print("r**2 for {} and {}: {}".format(LABELS[x], LABELS[y], r_squared(scores[x], scores[y])))
plot_regression(scores[x], LABELS[x], scores[y], LABELS[y])
if __name__=="__main__":
main()
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.