Delete studentdata

sofieaasheim · Nov 8, 2020 · 891b5b8 · 891b5b8
1 parent 4d90c1e
commit 891b5b8
Show file tree

Hide file tree

Showing 8 changed files with 41 additions and 1,207 deletions.
diff --git a/__pycache__/app.cpython-38.pyc b/__pycache__/app.cpython-38.pyc
diff --git a/app.py b/app.py
@@ -10,7 +10,7 @@
 import dash_html_components as html
 
 external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
-data_url = 'https://raw.githubusercontent.com/sofieaasheim/supervised-learning-project/test/data/life-expectancy.csv'
+data_url = 'https://raw.githubusercontent.com/sofieaasheim/supervised-learning-project/main/data/life-expectancy.csv'
 
 app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
 

diff --git a/data/student-mat.csv b/data/student-mat.csv
diff --git a/data/student-por.csv b/data/student-por.csv
diff --git a/data/student.txt b/data/student.txt
diff --git a/tests/dataimports.py b/tests/dataimports.py
diff --git a/tests/linear_reg.py b/tests/linear_reg.py
@@ -1,81 +1,42 @@
-#Import Library
-import numpy as np
 import pandas as pd
+import numpy as np
+import plotly.graph_objects as go
 from sklearn import linear_model
-import sklearn
-from sklearn.utils import shuffle
-import matplotlib.pyplot as plt
-from matplotlib import style
-import pickle
-
-style.use("ggplot")
-
-# Loading data 
-data = pd.read_csv("./supervised-learning-project/data/student-mat.csv", sep=";")
-
-# Trimming data 
-data = data[["freetime", "age", "health", "Dalc", "Walc", "Medu", "Fedu", "G3"]]
-data = shuffle(data) # Optional - shuffle the data
-
-# Separating data 
-predict = "G3" # = respons
-
-x = np.array(data.drop([predict], 1)) # parameters
-y = np.array(data[predict]) # respons
-
-# Splitting in testing and training sets 
-x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.1)
-
-
-# TRAIN MODEL MULTIPLE TIMES FOR BEST SCORE
-best = 0
-for _ in range(20):
-    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.1)
-
-    # Implementing linear regression 
-    linear = linear_model.LinearRegression()
-
-    linear.fit(x_train, y_train)
-    acc = linear.score(x_test, y_test)
-    print("Accuracy: " + str(acc))
-
-    if acc > best:
-        best = acc
-        with open("studentgrades.pickle", "wb") as f: # Saving the model if it has a better score than one we've already trained
-            pickle.dump(linear, f)
-
-print("Best accuracy:")
-print(best)
-
-
-# LOAD MODEL
-pickle_in = open("studentgrades.pickle", "rb")
-linear = pickle.load(pickle_in)
-
-
-print("-------------------------")
-print('Coefficient: \n', linear.coef_) # each slope value
-print('Intercept: \n', linear.intercept_) 
-print("-------------------------")
-
-# List of all predictions
-print("List of predictions:")
-predicted = linear.predict(x_test)
-#predicted_data = list[]
-for x in range(len(predicted)):
-    print(predicted[x], x_test[x], y_test[x])
-    #predicted_data.append(predicted[x])
-#printe
-
-# Drawing and plotting model
-plot = "Fedu"
-plt.scatter(data[plot], data["G3"])
-plt.legend(loc=4)
-plt.xlabel(plot)
-plt.ylabel("Final Grade")
-plt.show()
-
-
-# Plotting predicted grade against grade
-
-#plt.scatter(predicted)
+import statsmodels.api as sm
+
+# Load the full life-expectancy data set (the path is relative to the current working directory)
+df = pd.read_csv("../data/life-expectancy.csv", sep=",")
+df.drop(['Country', 'Year', 'Status'], axis=1, inplace=True)  # removed in place; presumably the non-numeric columns - TODO confirm
+df_regr = df[np.isfinite(df).all(1)]  # keep only the rows in which every value is finite
+
+# Independent variables (X) and dependent variable (y), both rounded to two decimals
+X = df_regr[['Schooling', 'Income', 'AdultMortality']].round(decimals=2)
+y = df_regr['LifeExpectancy'].round(decimals=2)
+print(np.any(np.isnan(df_regr)))  # sanity check: is any NaN left after filtering? (original author noted this prints False)
+print(np.all(np.isfinite(df_regr)))  # sanity check: is every remaining value finite?
+
+
+# Make correlation plots for all parameters
+"""for col in X:
+    fig = go.Figure(data=go.Scatter(x=X[col], y=y, mode='markers'))
+    fig.update_layout(title=col)
+    fig.show()"""
+
+"""Parameters showing linearity:
+AdultMortality, InfantDeaths, UnderFiveDeaths, Polio, Diphteria, HIVAIDS, GDP, Income, Schooling
+"""
+
+# Regression test: fit LinearRegression on the module-level X and y, then print its parameters and one sample prediction
+def linear_prediction_model():
+    regr = linear_model.LinearRegression()
+    regr.fit(X, y)  # X and y are the module-level feature frame and target series defined above
+    # Report the fitted parameters
+    print('Intercept: \n', regr.intercept_)
+    print('Coefficients: \n', regr.coef_)
+    # NOTE(review): despite its name, test_y holds one row of feature values, in the column order of X (Schooling, Income, AdultMortality)
+    test_y = [[10, 200, 300]]
+    # [0] picks the prediction for the single input row
+    prediction_result  = regr.predict(test_y)[0]
+    print(prediction_result)
+    return 0  # always returns 0; the prediction is only printed, never returned
+linear_prediction_model()  # NOTE(review): called at module level, so it runs whenever this file is executed or imported
diff --git a/tests/sofietest.py b/tests/sofietest.py