# -*- coding: utf-8 -*-
# Provenance -- mail headers of the patch this module was recovered from:
#   From 238d76968da5c820f73148ace7e05b892f01d77c Mon Sep 17 00:00:00 2001
#   From: sastaachar <42416647+sastaachar@users.noreply.github.com>
#   Date: Fri, 4 Oct 2019 10:10:49 +0530
#   Subject: [PATCH 1/3] Added Code for Linear Regression
#
#   Linear Regression has been performed using the basic algorithm, a basic
#   example is also provided using the boston housing data and the result is
#   compared with sk learn models.
#   ---
#   machine_learning/LinearRegression | 174 ++++++++++++++++++++++++++++++
#   1 file changed, 174 insertions(+)
#   create mode 100644 machine_learning/LinearRegression
"""
Multiple linear regression fitted with batch gradient descent.

The model is ``y = m1*x1 + m2*x2 + ... + mn*xn + c`` where ``n`` is the number
of feature columns, the ``m`` values are the slopes and ``c`` is the intercept.
Running the module as a script fits the model on two Boston-housing features
and prints the result next to scikit-learn's own linear regression.

@author: sasta_achar
"""

import numpy as np
import pandas as pd

# matplotlib and scikit-learn are imported inside ``plot`` and ``main``: only
# the plotting helper and the demo need them, so the regression functions stay
# importable when those packages are not installed.


def mx(x, slopes):
    """Return ``sum(x[i] * slopes[i])`` over the indices of *slopes*.

    Unlike ``dot`` this does not compare lengths: extra trailing items of *x*
    are ignored and a too-short *x* raises ``IndexError``.
    """
    return sum(x[i] * slope for i, slope in enumerate(slopes))


def dot(vector_a, vector_b):
    """Return the dot product of two equally long sequences.

    Raises:
        ValueError: if the lengths differ.  (Earlier versions returned ``-1``,
            which cannot be told apart from a genuine dot product of -1.)
    """
    if len(vector_a) != len(vector_b):
        raise ValueError(
            "dot() needs vectors of equal length, got %d and %d"
            % (len(vector_a), len(vector_b))
        )
    return sum(a * b for a, b in zip(vector_a, vector_b))


def _as_training_arrays(x_train, y_train):
    """Validate a training set and return it as ``(features, targets)`` float arrays."""
    features = np.asarray(x_train, dtype=float)
    targets = np.asarray(y_train, dtype=float).reshape(-1)
    if features.ndim != 2:
        raise ValueError("x_train must be 2-D (one row per sample)")
    if len(targets) != len(features):
        raise ValueError("x_train and y_train must hold the same number of samples")
    if len(targets) == 0:
        raise ValueError("at least one training sample is required")
    return features, targets


def R_sq_value(slopes, intercept, x_train, y_train):
    """Return the mean squared error of the linear model on the given data.

    Despite its (kept for compatibility) name this is *not* the R^2 score: it
    is the average of ``(y - (x . slopes + intercept)) ** 2`` over all rows.

    Raises:
        ValueError: if the data is empty or its shapes are inconsistent.
    """
    features, targets = _as_training_arrays(x_train, y_train)
    residuals = targets - (features @ np.asarray(slopes, dtype=float) + intercept)
    return float(np.mean(residuals ** 2))


def gradient_decent(x_train, y_train, slopes, intercept, LearningRate=0.3):
    """Perform one batch gradient-descent step on the mean squared error.

    Args:
        x_train: 2-D array-like, one row per sample, one column per feature.
        y_train: target value for each row of *x_train*.
        slopes: current slope for each feature column.
        intercept: current intercept.
        LearningRate: step size applied to both gradients.

    Returns:
        ``[updated_slopes, updated_intercept]``.

    Raises:
        ValueError: if the data is empty or its shapes are inconsistent.
    """
    features, targets = _as_training_arrays(x_train, y_train)
    slopes = np.asarray(slopes, dtype=float)
    sample_count = len(targets)

    # residual = y - yp, where yp is the value predicted by the current model
    residuals = targets - (features @ slopes + intercept)

    # Partial derivatives of the mean squared error w.r.t. slopes / intercept.
    slope_gradients = (-2.0 / sample_count) * (features.T @ residuals)
    intercept_gradient = (-2.0 / sample_count) * residuals.sum()

    return [
        slopes - LearningRate * slope_gradients,
        intercept - LearningRate * intercept_gradient,
    ]


# Correctly spelled alias; the original name stays available for existing callers.
gradient_descent = gradient_decent


def LinearRegression(x_train, y_train, LearningRate=0.1, iteration=1000, *, verbose=True):
    """Fit slopes and an intercept to the data with batch gradient descent.

    Slopes and intercept start at zero and are updated *iteration* times.

    Args:
        x_train: 2-D array-like, one row per sample, one column per feature.
        y_train: target value for each row of *x_train*.
        LearningRate: step size handed to every ``gradient_decent`` call.
            (It used to be accepted but never forwarded, so 0.3 was always
            used regardless of this argument.)
        iteration: number of gradient-descent steps.
        verbose: print the loss after every step (the historical behaviour).

    Returns:
        ``[slopes, intercept, loss]`` where ``loss`` is a list of
        ``[step, mean_squared_error]`` pairs, one per step.

    Raises:
        ValueError: if the data is empty or its shapes are inconsistent.
    """
    features, targets = _as_training_arrays(x_train, y_train)
    slopes = np.zeros(features.shape[1])
    intercept = 0.0
    loss = []

    for step in range(iteration):
        slopes, intercept = gradient_decent(
            features, targets, slopes, intercept, LearningRate
        )
        # Compute the loss once and reuse it for both the history and the log.
        error = R_sq_value(slopes, intercept, features, targets)
        loss.append([step, error])
        if verbose:
            print(error)

    return [slopes, intercept, loss]


def predict(x_test, slope, intercept):
    """Return ``x_test . slope + intercept`` for every row of *x_test*."""
    features = np.asarray(x_test, dtype=float)
    return np.asarray(features @ np.asarray(slope, dtype=float) + intercept, dtype=float)


def scale(x, mean=None, std=None):
    """Standardise every column of *x* to zero mean and unit sample deviation.

    Args:
        x: 2-D array-like, one row per sample.
        mean: per-column means to subtract; computed from *x* when omitted.
        std: per-column standard deviations to divide by; computed from *x*
            (sample deviation, ``n - 1`` denominator) when omitted.  Pass the
            training-set statistics to scale a test set consistently.

    Returns:
        A new float array; *x* itself is left untouched.  Columns whose
        deviation is zero (or undefined, e.g. a single row) are only centred.

    Raises:
        ValueError: if *x* is not 2-D.
    """
    data = np.asarray(x, dtype=float)
    if data.ndim != 2:
        raise ValueError("scale() expects a 2-D array (one row per sample)")
    if data.shape[0] == 0:
        return data.copy()

    if mean is None:
        mean = data.mean(axis=0)
    if std is None:
        if data.shape[0] > 1:
            # Standard deviation, i.e. the square ROOT of the sample variance.
            std = data.std(axis=0, ddof=1)
        else:
            std = np.zeros(data.shape[1])

    std = np.asarray(std, dtype=float)
    safe_std = np.where(std > 0, std, 1.0)  # avoid dividing by zero / NaN
    return (data - mean) / safe_std


def plot(loss, max_iterations=300):
    """Plot the loss history produced by ``LinearRegression`` as red dots.

    Args:
        loss: iterable of ``[step, error]`` pairs.
        max_iterations: only steps below this number are drawn (the default of
            300 is where the error had visibly flattened in the demo); pass
            ``None`` to draw the whole history.
    """
    import matplotlib.pyplot as plt

    points = [
        entry for entry in loss
        if max_iterations is None or entry[0] < max_iterations
    ]
    if points:
        steps, errors = zip(*points)
        plt.plot(steps, errors, 'r.')
    plt.show()


def _load_boston_lstat_rm():
    """Return ``(x, y)``: the LSTAT and RM columns and the MEDV target."""
    try:
        from sklearn.datasets import load_boston
    except ImportError:
        # load_boston was removed in scikit-learn 1.2; read the data from its
        # original source as recommended by the scikit-learn removal notice.
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        feature_names = [
            "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
            "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
        ]
    else:
        boston_dataset = load_boston()
        data = boston_dataset.data
        target = boston_dataset.target
        feature_names = boston_dataset.feature_names

    boston = pd.DataFrame(data, columns=feature_names)
    x = boston[['LSTAT', 'RM']].to_numpy(dtype=float)
    y = np.asarray(target, dtype=float)
    return x, y


def main():
    """Fit the model on Boston housing data and compare it with scikit-learn."""
    # Imported under another name so the regression function above is not shadowed.
    from sklearn.linear_model import LinearRegression as SklearnLinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    # import the dataset
    x, y = _load_boston_lstat_rm()

    # split the data
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=0
    )

    # Standard scaling: the test set must be scaled with the TRAINING
    # statistics, otherwise both sets end up in different coordinate systems.
    train_mean = x_train.mean(axis=0)
    train_std = x_train.std(axis=0, ddof=1)
    x_train = scale(x_train, train_mean, train_std)
    x_test = scale(x_test, train_mean, train_std)

    # Performing the regression
    slope, intercept, loss = LinearRegression(x_train, y_train)

    plot(loss)
    # To predict
    y_predict = predict(x_test, slope, intercept)

    print("\nResults using the Sk Learn Model")
    regression_model = SklearnLinearRegression()
    # Fit the data (train the model)
    regression_model.fit(x_train, y_train)
    # Predict
    y_predicted = regression_model.predict(x_test)

    # model evaluation (square root taken so the printed label is accurate)
    rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
    r2 = r2_score(y_test, y_predicted)
    # printing values
    print('Slope:', regression_model.coef_)
    print('Intercept:', regression_model.intercept_)
    print('Root mean squared error: ', rmse)
    print('R2 score: ', r2)

    print("\nResults using our Learn Model")
    print('Slope:', slope)
    print('Intercept:', intercept)
    print('Root mean squared error: ', np.sqrt(mean_squared_error(y_test, y_predict)))
    print('R2 score: ', r2_score(y_test, y_predict))


if __name__ == "__main__":
    main()

# --- end of machine_learning/LinearRegression ---
# Mail headers of the next patch in the series, kept verbatim; its commit
# message and diff follow below:
#   From a26d54a04e49abbe97b183b2db9e1f98e3a2e58d Mon Sep 17 00:00:00 2001
#   From: sastaachar <42416647+sastaachar@users.noreply.github.com>
#   Date: Tue, 8 Oct 2019 22:33:25 +0530
#   Subject: [PATCH 2/3] Added Trie
#   MIME-Version: 1.0
#   Content-Type: text/plain; charset=UTF-8
#   Content-Transfer-Encoding: 8bit
In computer science, a trie, also called digital tree or prefix tree, is a kind of search tree—an ordered tree data structure used to store a dynamic set or associative array where the keys are usually strings. --- data_structure/trie.cpp | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 data_structure/trie.cpp diff --git a/data_structure/trie.cpp b/data_structure/trie.cpp new file mode 100644 index 0000000..42130d0 --- /dev/null +++ b/data_structure/trie.cpp @@ -0,0 +1,68 @@ +#include +#define fr(siz,i) for(int i=0;i +#define pll pair< ll , ll > +#define vi vector +#define vvi vector< vi > +#define vl vector +#define vvl vector< vl > +const int maxn=(int)(2e5+5); +const ll mod=(ll)(1e9+7); +//ios_base::sync_with_stdio(0);cin.tie(0); +using namespace std; + + +struct trie_node{ + char data; + bool isEnd; + map children; +}; + +trie_node * newNode(char data,bool isEnd) { + trie_node* temp = new trie_node; + temp->data = data; + temp->isEnd = isEnd; + return temp; +} + +bool insert(trie_node* head, string data) { + trie_node* currentNode = head; + bool wasThere = true; + for(int i=0;ichildren[data[i] - 'a'] == NULL) { + bool isEnd = ( i == data.length()-1 ); + trie_node* temp = newNode(data[i],isEnd); + currentNode->children[data[i] - 'a'] = temp; + wasThere = false; + } + currentNode = currentNode->children[data[i] - 'a']; + } + return wasThere; +} + +bool find(trie_node* head, string data, bool prefix) { + trie_node* currentNode = head; + bool found = true; + for(int i=0;ichildren[data[i] - 'a'] == NULL) { + return false; + } + currentNode = currentNode->children[data[i] - 'a']; + } + return prefix ? 
true : currentNode->isEnd ; +} + +int main() { + + trie_node * head = newNode('*',true); + cout< Date: Tue, 8 Oct 2019 22:46:04 +0530 Subject: [PATCH 3/3] Delete trie.cpp --- data_structure/trie.cpp | 68 ----------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 data_structure/trie.cpp diff --git a/data_structure/trie.cpp b/data_structure/trie.cpp deleted file mode 100644 index 42130d0..0000000 --- a/data_structure/trie.cpp +++ /dev/null @@ -1,68 +0,0 @@ -#include -#define fr(siz,i) for(int i=0;i -#define pll pair< ll , ll > -#define vi vector -#define vvi vector< vi > -#define vl vector -#define vvl vector< vl > -const int maxn=(int)(2e5+5); -const ll mod=(ll)(1e9+7); -//ios_base::sync_with_stdio(0);cin.tie(0); -using namespace std; - - -struct trie_node{ - char data; - bool isEnd; - map children; -}; - -trie_node * newNode(char data,bool isEnd) { - trie_node* temp = new trie_node; - temp->data = data; - temp->isEnd = isEnd; - return temp; -} - -bool insert(trie_node* head, string data) { - trie_node* currentNode = head; - bool wasThere = true; - for(int i=0;ichildren[data[i] - 'a'] == NULL) { - bool isEnd = ( i == data.length()-1 ); - trie_node* temp = newNode(data[i],isEnd); - currentNode->children[data[i] - 'a'] = temp; - wasThere = false; - } - currentNode = currentNode->children[data[i] - 'a']; - } - return wasThere; -} - -bool find(trie_node* head, string data, bool prefix) { - trie_node* currentNode = head; - bool found = true; - for(int i=0;ichildren[data[i] - 'a'] == NULL) { - return false; - } - currentNode = currentNode->children[data[i] - 'a']; - } - return prefix ? true : currentNode->isEnd ; -} - -int main() { - - trie_node * head = newNode('*',true); - cout<