ML: simple linear regression

2019-10-18 19:26:43 +02:00 · 2019-10-18 19:26:43 +02:00 · 897831b91b
parent 95c3cb4e89
commit 897831b91b
9 changed files with 94 additions and 2 deletions
--- a/ML/udemy/.vscode/settings.json
+++ b/ML/udemy/.vscode/settings.json
@ -1,3 +1,4 @@
 {
-    "python.pythonPath": "/home/chris/Projects/_LAB/training/ML/udemy/.env/bin/python3.7"
+    "python.pythonPath": "/home/chris/Projects/_LAB/training/ML/udemy/.env/bin/python3.7",
+    "python.linting.enabled": true
 }
--- a/ML/udemy/1/Data.csv
+++ b/ML/udemy/1/Data.csv
--- a/ML/udemy/1/data_preprocessing_template.py
+++ b/ML/udemy/1/data_preprocessing_template.py
@ -0,0 +1,16 @@
+# Data preprocessing
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+# Importing the dataset
+dataset = pd.read_csv('Data.csv')
+X = dataset.iloc[:, :-1].values
+y = dataset.iloc[:, 3].values
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=0)
+
+# feature scaling
+# sc_X = StandardScaler()
+# X_train = sc_X.fit_transform(X_train)
+# X_test = sc_X.transform(X_test)  # reuse the scaler fitted on X_train; do not refit on test data
--- a/ML/udemy/1/preprocess_data.R
+++ b/ML/udemy/1/preprocess_data.R
--- a/ML/udemy/1/preprocess_data.py
+++ b/ML/udemy/1/preprocess_data.py
--- a/ML/udemy/2/Salary_Data.csv
+++ b/ML/udemy/2/Salary_Data.csv
@ -0,0 +1,31 @@
+YearsExperience,Salary
+1.1,39343.00
+1.3,46205.00
+1.5,37731.00
+2.0,43525.00
+2.2,39891.00
+2.9,56642.00
+3.0,60150.00
+3.2,54445.00
+3.2,64445.00
+3.7,57189.00
+3.9,63218.00
+4.0,55794.00
+4.0,56957.00
+4.1,57081.00
+4.5,61111.00
+4.9,67938.00
+5.1,66029.00
+5.3,83088.00
+5.9,81363.00
+6.0,93940.00
+6.8,91738.00
+7.1,98273.00
+7.9,101302.00
+8.2,113812.00
+8.7,109431.00
+9.0,105582.00
+9.5,116969.00
+9.6,112635.00
+10.3,122391.00
+10.5,121872.00
--- a/ML/udemy/2/simple_linear_reg.py
+++ b/ML/udemy/2/simple_linear_reg.py
@ -0,0 +1,33 @@
+# Data preprocessing
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression
+
+# Importing the dataset
+dataset = pd.read_csv('Salary_Data.csv')
+X = dataset.iloc[:, :-1].values
+y = dataset.iloc[:, 1].values
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=0)
+
+# Fitting Simple Linear Regression to the Training set
+regressor = LinearRegression()
+regressor.fit(X_train, y_train)
+
+# Predicting the Test set results
+# y_pred contains the predicted salary from the test sample, y_test is the actual salary.
+y_pred = regressor.predict(X_test)
+
+# Visualize the data
+# data used to train the regression
+plt.scatter(X_train, y_train, color='red')
+# actual test data, to compare against the trained regression line
+plt.scatter(X_test, y_test, color='green')
+plt.plot(X_train, regressor.predict(X_train), color='blue')
+plt.title('Salary vs experience (training set)')
+plt.xlabel('years of experience')
+plt.ylabel('salary')
+plt.show()
--- a/ML/udemy/NOTES.md
+++ b/ML/udemy/NOTES.md
@ -2,4 +2,14 @@
 * you need to split the data into a training set and a test set to evaluate the model fairly (you train on the training set and test those assumptions on the test set)
 ? what is categorical data, why would you use it?

-* feature scaling: put all values on the same scale so the larger number do not destroy other numbers => standardisation ou normalisation  
+* feature scaling: put all values on the same scale so the larger numbers do not dominate the others => standardisation or normalisation  
+
+# Linear regression
+
+Formula: `y = b0 + b1*x1`
+
+y is the dependent variable: the value we want to explain or predict, i.e. the quantity whose changes the model tries to describe.
+
+x1 is the independent variable, the one that has an implied association with y.
+
+b1 is the coefficient for the independent variable: how much y changes for a one-unit change in x1. b0 is the constant (intercept): the value of y when x1 is 0.
--- a/ML/udemy/data_preprocessing_template.py
+++ b/ML/udemy/data_preprocessing_template.py
@ -1,4 +1,5 @@
 # Data preprocessing
+import numpy as np
 import pandas as pd
 from sklearn.model_selection import train_test_split