diff --git a/ML/udemy/.vscode/settings.json b/ML/udemy/.vscode/settings.json index 0de2132..3758345 100644 --- a/ML/udemy/.vscode/settings.json +++ b/ML/udemy/.vscode/settings.json @@ -1,3 +1,4 @@ { - "python.pythonPath": "/home/chris/Projects/_LAB/training/ML/udemy/.env/bin/python3.7" + "python.pythonPath": "/home/chris/Projects/_LAB/training/ML/udemy/.env/bin/python3.7", + "python.linting.enabled": true } \ No newline at end of file diff --git a/ML/udemy/Data.csv b/ML/udemy/1/Data.csv similarity index 100% rename from ML/udemy/Data.csv rename to ML/udemy/1/Data.csv diff --git a/ML/udemy/1/data_preprocessing_template.py b/ML/udemy/1/data_preprocessing_template.py new file mode 100644 index 0000000..9fe4697 --- /dev/null +++ b/ML/udemy/1/data_preprocessing_template.py @@ -0,0 +1,16 @@ +# Data preprocessing +import pandas as pd +from sklearn.model_selection import train_test_split + +# Importing the dataset +dataset = pd.read_csv('Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 3].values + +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=0) + +# feature scaling (fit the scaler on the training set only, then reuse it on the test set) +# sc_X = StandardScaler() +# X_train = sc_X.fit_transform(X_train) +# X_test = sc_X.transform(X_test) diff --git a/ML/udemy/preprocess_data.R b/ML/udemy/1/preprocess_data.R similarity index 100% rename from ML/udemy/preprocess_data.R rename to ML/udemy/1/preprocess_data.R diff --git a/ML/udemy/preprocess_data.py b/ML/udemy/1/preprocess_data.py similarity index 100% rename from ML/udemy/preprocess_data.py rename to ML/udemy/1/preprocess_data.py diff --git a/ML/udemy/2/Salary_Data.csv b/ML/udemy/2/Salary_Data.csv new file mode 100644 index 0000000..a6863aa --- /dev/null +++ b/ML/udemy/2/Salary_Data.csv @@ -0,0 +1,31 @@ +YearsExperience,Salary +1.1,39343.00 +1.3,46205.00 +1.5,37731.00 +2.0,43525.00 +2.2,39891.00 +2.9,56642.00 +3.0,60150.00 +3.2,54445.00 +3.2,64445.00 +3.7,57189.00 +3.9,63218.00 +4.0,55794.00 +4.0,56957.00 +4.1,57081.00 +4.5,61111.00 
+4.9,67938.00 +5.1,66029.00 +5.3,83088.00 +5.9,81363.00 +6.0,93940.00 +6.8,91738.00 +7.1,98273.00 +7.9,101302.00 +8.2,113812.00 +8.7,109431.00 +9.0,105582.00 +9.5,116969.00 +9.6,112635.00 +10.3,122391.00 +10.5,121872.00 diff --git a/ML/udemy/2/simple_linear_reg.py b/ML/udemy/2/simple_linear_reg.py new file mode 100644 index 0000000..3be5795 --- /dev/null +++ b/ML/udemy/2/simple_linear_reg.py @@ -0,0 +1,33 @@ +# Data preprocessing +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LinearRegression + +# Importing the dataset +dataset = pd.read_csv('Salary_Data.csv') +X = dataset.iloc[:, :-1].values +y = dataset.iloc[:, 1].values + +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=0) + +# Fitting Simple Linear Regression to the Training set +regressor = LinearRegression() +regressor.fit(X_train, y_train) + +# Predicting the Test set results +# y_pred contains the predicted salary from the test sample, y_test is the actual salary. +y_pred = regressor.predict(X_test) + +# Visualize the data +# data used to train the regression +plt.scatter(X_train, y_train, color='red') +# actual test data we compare against our trained regression +plt.scatter(X_test, y_test, color='green') +plt.plot(X_train, regressor.predict(X_train), color='blue') +plt.title('Salary vs experience (training set)') +plt.xlabel('years of experience') +plt.ylabel('salary') +plt.show() diff --git a/ML/udemy/NOTES.md b/ML/udemy/NOTES.md index 44800b7..cc50f94 100644 --- a/ML/udemy/NOTES.md +++ b/ML/udemy/NOTES.md @@ -2,4 +2,14 @@ * you need to split the training set and a test set to balance the machine learning (you train on the test set and test those assumptions on the test set) ? what is categorical data, why whould you use it? 
-* feature scaling: put all values on the same scale so the larger number do not destroy other numbers => standardisation ou normalisation \ No newline at end of file +* feature scaling: put all values on the same scale so the larger numbers do not dominate the other numbers => standardisation or normalisation + +# Linear regression + +Formula: `y = b0 + b1*x1` + +y is the dependent variable: the value that changes in our model, the part that we want to understand from the work, i.e. how this value is changing. + +x1 is the independent variable, the one that has an implied association with y. + +b1 is the coefficient of the independent variable: how much y changes for a unit change in x1 (b0 is the constant, the value of y when x1 is 0). \ No newline at end of file diff --git a/ML/udemy/data_preprocessing_template.py b/ML/udemy/data_preprocessing_template.py index 9fe4697..c856400 100644 --- a/ML/udemy/data_preprocessing_template.py +++ b/ML/udemy/data_preprocessing_template.py @@ -1,4 +1,5 @@ # Data preprocessing +import numpy as np import pandas as pd from sklearn.model_selection import train_test_split