update
commit 0679a1656f
parent 654a5ed2c0
ML/udemy/.vscode/settings.json (new file, vendored, +3)
@@ -0,0 +1,3 @@
{
    "python.pythonPath": "/home/chris/Projects/_LAB/training/ML/udemy/.env/bin/python3.7"
}
ML/udemy/NOTES.md (new file, +3)
@@ -0,0 +1,3 @@
# SECTION 2

* you need to split the dataset into a training set and a test set so the model is evaluated on data it was not trained on (you train on the training set and test those assumptions on the test set)

? what is categorical data, why would you use it?
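A minimal sketch of the split described in that note, using scikit-learn's train_test_split; the toy arrays, 80/20 ratio, and random_state below are illustrative assumptions, not from the notes:

import numpy as np
from sklearn.model_selection import train_test_split

# Toy stand-ins for the feature matrix and target vector
X = np.arange(20).reshape(10, 2)
y = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1])

# Hold out 20% of the rows as the test set (ratio and seed are assumed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit only on (X_train, y_train); evaluate only on (X_test, y_test)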
@@ -1,21 +0,0 @@
# Data preprocessing
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
# Create the matrix of features (independent variables)
# [:, = rows -- all of them
# :-1] = columns -- all of them except the last one
# X = (Country, Age, Salary)
X = dataset.iloc[:, :-1].values
# Create the dependent variable vector
# [:, 3] = all values of column index 3
# Y = (Purchased)
X = dataset.iloc[:, 3].values

# Taking care of the missing data
from sklearn.model_selection import train_test_split

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
ML/udemy/preprocess_data.R (new file, +19)
@@ -0,0 +1,19 @@
# Data preprocessing

# Import the dataset
dataset = read.csv("Data.csv")

# Taking care of the missing data
dataset$Age = ifelse(is.na(dataset$Age),
                     ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
                     dataset$Age)

dataset$Salary = ifelse(is.na(dataset$Salary),
                        ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)),
                        dataset$Salary)

# Encoding categorical data
# ! c() builds a vector
dataset$Country = factor(dataset$Country, levels = c('France', 'Spain', 'Germany'), labels = c(1, 2, 3))

dataset$Purchased = factor(dataset$Purchased, levels = c('No', 'Yes'), labels = c(0, 1))
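For comparison, a rough pandas sketch of the same imputation and encoding steps, assuming the same Data.csv with Country, Age, Salary, and Purchased columns:

import pandas as pd

dataset = pd.read_csv('Data.csv')

# Fill missing Age/Salary with the column mean (mirrors the ifelse/ave calls)
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
dataset['Salary'] = dataset['Salary'].fillna(dataset['Salary'].mean())

# Map categories to numeric codes (mirrors factor levels/labels)
dataset['Country'] = dataset['Country'].map({'France': 1, 'Spain': 2, 'Germany': 3})
dataset['Purchased'] = dataset['Purchased'].map({'No': 0, 'Yes': 1})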
ML/udemy/preprocess_data.py (new file, +32)
@@ -0,0 +1,32 @@
# Data preprocessing
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
# Create the matrix of features (independent variables)
# [:, = rows -- all of them
# :-1] = columns -- all of them except the last one
# X = (Country, Age, Salary)
X = dataset.iloc[:, :-1].values
# Create the dependent variable vector
# [:, 3] = all values of column index 3
# y = (Purchased)
y = dataset.iloc[:, 3].values

# Taking care of the missing data

imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# One-hot encode only the Country column (index 0)
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
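Worth noting: Imputer and OneHotEncoder's categorical_features argument were removed in later scikit-learn releases. A minimal sketch of the same steps with the current API, assuming scikit-learn 0.22 or newer and the same Data.csv layout:

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Mean-impute the numeric columns (Age, Salary)
X[:, 1:3] = SimpleImputer(strategy='mean').fit_transform(X[:, 1:3])

# One-hot encode column 0 (Country); pass the other columns through unchanged
ct = ColumnTransformer([('country', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)

# Encode the Yes/No target as 0/1
y = LabelEncoder().fit_transform(y)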
@@ -6,3 +6,4 @@ ReactDOM.render(
  <App />,
  document.querySelector('#root')
);
