From 0679a1656f0ce4e0687743d56300ac28c2a89568 Mon Sep 17 00:00:00 2001
From: christalib <piks3l@mailoo.org>
Date: Mon, 16 Sep 2019 08:45:51 +0200
Subject: [PATCH] update

---
 ML/udemy/.vscode/settings.json |  3 +++
 ML/udemy/NOTES.md              |  3 +++
 ML/udemy/fill_empty_data.py    | 21 ---------------------
 ML/udemy/preprocess_data.R     | 19 +++++++++++++++++++
 ML/udemy/preprocess_data.py    | 32 ++++++++++++++++++++++++++++++++
 react/7/pics/src/index.js      |  1 +
 6 files changed, 58 insertions(+), 21 deletions(-)
 create mode 100644 ML/udemy/.vscode/settings.json
 create mode 100644 ML/udemy/NOTES.md
 delete mode 100644 ML/udemy/fill_empty_data.py
 create mode 100644 ML/udemy/preprocess_data.R
 create mode 100644 ML/udemy/preprocess_data.py

diff --git a/ML/udemy/.vscode/settings.json b/ML/udemy/.vscode/settings.json
new file mode 100644
index 0000000..0de2132
--- /dev/null
+++ b/ML/udemy/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/home/chris/Projects/_LAB/training/ML/udemy/.env/bin/python3.7"
+}
\ No newline at end of file
diff --git a/ML/udemy/NOTES.md b/ML/udemy/NOTES.md
new file mode 100644
index 0000000..3988695
--- /dev/null
+++ b/ML/udemy/NOTES.md
@@ -0,0 +1,3 @@
+# SECTION 2
+* you need to split the data into a training set and a test set to validate the machine learning model (you train on the training set and test those assumptions on the test set)
+? what is categorical data, why would you use it?
\ No newline at end of file
diff --git a/ML/udemy/fill_empty_data.py b/ML/udemy/fill_empty_data.py
deleted file mode 100644
index 06c6970..0000000
--- a/ML/udemy/fill_empty_data.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Data preprocessing
-import numpy as np
-import matplotlib.pyplot as plt
-import pandas as pd 
-
-# Importing the dataset
-dataset = pd.read_csv('Data.csv')
-# Create the matrix of features (independant variables)
-# [:, = lines -- all of them
-# :-1] = colums -- all of them unless the last one 
-# X = (Country, Age, Salary)
-X = dataset.iloc[:, :-1].values
-# Create vector of linked variables
-# [:, 3] = all values of the 3rd column
-# Y = (Purchased)
-X = dataset.iloc[:, 3].values
-
-# Taking care of the missing data
-from sklearn.model_selection import train_test_split
-
-imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)
diff --git a/ML/udemy/preprocess_data.R b/ML/udemy/preprocess_data.R
new file mode 100644
index 0000000..eeee032
--- /dev/null
+++ b/ML/udemy/preprocess_data.R
@@ -0,0 +1,19 @@
+# Data preprocessing: fill in missing numeric values, then encode categories
+
+# Import the dataset
+dataset = read.csv("Data.csv")
+
+# Missing data: where Age is NA use the mean of the non-NA ages, else keep it
+dataset$Age = ifelse(is.na(dataset$Age),
+ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
+dataset$Age)
+# Same mean replacement for Salary
+dataset$Salary = ifelse(is.na(dataset$Salary),
+ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)),
+dataset$Salary)
+
+# Encoding categorical data: factor() maps each listed level to its label
+# c(...) builds a vector; levels and labels are paired by position
+dataset$Country = factor(dataset$Country, levels=c('France', 'Spain', 'Germany'), labels=c(1, 2, 3))
+# Purchased: 'No' -> 0, 'Yes' -> 1
+dataset$Purchased= factor(dataset$Purchased, levels=c('No', 'Yes'), labels=c(0, 1))
\ No newline at end of file
diff --git a/ML/udemy/preprocess_data.py b/ML/udemy/preprocess_data.py
new file mode 100644
index 0000000..8688c17
--- /dev/null
+++ b/ML/udemy/preprocess_data.py
@@ -0,0 +1,32 @@
+# Data preprocessing
+from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
+from sklearn.model_selection import train_test_split
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+# Importing the dataset
+dataset = pd.read_csv('Data.csv')
+# Create the matrix of features (independant variables)
+# [:, = lines -- all of them
+# :-1] = colums -- all of them unless the last one
+# X = (Country, Age, Salary)
+X = dataset.iloc[:, :-1].values
+# Create vector of linked variables
+# [:, 3] = all values of the 3rd column
+# Y = (Purchased)
+y = dataset.iloc[:, 3].values
+
+# Taking care of the missing data
+
+imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
+imputer = imputer.fit(X[:, 1:3])
+X[:, 1:3] = imputer.transform(X[:, 1:3])
+
+# Encoding categorical data
+labelencoder_X = LabelEncoder()
+X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
+onehotencoder = OneHotEncoder()
+X = onehotencoder.fit_transform(X).toarray()
+labelencoder_y = LabelEncoder()
+y = labelencoder_y.fit_transform(y)
diff --git a/react/7/pics/src/index.js b/react/7/pics/src/index.js
index e4bc450..88a12b3 100644
--- a/react/7/pics/src/index.js
+++ b/react/7/pics/src/index.js
@@ -6,3 +6,4 @@ ReactDOM.render(
     <App />,
     document.querySelector('#root')
 );
+