# paddockpass/ML/udemy/preprocess_data.py

# Data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
# The path is relative, so the script has to be run from the directory that
# contains Data.csv.
dataset = pd.read_csv('Data.csv')

# Matrix of features (independent variables).
#   [:,   -> every row
#   :-1]  -> every column except the last one
# The column comments below assume the layout (Country, Age, Salary,
# Purchased) -- confirm against Data.csv.
X = dataset.iloc[:, :-1].values

# Vector of the dependent variable: every row of the last column (Purchased).
# Indexing with -1 instead of a hard-coded position keeps y in step with the
# `:-1` slice used for X.
y = dataset.iloc[:, -1].values

# Taking care of the missing data
# Replace NaN entries in columns 1 and 2 with the mean of their column
# (axis=0 -> statistics are computed column-wise).
# NOTE: Imputer (and OneHotEncoder's categorical_features used below) belong
# to the scikit-learn < 0.22 API that this file imports; newer releases use
# sklearn.impute.SimpleImputer and sklearn.compose.ColumnTransformer instead.
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
# Column 0 is first mapped from labels to integer codes ...
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# ... and then expanded into one dummy column per category. Only column 0 is
# selected: with the default settings the encoder would treat every column as
# categorical and one-hot encode the numeric columns as well.
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()
# The target only needs integer codes, not dummy columns.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)


# Splitting into a training and a test set
# test_size is the fraction of the samples held out for the test set;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature scaling
# The scaler is fitted on the training set only. The test set is transformed
# with the statistics learned from the training data, so both sets share the
# same scale and nothing from the test set leaks into preprocessing.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)