# Source listing metadata: Python, 32 lines, 1 KiB
# Data preprocessing
#
# Reads Data.csv, fills missing values in the numeric columns with the
# column mean, and encodes the categorical columns as numbers.
#
# NOTE: this script targets scikit-learn < 0.22. `Imputer` and the
# `categorical_features` argument of `OneHotEncoder` were deprecated in 0.20
# and removed in 0.22; newer releases provide `sklearn.impute.SimpleImputer`
# and `sklearn.compose.ColumnTransformer` for the same purpose.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder

# Importing the dataset
dataset = pd.read_csv('Data.csv')

# Create the matrix of features (independent variables).
# [:, :-1] selects every row and every column except the last one.
# X = (Country, Age, Salary)
X = dataset.iloc[:, :-1].values

# Create the vector of the dependent variable.
# [:, 3] selects every row of the column at index 3 (the fourth column).
# y = (Purchased)
y = dataset.iloc[:, 3].values

# Taking care of the missing data: every NaN in columns 1 and 2
# (Age, Salary) is replaced by the mean of its column (axis=0).
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical data
# Step 1: map the Country strings in column 0 to integer labels.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Step 2: one-hot encode column 0 only. Without categorical_features=[0]
# the encoder is applied to every column, so the numeric Age and Salary
# values would also be expanded into indicator columns.
onehotencoder = OneHotEncoder(categorical_features=[0])
X = onehotencoder.fit_transform(X).toarray()

# The target only needs integer labels, so a LabelEncoder is enough.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)