This post walks through generating a synthetic data set for the XOR problem, preparing the data, and building and training a simple multilayer perceptron (MLP) from scratch with NumPy.
In this section we generate 4,000 randomly sampled points that form the XOR pattern; the two classes are not linearly separable.
import numpy as np
######################
# Create random data
######################
Positive = np.random.normal(100, 15, 1000)    # samples centered around +100
Negative = np.random.normal(-100, 15, 1000)   # samples centered around -100
PN = [(x, y) for x, y in zip(Positive, Negative)]        # quadrant (+, -)
NP = [(x, y) for x, y in zip(Negative, Positive)]        # quadrant (-, +)
PP = [(x, y) for x, y in zip(Positive, -1 * Negative)]   # quadrant (+, +)
NN = [(x, y) for x, y in zip(-1 * Positive, Negative)]   # quadrant (-, -)
N_class = PN + NP   # points whose coordinates have opposite signs
P_class = PP + NN   # points whose coordinates have the same sign
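As a quick sanity check (this snippet is my addition, not part of the original pipeline), each cluster holds 1,000 points, giving 4,000 samples in total:
# Sanity check: four clusters of 1,000 points each
print(len(PN), len(NP), len(PP), len(NN))   # 1000 1000 1000 1000
print(len(N_class), len(P_class))           # 2000 2000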
########################
# Data Visualization
########################
import matplotlib.pyplot as plt
plt.plot([x for x,y in N_class], [y for x,y in N_class], 'go', label='P')
plt.plot([x for x,y in P_class], [y for x,y in P_class], 'ro', label='N')
plt.title('synthetic data-set for XOR problem')
plt.xlabel('x')
plt.ylabel('y')
plt.legend(loc='lower right')
plt.show()
In this step we prepare the data: convert it to NumPy arrays, assign labels, shuffle it, and split it into train and test sets.
Since I am going to use NumPy matrix operations, I first convert all of the data to NumPy arrays.
data = []
size = 4000
data.extend(PN)
data.extend(NP)
data.extend(PP)
data.extend(NN)
x = np.zeros((size, 2))   # one row per sample: [first coordinate, second coordinate]
for idx, s in enumerate(data):
    x[idx, 0] = s[0]
    x[idx, 1] = s[1]
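As a side note, the preallocation and loop above can be replaced by a single call; this is just an equivalent alternative, not what the original code does:
# Equivalent one-liner: convert the list of (x, y) tuples into a (4000, 2) array
x = np.array(data, dtype=float)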
Since this is a supervised learning task, the data needs labels. I have assigned label 1 to the first 2,000 samples (PN and NP) and label 0 to the remaining 2,000 (PP and NN); note that any other consistent assignment would work.
y = np.ones((size, 1))
y[size//2:,] = 0
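To double-check the labelling (a small check I am adding here), count how many samples ended up in each class:
# Each label should cover exactly half of the 4,000 samples
print(np.unique(y, return_counts=True))   # (array([0., 1.]), array([2000, 2000]))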
I have shuffled the data because it was initially ordered: the first 2,000 samples all share one label and the last 2,000 share the other, so without shuffling the network could pick up on this ordering and the test split below would end up containing only one class.
perm = np.random.permutation(size)
x = x[perm]
y = y[perm]
For validation purposes, I split the data into a train set and a test set to get a more reliable accuracy estimate.
train_test_ratio = 0.9
x_train = x[0:int(train_test_ratio*size)]
y_train = y[0:int(train_test_ratio*size)]
x_test = x[int(train_test_ratio*size):]
y_test = y[int(train_test_ratio*size):]
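Because the data was shuffled before splitting, both classes should appear in roughly equal proportion in each split; a quick way to verify this (my addition) is to look at the mean label:
# After shuffling, each split should contain roughly 50% of each class
print(y_train.mean(), y_test.mean())   # both close to 0.5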
For the activations I use sigmoid and tanh. It is common practice to use tanh in the hidden layers and sigmoid at the end of the network, so I used the same convention.
The parameters are updated with the rule w = w - alpha*dw, where w: weights, alpha: learning_rate, and dw: derivative of the cost with respect to w. The same rule applies to the bias variable; bias can be considered as a single weight whose input is permanently 1, so I omitted a separate explanation for it.
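For reference (this note and the check below are my addition), the derivatives that the backpropagation code relies on are sigmoid'(z) = sigmoid(z)*(1 - sigmoid(z)) and tanh'(z) = 1 - tanh(z)**2; combined with the cross-entropy loss, the output-layer gradient simplifies to dz2 = a2 - y. A quick numerical check:
# Numerical vs. analytic derivatives (illustration only, not part of the model code)
z, eps = 0.3, 1e-6
sig = lambda t: 1 / (1 + np.exp(-t))
print((sig(z + eps) - sig(z - eps)) / (2 * eps), sig(z) * (1 - sig(z)))        # sigmoid'
print((np.tanh(z + eps) - np.tanh(z - eps)) / (2 * eps), 1 - np.tanh(z) ** 2)  # tanh'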
Since the output comes from sigmoid, we need a threshold to convert its (0, 1) output to {0, 1}, and 0.5 will do this for us.
class MLP:
def __init__(self, num_features, hidden_size = 2, learning_rate = 0.001):
"""
Builds an MLP
:param num_features: Number of input features
:param learning_rate: eta or learning rate as a coefficient for weight updating
:param hidden_size: Number of neurons in hidden layer
:return: An MLP object
"""
self.w1 = np.random.randn(hidden_size, num_features) * 0.01 # weights of input to hidden layer
self.b1 = np.ones((hidden_size, 1)) # bias of hidden layer
self.w2 = np.random.randn(1, hidden_size) * 0.01 # weights of hidden layer to output
self.b2 = np.ones((1, 1)) # bias of output layer
self.hidden_size = hidden_size
self.learning_rate = learning_rate
self.num_features = num_features
def _sigmoid(self, x):
"""
Applies sigmoid function on data
:param x: numpy ndarray
:return: numpy ndarray
"""
return 1 / (1 + np.exp(-x))
def _tanh(self, x):
"""
Applies Tan Hyperbolic function on data
:param x: numpy ndarray
:return: numpy ndarray
"""
return np.tanh(x)
def forward(self, x):
"""
Applies feed forward logic regarding predefined activation function
:param x: numpy ndarray of input data
:return: a tuple of (input of activation function of hidden layer,
output of activation function of hidden layer,
input of activation function of output layer,
output of activation function of output layer)
"""
z1 = np.dot(self.w1, x) + self.b1
a1 = self._tanh(z1)
z2 = np.dot(self.w2, a1) + self.b2
a2 = self._sigmoid(z2)
return z1, a1, z2, a2
def backpropagation(self, x, y, z1, a1, z2, a2):
"""
Applies backpropagation algorithm on network
:param x: numpy ndarray of input data
:param y: numpy ndarray of input labels
:param z1: input of activation function of hidden layer
:param a1: output of activation function of hidden layer
:param z2: input of activation function of output layer
:param a2: output of activation function of output layer
:return: A tuple of (delta_w1, delta_b1, delta_w2, delta_b2)
"""
m = x.shape[1]
dz2 = a2 - y
dw2 = np.dot(dz2, a1.T) / m
db2 = np.sum(dz2, axis=1, keepdims=True) / m
dz1 = np.multiply(np.dot(self.w2.T, dz2), (1-np.power(a1, 2)))
dw1 = np.dot(dz1, x.T) / m
db1 = np.sum(dz1, axis=1, keepdims=True) / m
return dw1, db1, dw2, db2
def update(self, dw1, db1, dw2, db2):
"""
Updates parameters using the obtained gradients.
:param dw1: Amount of change in w1
:param db1: Amount of change in b1
:param dw2: Amount of change in w2
:param db2: Amount of change in b2
:return: None
"""
self.w1 = self.w1 - self.learning_rate*dw1
self.b1 = self.b1 - self.learning_rate*db1
self.w2 = self.w2 - self.learning_rate*dw2
self.b2 = self.b2 - self.learning_rate*db2
def compute_cost(self, a2, y):
"""
Calculates cross-entropy loss
:param a2: output of final layer
:param y: true labels of input data
:return: a float value (lower is better)
"""
logprobs = np.multiply(y, np.log(a2))+ np.multiply((1-y), (np.log(1-a2)))
cost = -np.sum(logprobs)/y.shape[1]
cost = np.squeeze(cost)
return cost
def train(self, x, y, epochs, print_cost=True):
"""
Applies training procedure
:param x: numpy ndarray of input features
:param y: numpy ndarray of input labels
:param epochs: Number of epochs to be trained
:param print_cost: Whether to print the cost value every 1000 epochs (for debugging)
:return: None
"""
for e in range(epochs):
z1, a1, z2, a2 = self.forward(x)
cost = self.compute_cost(a2, y)
dw1, db1, dw2, db2 = self.backpropagation(x, y, z1, a1, z2, a2)
self.update(dw1, db1, dw2, db2)
if print_cost and e % 1000 == 0:
print ("Cost after iteration %i: %f" %(e, cost))
def test(self, x):
"""
Tests network using trained weights obtained from `train` method.
Note: no thresholding is applied here; to map the outputs to {0, 1} labels, threshold the sigmoid output at 0.5
:param x: numpy ndarray of input features
:return: numpy ndarray of predictions
"""
_, _, _, preds = self.forward(x)
return preds
def score(self, x, y):
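        # Accuracy in percent: threshold the sigmoid outputs at 0.5 and compare with the true labels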
return np.count_nonzero((self.test(x)>=0.5)*1 == y) *100/ x.shape[1]
x_train_mlp = x_train.transpose()   # shape (2, m): the MLP expects features in rows, samples in columns
y_train_mlp = y_train.transpose()   # shape (1, m)
mlp = MLP(num_features=2, hidden_size=2)
mlp.train(x_train_mlp, y_train_mlp, epochs=15000)
pred = mlp.test(x_test.T)
mlp.score(x_test.T, y_test.T)
Cost after iteration 0: 0.811356
Cost after iteration 1000: 0.661713
Cost after iteration 2000: 0.561364
Cost after iteration 3000: 0.493442
Cost after iteration 4000: 0.444247
Cost after iteration 5000: 0.406091
Cost after iteration 6000: 0.374814
Cost after iteration 7000: 0.348163
Cost after iteration 8000: 0.324874
Cost after iteration 9000: 0.304196
Cost after iteration 10000: 0.285649
Cost after iteration 11000: 0.268904
Cost after iteration 12000: 0.253713
Cost after iteration 13000: 0.239883
Cost after iteration 14000: 0.227255
100.0
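Since test returns the raw sigmoid outputs, mapping them to hard {0, 1} labels only requires the 0.5 threshold mentioned earlier; for example, reusing pred and y_test from above:
# Threshold the raw predictions and compute the accuracy by hand
hard_preds = (pred >= 0.5).astype(int)         # pred has shape (1, number of test samples)
print(np.mean(hard_preds == y_test.T) * 100)   # same value as mlp.score(x_test.T, y_test.T)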