Neural Network 라이브러리 없이 구현하기

iris 데이터의 sepal.width, sepal.length, petal.width, petal.length를 input으로 사용하여 versicolor와 virginica를 분류하는 3층 신경망을 직접 구현해봤다. W1은 [4x3] 행렬, b1은 [1x3] 행렬, W2는 [3x2] 행렬, b2는 [1x2] 행렬, W3은 [2x1] 행렬, b3은 [1x1] 행렬이다. 따라서 W1은 12개, b1은 3개, W2는 6개, b2는 2개, W3은 2개, b3은 1개로, 총 26개의 파라미터를 델타를 이용하여 학습했다.

● 데이터 전처리

import csv
import numpy as np
import matplotlib.pyplot as plt

# Load the Versicolor and Virginica rows of the iris CSV.
# The first four columns are parsed as float features; the fifth column is
# the class name. Versicolor is labelled 0 and Virginica is labelled 1.
ver = []
vir = []
ver_y = []
vir_y = []
# newline='' is the mode the csv module asks for when it is given a file object.
with open('./iris.csv', 'r', newline='') as f:
    data = csv.reader(f)
    for row in data:
        # Blank or truncated lines have no label column; skip them instead
        # of failing with IndexError on row[4].
        if len(row) < 5:
            continue
        if row[4] == "Versicolor":
            ver.append(list(map(float, row[0:4])))
            ver_y.append(0)
        elif row[4] == "Virginica":
            vir.append(list(map(float, row[0:4])))
            vir_y.append(1)

ver = np.array(ver)
vir = np.array(vir)
# Column vector of labels. reshape(-1, 1) derives the row count from the data
# rather than assuming exactly 100 matching rows.
y = np.concatenate((np.array(ver_y), np.array(vir_y))).reshape(-1, 1)
# Feature matrix: all Versicolor rows first, then all Virginica rows,
# in the same order as the labels in y.
tol = np.concatenate((ver, vir))

사용한 데이터 개수는 versicolor 50개, virginica 50개, 총 100개이다.

● 3층 신경망 모델

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def lossFunction(x, y, eps=1e-12):
    """Return the mean binary cross-entropy between predictions and labels.

    Args:
        x: Predicted probabilities.
        y: Target labels (0 or 1), broadcastable against ``x``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before the
            logarithm is taken.

    Returns:
        The summed cross-entropy divided by ``len(y)``.
    """
    # A prediction that has saturated to exactly 0 or 1 would produce
    # log(0) = -inf, and 0 * -inf = nan; clipping keeps the loss finite.
    p = np.clip(x, eps, 1 - eps)
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) / len(y)

class NeuralNet:
    """Fully connected network with two hidden layers and sigmoid activations.

    Weights are stored column-wise: column ``j`` of a weight matrix holds the
    incoming weights of unit ``j`` in the following layer, so a batch of
    row-vector samples is propagated with ``x @ W + b``.
    """

    def __init__(self, input_size, hidden1_size, hidden2_size, output_size):
        """Draw random initial weights and biases for the three layers."""
        self.W1 = np.random.randn(input_size, hidden1_size)    # e.g. [4x3]
        # b1 is drawn uniformly from [0, 1); every other parameter is
        # drawn from the standard normal distribution.
        self.b1 = np.random.rand(1, hidden1_size)              # e.g. [1x3]

        self.W2 = np.random.randn(hidden1_size, hidden2_size)  # e.g. [3x2]
        self.b2 = np.random.randn(1, hidden2_size)             # e.g. [1x2]

        self.W3 = np.random.randn(hidden2_size, output_size)   # e.g. [2x1]
        self.b3 = np.random.randn(1, output_size)              # e.g. [1x1]

    def forward(self, x):
        """Propagate a batch ``x`` of shape [n x input_size] through the net.

        Pre-activations are stored in ``z1``..``z3`` and activations in
        ``a1``..``a3``; ``a3`` ([n x output_size]) is the network output.
        """
        self.z1 = np.dot(x, self.W1) + self.b1
        self.a1 = sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W3) + self.b3
        self.a3 = sigmoid(self.z3)

    def update(self, x, y, lr):
        """Take one batch gradient-descent step using the delta rule.

        Must be called after ``forward(x)``, because it reads the stored
        activations ``a1``..``a3``.

        Args:
            x: Input batch, shape [n x input_size].
            y: Targets, shape [n x output_size].
            lr: Learning rate.
        """
        n = len(y)

        # Deltas are computed with the weights from before this step.
        # del3 is the derivative of the squared error 0.5 * (a3 - y)^2
        # with respect to z3.
        self.del3 = (self.a3 - y) * self.a3 * (1 - self.a3)
        self.del2 = np.dot(self.del3, self.W3.T) * self.a2 * (1 - self.a2)
        self.del1 = np.dot(self.del2, self.W2.T) * self.a1 * (1 - self.a1)

        # Each weight gradient is activations^T @ delta averaged over the
        # batch, i.e. the per-sample products summed over samples. Written as
        # a matrix product this works for any layer sizes and avoids
        # multiplying an [n x 1] delta by an (n,) activation column, which
        # would broadcast to [n x n] and yield a product of sums instead.
        self.W3 = self.W3 - lr * np.dot(self.a2.T, self.del3) / n
        self.b3 = self.b3 - lr * np.sum(self.del3, axis=0, keepdims=True) / n

        self.W2 = self.W2 - lr * np.dot(self.a1.T, self.del2) / n
        self.b2 = self.b2 - lr * np.sum(self.del2, axis=0, keepdims=True) / n

        self.W1 = self.W1 - lr * np.dot(x.T, self.del1) / n
        self.b1 = self.b1 - lr * np.sum(self.del1, axis=0, keepdims=True) / n

파라미터 행렬들은 열을 기준으로 구분된다. 델타의 위치에 따라 학습 방법에 차이가 있다.

● 3층 신경망 모델 간단 버전

def update(self, x, y, lr):
    """Take one gradient-descent step for the row-major parameter layout.

    Here every sample is a column: the activations ``a1``..``a3`` and the
    deltas ``d1``..``d3`` have one column per sample, and each row of a
    weight matrix belongs to one unit of the following layer. The stored
    activations are read from ``self``; ``self.d_size`` is the divisor used
    to average the weight gradients over the batch.
    """
    batch = self.d_size

    # Deltas are computed with the weights from before the update.
    self.d3 = (self.a3 - y) * self.a3 * (1 - self.a3)
    self.d2 = (self.w3.T @ self.d3) * self.a2 * (1 - self.a2)
    self.d1 = (self.w2.T @ self.d2) * self.a1 * (1 - self.a1)

    # Weight gradient: delta @ activations^T divided by the batch size.
    # Bias gradient: per-unit mean of the delta, kept as a column vector.
    self.w3 = self.w3 - lr * (self.d3 @ self.a2.T) / batch
    self.b3 = self.b3 - lr * self.d3.mean(axis=1, keepdims=True)
    self.w2 = self.w2 - lr * (self.d2 @ self.a1.T) / batch
    self.b2 = self.b2 - lr * self.d2.mean(axis=1, keepdims=True)
    self.w1 = self.w1 - lr * (self.d1 @ x.T) / batch
    self.b1 = self.b1 - lr * self.d1.mean(axis=1, keepdims=True)

앞에서 본 코드는 가중치 각각이 학습되는 과정을 자세히 나타낸 것이고, 이번 코드는 같은 학습 과정을 행렬 연산으로 간단하게 나타낸 것이다. 이번에는 파라미터 행렬들을 행을 기준으로 구분했다. 이 코드에서 d3는 [1x100]이므로 np.expand_dims(np.mean(self.d3, axis=1), axis=1)은 np.sum(self.d3)/self.d_size와 같다. self.d_size는 len(y)로 보면 된다.

● 학습 및 Loss curve 그리기

# Train a 4-3-2-1 network on the full data set and plot the loss curve.
test = NeuralNet(4, 3, 2, 1)
test.forward(tol)

# Evaluate the loss once per iteration and collect the values, so the curve
# is drawn with a single scatter call instead of one call per iteration.
losses = []
for i in range(20000):
    test.update(tol, y, 0.2)
    test.forward(tol)
    loss = lossFunction(test.a3, y)
    print(loss)
    losses.append(loss)

plt.scatter(np.arange(len(losses)), losses, color='blue')
plt.xlabel('iteration')
plt.ylabel('Loss')
plt.show()

● 두 번째 hidden layer activations의 decision boundary 그리기

# Plot the second hidden layer's activations (a21, a22) for every sample
# together with the output unit's decision boundary in that plane.
plt.ylim(-0.2, 1.2)
# Rows of tol are the Versicolor samples followed by the Virginica samples;
# the 50/50 split below assumes 50 samples per class.
plt.scatter(test.a2[:50,0], test.a2[:50,1], color = 'red') #ver 0
plt.scatter(test.a2[50:,0], test.a2[50:,1], color = 'blue') #vir 1
# The output is 0.5 where z3 = W3[0]*a21 + W3[1]*a22 + b3 = 0, which gives
# a22 = -(W3[0]*a21 + b3) / W3[1]; evaluate it at the observed a21 values.
c = test.a2[:,0]
plt.plot(c, -(test.W3[0]*c+test.b3[0])/test.W3[1])
plt.xlabel('a21')
plt.ylabel('a22')
plt.show()

#결과

● 학습이 진행됨에 따라 두 번째 hidden layer activations의 decision boundary가 변하는 모습을 나타낸 figures

학습이 진행되면서 versicolor와 virginica가 분리되는 것을 확인할 수 있다.

여러 번 실행해보면 매번 decision boundary를 기준으로 색깔의 위아래가 바뀌는 것을 알 수 있다. decision boundary 기준으로 위쪽이 versicolor(빨간 점)이고 아래쪽이 virginica(파란 점)일 때는 W3가 음수이고, decision boundary 기준으로 위쪽이 virginica이고 아래쪽이 versicolor일 때는 W3가 양수이다. 이는 파라미터의 초기값에 따라 수렴하는 가중치의 부호가 달라지기 때문에 매번 다르게 나오는 것 같다.

● Loss curve

파라미터 초기값에 따라 손실이 줄어드는 속도와 수렴하는 값이 다르다.

Day Library

Neural Network 라이브러리 없이 구현하기

티스토리툴바