Implementing VGG16 in NNabla
Sony's in-house DNN framework has been open-sourced under the name NNabla, so I gave it a try. As an exercise, I implemented VGG16, one of the standard models for image classification.
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S
import numpy as np
from collections import OrderedDict
import cv2


class VGG16(object):

    def __init__(self, input_tensor, weights='imagenet'):
        self.batch_size = input_tensor.shape[0]
        self.layers = OrderedDict()

        # Input
        self.x = self.layers['input'] = input_tensor

        # Convolution layers
        h = self.layers['block1_conv1'] = PF.convolution(self.x, 64, (3, 3), pad=(1, 1), stride=(1, 1), name='block1_conv1')
        h = F.relu(h)
        h = self.layers['block1_conv2'] = PF.convolution(h, 64, (3, 3), pad=(1, 1), stride=(1, 1), name='block1_conv2')
        h = F.relu(h)
        h = self.layers['block1_pool1'] = F.max_pooling(h, (2, 2), stride=(2, 2))

        h = self.layers['block2_conv1'] = PF.convolution(h, 128, (3, 3), pad=(1, 1), stride=(1, 1), name='block2_conv1')
        h = F.relu(h)
        h = self.layers['block2_conv2'] = PF.convolution(h, 128, (3, 3), pad=(1, 1), stride=(1, 1), name='block2_conv2')
        h = F.relu(h)
        h = self.layers['block2_pool1'] = F.max_pooling(h, (2, 2), stride=(2, 2))

        h = self.layers['block3_conv1'] = PF.convolution(h, 256, (3, 3), pad=(1, 1), stride=(1, 1), name='block3_conv1')
        h = F.relu(h)
        h = self.layers['block3_conv2'] = PF.convolution(h, 256, (3, 3), pad=(1, 1), stride=(1, 1), name='block3_conv2')
        h = F.relu(h)
        h = self.layers['block3_conv3'] = PF.convolution(h, 256, (3, 3), pad=(1, 1), stride=(1, 1), name='block3_conv3')
        h = F.relu(h)
        h = self.layers['block3_pool1'] = F.max_pooling(h, (2, 2), stride=(2, 2))

        h = self.layers['block4_conv1'] = PF.convolution(h, 512, (3, 3), pad=(1, 1), stride=(1, 1), name='block4_conv1')
        h = F.relu(h)
        h = self.layers['block4_conv2'] = PF.convolution(h, 512, (3, 3), pad=(1, 1), stride=(1, 1), name='block4_conv2')
        h = F.relu(h)
        h = self.layers['block4_conv3'] = PF.convolution(h, 512, (3, 3), pad=(1, 1), stride=(1, 1), name='block4_conv3')
        h = F.relu(h)
        h = self.layers['block4_pool1'] = F.max_pooling(h, (2, 2), stride=(2, 2))

        h = self.layers['block5_conv1'] = PF.convolution(h, 512, (3, 3), pad=(1, 1), stride=(1, 1), name='block5_conv1')
        h = F.relu(h)
        h = self.layers['block5_conv2'] = PF.convolution(h, 512, (3, 3), pad=(1, 1), stride=(1, 1), name='block5_conv2')
        h = F.relu(h)
        h = self.layers['block5_conv3'] = PF.convolution(h, 512, (3, 3), pad=(1, 1), stride=(1, 1), name='block5_conv3')
        h = F.relu(h)
        h = self.layers['block5_pool1'] = F.max_pooling(h, (2, 2), stride=(2, 2))

        # Flatten
        h = F.transpose(h, (0, 2, 3, 1))  # (batch, channels, h, w) -> (batch, h, w, channels)
        h = self.layers['flatten'] = F.reshape(h, (self.batch_size, np.prod(h.shape[1:])))

        # Fully-Connected layers
        h = self.layers['fc1'] = PF.affine(h, 4096, name='fc1')
        h = F.relu(h)
        h = self.layers['fc2'] = PF.affine(h, 4096, name='fc2')
        h = F.relu(h)
        h = self.layers['fc3'] = PF.affine(h, 1000, name='fc3')

        # Prediction
        output = self.layers['prob'] = F.softmax(h)
        self.output = output

    def summary(self):
        layer_len = len(self.layers.keys())
        maxlen = max(map(len, self.layers.keys()))
        print("{} layers.".format(layer_len))
        for k in self.layers:
            print("{} layer: {}".format(k.ljust(maxlen), str(self.layers[k].shape)))


x = nn.Variable([1, 3, 224, 224])
vgg16 = VGG16(x)
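The summary() method defined above prints the output shape of every registered layer, which is a quick sanity check that the graph really has the VGG16 structure:

# Print the number of layers and each layer's output shape,
# e.g. "block1_conv1 layer: (1, 64, 224, 224)" ... "prob layer: (1, 1000)"
vgg16.summary()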
To train from here, you would prepare training data, define a loss and a Solver, and then run forward and backward passes. Unfortunately, I don't have the all-important ImageNet data, so this time I decided to port the pre-trained weights used by Keras instead.
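Just for reference, a single training step in NNabla would look roughly like the sketch below. The label variable t, the dummy batch, and the learning rate are placeholders of my own, and the cross-entropy is taken on the 'fc3' logits rather than on the softmax output:

# Minimal sketch of one training step (assumes the model built above)
t = nn.Variable([1, 1])  # class-index label for each sample in the batch
loss = F.mean(F.softmax_cross_entropy(vgg16.layers['fc3'], t))

solver = S.Sgd(lr=0.01)  # any nnabla solver works here
solver.set_parameters(nn.get_parameters())

x.d = np.random.randn(1, 3, 224, 224)  # replace with a real image batch
t.d = np.array([[0]])                  # replace with real labels
loss.forward()
solver.zero_grad()
loss.backward()
solver.update()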
First, build a pre-trained VGG16 on the Keras side as shown below and dump the trained weights with pickle.
import pickle

from keras.applications.vgg16 import VGG16

model = VGG16(weights="imagenet")
weights = model.get_weights()

mat = []
for l in weights:
    if len(l.shape) == 4:
        l = l.transpose()
    mat.append(l)

with open('keras_nn.pickle', 'wb') as f:
    pickle.dump(mat, f)
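For reference, the reason for these transposes: Keras (TensorFlow backend) stores a convolution kernel as (kh, kw, in_channels, out_channels), while NNabla expects (out_channels, in_channels, kh, kw). The full transpose() above reverses all axes, and the remaining swap of the last two axes is done on the NNabla side when loading. A tiny illustrative check, not part of the actual script, using block1_conv1's shape:

import numpy as np

k = np.zeros((3, 3, 3, 64))    # Keras kernel layout: (kh, kw, in_c, out_c)
k = k.transpose()              # -> (out_c, in_c, kw, kh), i.e. all axes reversed
k = k.transpose((0, 1, 3, 2))  # -> (out_c, in_c, kh, kw), NNabla's layout
print(k.shape)                 # (64, 3, 3, 3)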
The pickled weights are handed to the NNabla VGG16 with the code below. (There is probably a nicer way to do this part.)
import pickle

params = nn.get_parameters()

with open('keras_nn.pickle', 'rb') as f:
    mats = pickle.load(f)

idx = 0
suffix = ['/conv/W', '/conv/b', '/affine/W', '/affine/b']
for k in vgg16.layers:
    for suf in suffix:
        w_name = k + suf
        if w_name not in params:
            continue
        if suf == '/conv/W':
            # (out_c, in_c, kw, kh) -> (out_c, in_c, kh, kw)
            mats[idx] = mats[idx].transpose((0, 1, 3, 2))
        params[w_name].d = np.float32(mats[idx])
        idx += 1
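Once the parameters are filled in, it seems easier to save them in NNabla's own format so the Keras conversion only has to run once; a sketch, with an arbitrary file name:

# Save the converted parameters once...
nn.save_parameters('vgg16_from_keras.h5')

# ...and in later sessions just rebuild the graph and load them back.
# nn.load_parameters('vgg16_from_keras.h5')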
At this point we have built VGG16 in NNabla and loaded the pre-trained weights. Next, let's read an actual image and classify it. The Keras model takes its color channels in BGR order, so it is convenient to load the image with OpenCV's imread, which returns BGR.
def prepare_img(img):
    img = img.astype(np.float32)
    img = cv2.resize(img, (224, 224))
    # Subtract the ImageNet mean (BGR order, matching Keras' preprocessing)
    img[:, :, 0] -= 103.939
    img[:, :, 1] -= 116.779
    img[:, :, 2] -= 123.68
    img = np.expand_dims(img, axis=0)
    img = img.transpose((0, 3, 1, 2))  # (batch, h, w, channels) -> (batch, channels, h, w)
    return img


img = cv2.imread('input.jpg', 1)
preproc_img = prepare_img(img)

x.d = preproc_img
vgg16.output.forward()

label = np.argsort(vgg16.output.d[0])[::-1][:5]
for i in label:
    print(i, vgg16.output.d[0][i])
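The printed labels are raw ImageNet class indices. If Keras is available, they can be mapped to human-readable names with decode_predictions, which expects the full (batch, 1000) probability array; a small sketch:

from keras.applications.vgg16 import decode_predictions

# Map the softmax output to (wordnet_id, class_name, probability) tuples
for class_id, class_name, prob in decode_predictions(vgg16.output.d, top=5)[0]:
    print(class_name, prob)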
The output was as follows.
792 0.985962 //shovel
587 0.00602465 //hammer
813 0.00355319 //spatula
596 0.00200536 //hatchet
462 0.000546574 //broom

847 0.982786 //tank
471 0.0112128 //cannon
744 0.00218588 //projectile, missile
408 0.00189686 //amphibian
657 0.00173452 //missile
It seems to be working reasonably well, at least for now.