Introduction
Implement an RNN and apply it to image captioning
Implement an LSTM and apply it to image captioning
Neural network visualization
Style transfer applied to artworks
Implement a Generative Adversarial Network and apply it to MNIST
Image Captioning with RNNs
Dataset
The Microsoft COCO dataset (2014 version), with 80,000 training images and 40,000 validation images in total; each image comes with 5 captions.
Vanilla RNN
step forward
The vanilla RNN step is computed as:
\(h_t=\tanh(W_h\cdot h_{t-1}+W_x\cdot x_t+b)\)
where \(\tanh(x)=\dfrac{e^{2x}-1}{e^{2x}+1}\)
Here I use numpy's built-in tanh function, which is more numerically stable; I tried writing my own tanh, but for some reason it led to exploding gradients.
```python
def rnn_step_forward(x, prev_h, Wx, Wh, b):
    next_h, cache = None, None
    next_h = np.tanh(prev_h.dot(Wh) + x.dot(Wx) + b)
    cache = x, prev_h, Wx, Wh, b, next_h
    return next_h, cache
```
step backward
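The backward pass relies on \(\dfrac{d}{dx}\tanh(x) = 1 - \tanh^2(x)\), so the gradient flowing into the pre-activation is dnext_h * (1 - next_h ** 2); the remaining terms are ordinary matrix-multiplication backprop.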
```python
def rnn_step_backward(dnext_h, cache):
    dx, dprev_h, dWx, dWh, db = None, None, None, None, None
    x, prev_h, Wx, Wh, b, next_h = cache
    dtanh = dnext_h * (1 - next_h ** 2)
    dx = dtanh.dot(Wx.T)
    dprev_h = dtanh.dot(Wh.T)
    dWx = x.T.dot(dtanh)
    dWh = prev_h.T.dot(dtanh)
    db = np.sum(dtanh, axis=0)
    return dx, dprev_h, dWx, dWh, db
```
forward
Each example has T timesteps, so a loop builds the RNN layer for each timestep in turn.
```python
def rnn_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    N, T, D = x.shape
    H = h0.shape[1]
    h, cache = np.empty((N, T, H)), [None] * T
    h[:, 0, :], cache[0] = rnn_step_forward(x[:, 0, :], h0, Wx, Wh, b)
    for i in range(1, T):
        h[:, i, :], cache[i] = rnn_step_forward(x[:, i, :], h[:, i-1, :], Wx, Wh, b)
    return h, cache
```
backward
```python
def rnn_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    N, T, H = dh.shape
    D = cache[0][0].shape[1]
    dx, dprev_h, dWx, dWh, db = (np.empty((N, T, D)), np.zeros((N, H)),
                                 np.zeros((D, H)), np.zeros((H, H)), np.zeros((H,)))
    for i in range(T-1, -1, -1):
        dx[:, i, :], dprev_h, dWx_temp, dWh_temp, db_temp = rnn_step_backward(dh[:, i, :] + dprev_h, cache[i])
        dWx, dWh, db = dWx + dWx_temp, dWh + dWh_temp, db + db_temp
    dh0 = dprev_h
    return dx, dh0, dWx, dWh, db
```
Word Embedding
forward
```python
def word_embedding_forward(x, W):
    out, cache = None, None
    out = W[x, :]
    cache = x, W
    return out, cache
```
backward
```python
def word_embedding_backward(dout, cache):
    dW = None
    x, W = cache
    dW = np.zeros_like(W)
    np.add.at(dW, x, dout)
    return dW
```
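A quick note on np.add.at: the backward pass must accumulate gradients for words that appear more than once in a caption, which plain fancy-index assignment silently drops. A toy example with made-up numbers:

```python
import numpy as np

dW = np.zeros(3)
np.add.at(dW, [0, 0, 2], [1.0, 4.0, 5.0])   # unbuffered add, so repeated index 0 accumulates
print(dW)                                    # [5. 0. 5.]
# dW[[0, 0, 2]] += [1.0, 4.0, 5.0] would give [4. 0. 5.] instead (last write wins)
```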
RNN for image captioning
```python
def loss(self, features, captions):
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]
    mask = (captions_out != self._null)
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    loss, grads = 0.0, {}
    feature_out, feature_cache = affine_forward(features, W_proj, b_proj)
    word_embed_out, word_embed_cache = word_embedding_forward(captions_in, W_embed)
    rnn_out, rnn_cache = rnn_forward(word_embed_out, feature_out, Wx, Wh, b)
    score, score_cache = temporal_affine_forward(rnn_out, W_vocab, b_vocab)
    loss, dout = temporal_softmax_loss(score, captions_out, mask)
    dscore, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dout, score_cache)
    drnn, dh0_rnn, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dscore, rnn_cache)
    grads['W_embed'] = word_embedding_backward(drnn, word_embed_cache)
    dx_feature, grads['W_proj'], grads['b_proj'] = affine_backward(dh0_rnn, feature_cache)
    return loss, grads
```
Overfit small data
Test-time sampling
```python
def sample(self, features, max_length=30):
    N = features.shape[0]
    captions = self._null * np.ones((N, max_length), dtype=np.int32)
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    hidden, _ = affine_forward(features, W_proj, b_proj)
    cur_word = self._start
    for n in range(max_length):
        word_embed, _ = word_embedding_forward(cur_word, W_embed)
        hidden, _ = rnn_step_forward(word_embed, hidden, Wx, Wh, b)
        score, _ = affine_forward(hidden, W_vocab, b_vocab)
        cur_word = np.argmax(score, axis=1)
        captions[:, n] = cur_word
    return captions
```
Some results:
Image Captioning with LSTMs
LSTM
step forward
Correction: the products on the right-hand side should be elementwise multiplications.
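For reference, the step implemented below computes (with \(\odot\) denoting elementwise multiplication and \(\sigma\) the sigmoid):
\[a = x_t W_x + h_{t-1} W_h + b,\qquad i = \sigma(a_i),\; f = \sigma(a_f),\; o = \sigma(a_o),\; g = \tanh(a_g)\]
\[c_t = f \odot c_{t-1} + i \odot g,\qquad h_t = o \odot \tanh(c_t)\]
where \(a\) is split column-wise into the four gate pre-activations \(a_i, a_f, a_o, a_g\).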
```python
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    next_h, next_c, cache = None, None, None
    N, H = prev_h.shape
    gates = prev_h.dot(Wh) + x.dot(Wx) + b
    gates[:, :3*H] = sigmoid(gates[:, :3*H])
    gates[:, 3*H:] = np.tanh(gates[:, 3*H:])
    i, f, o, g = gates[:, :H], gates[:, H:2*H], gates[:, 2*H:3*H], gates[:, 3*H:4*H]
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    cache = x, prev_h, prev_c, Wx, Wh, b, next_h, next_c, i, f, o, g
    return next_h, next_c, cache
```
step backward
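The gate gradients below come from \(\sigma'(x) = \sigma(x)(1-\sigma(x))\) for the \(i, f, o\) gates and \(\tanh'(x) = 1 - \tanh^2(x)\) for \(g\) and for \(\tanh(c_t)\), which is where the factors i*(1-i), f*(1-f), o*(1-o), and 1-g**2 originate.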
```python
def lstm_step_backward(dnext_h, dnext_c, cache):
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
    x, prev_h, prev_c, Wx, Wh, b, next_h, next_c, i, f, o, g = cache
    dnext_c = dnext_c + o * (1 - np.tanh(next_c) ** 2) * dnext_h
    df = dnext_c * prev_c
    di = dnext_c * g
    dg = dnext_c * i
    do = dnext_h * np.tanh(next_c)
    dgates = np.hstack((di*i*(1-i), df*f*(1-f), do*o*(1-o), dg*(1-g**2)))
    dx = dgates.dot(Wx.T)
    dprev_h = dgates.dot(Wh.T)
    dprev_c = dnext_c * f
    dWx = x.T.dot(dgates)
    dWh = prev_h.T.dot(dgates)
    db = np.sum(dgates, axis=0)
    return dx, dprev_h, dprev_c, dWx, dWh, db
```
forward
```python
def lstm_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    N, T, D = x.shape
    H = h0.shape[1]
    h, c, cache = np.zeros((N, T, H)), np.zeros((N, H)), [None] * T
    h[:, 0, :], c, cache[0] = lstm_step_forward(x[:, 0, :], h0, c, Wx, Wh, b)
    for i in range(1, T):
        h[:, i, :], c, cache[i] = lstm_step_forward(x[:, i, :], h[:, i-1, :], c, Wx, Wh, b)
    return h, cache
```
backward
```python
def lstm_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    N, D = cache[0][0].shape
    T, H = dh.shape[1:]
    dx, dprev_h, dWx, dWh, db, dc = (np.zeros((N, T, D)), np.zeros((N, H)), np.zeros((D, 4*H)),
                                     np.zeros((H, 4*H)), np.zeros((4*H,)), np.zeros((N, H)))
    for i in range(T-1, -1, -1):
        dx[:, i, :], dprev_h, dc, dWx_temp, dWh_temp, db_temp = lstm_step_backward(dh[:, i, :] + dprev_h, dc, cache[i])
        dWx, dWh, db = dWx + dWx_temp, dWh + dWh_temp, db + db_temp
    dh0 = dprev_h
    return dx, dh0, dWx, dWh, db
```
Overfit small data
LSTM test-time sampling
Network Visualization
Saliency Maps
The idea of a saliency map is to compute the gradient \(dx\) of the \(loss\) with respect to the input \(X\) and observe which parts of the image have the strongest influence on the classification.
Assuming the image has 3 channels, the saliency map is obtained by taking the absolute value of \(dx\) and then the maximum over the 3 channels.
```python
def compute_saliency_maps(X, y, model):
    saliency = None
    x = tf.convert_to_tensor(X, dtype=X.dtype)
    y = tf.convert_to_tensor(y, dtype=tf.int32)
    N = x.shape[0]
    with tf.GradientTape() as t:
        t.watch(x)
        score = model.call(x)
        correct_class = tf.gather_nd(score, tf.stack((tf.range(N), y), axis=1))
    # gradient of the correct-class scores with respect to the input images
    dx = t.gradient(correct_class, x)
    dx = tf.abs(dx)
    saliency = tf.reduce_max(dx, axis=3)
    return saliency
```
Fooling Images
Since this gradient highlights the parts of an image that matter most for classification, can we use it to update the original image and produce one that looks almost identical yet gets misclassified?
To do so, we treat a wrong class as the correct answer, compute the loss for that class, take its gradient with respect to the original image, and update the image with that gradient to see whether we can produce a convincing fake.
Incidentally, this update procedure is called gradient ascent.
```python
def make_fooling_image(X, target_y, model):
    X_fooling = X.copy()
    learning_rate = 1
    X_fooling = tf.convert_to_tensor(X_fooling)
    for i in range(100):
        with tf.GradientTape() as t:
            t.watch(X_fooling)
            score = model.call(X_fooling)
            pred_y = np.argmax(score[0])
            loss = score[0, target_y]
        g = t.gradient(loss, X_fooling)
        dx = learning_rate * g / tf.norm(g)
        X_fooling += dx
        if pred_y == target_y:
            break
    return X_fooling
```
Class visualization
Conversely, given a random image, can we use the same approach to make the model classify it as a chosen class?
Let \(I\) be an image, \(y\) the target class, and \(s_y(I)\) the score the model assigns to class \(y\) for \(I\).
We want to generate an image \(I^*\) whose score for class \(y\) is as high as possible:
\[
I^* = {\arg\max}_I (s_y(I) - R(I))
\]
where \(R\) is an L2 regularization term:
\[
R(I) = \lambda \|I\|_2^2
\]
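Gradient ascent therefore updates the image with the gradient of this objective:
\[\nabla_I \left(s_y(I) - R(I)\right) = \nabla_I s_y(I) - 2\lambda I\]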
```python
# one gradient-ascent update on the image X (run inside the generation loop)
with tf.GradientTape() as tape:
    tape.watch(X)
    score = model.call(X)
    loss = score[0, target_y]
# subtract the gradient of R(I) = l2_reg * ||X||^2, i.e. 2 * l2_reg * X
g = tape.gradient(loss, X) - 2 * l2_reg * X
X += learning_rate * g
```
Style Transfer
This part implements style transfer based on the ideas from the paper Image Style Transfer Using Convolutional Neural Networks.
The loss function is a weighted sum of three terms: content loss, style loss, and total variation loss.
Note that gradient descent here is not used to update the model parameters but the pixel values of the output image.
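To make that concrete, here is a minimal sketch of the pixel-level optimization loop, assuming the content_loss, style_loss, and tv_loss functions defined below; extract_features, initial_img, content_layer, content_target, style_layers, style_targets, and the various weights are hypothetical placeholders for the rest of the setup:

```python
import tensorflow as tf

img = tf.Variable(initial_img)                      # the image itself is the trainable variable
optimizer = tf.optimizers.Adam(learning_rate=3.0)

for t in range(200):
    with tf.GradientTape() as tape:
        feats = extract_features(img, model)        # feature maps of the current image
        loss = (content_loss(content_weight, feats[content_layer], content_target)
                + style_loss(feats, style_layers, style_targets, style_weights)
                + tv_loss(img, tv_weight))
    grad = tape.gradient(loss, img)
    optimizer.apply_gradients([(grad, img)])        # updates pixel values, not model weights
```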
Content Loss
The content loss is computed at a single layer. Let that layer's feature map be \(A^\ell \in \mathbb{R}^{1 \times H_\ell \times W_\ell \times C_\ell}\).
\(C_\ell\) is the number of filters/channels in layer \(\ell\), and \(H_\ell\) and \(W_\ell\) are the height and width of the feature map.
\(F^\ell \in \mathbb{R}^{M_\ell \times C_\ell}\) is the feature map of the current image and \(P^\ell \in \mathbb{R}^{M_\ell \times C_\ell}\) is the feature map of the content image, where \(M_\ell=H_\ell\times W_\ell\) is the number of pixels in each feature map; finally, \(w_c\) is the weight of the content loss term.
The content loss is defined as:
\(L_c = w_c \times \sum_{i,j} (F_{ij}^{\ell} - P_{ij}^{\ell})^2\)
```python
def content_loss(content_weight, content_current, content_original):
    content_current = tf.reshape(content_current, [-1, tf.shape(content_current)[-1]])
    content_original = tf.reshape(content_original, [-1, tf.shape(content_original)[-1]])
    return content_weight * tf.reduce_sum((content_current - content_original)**2)
```
Style Loss
We build a Gram matrix to represent the texture of an image.
Given a feature map \(F^\ell\) of size \(M_\ell \times C_\ell\), the Gram matrix has size \(C_\ell \times C_\ell\) and is defined as:
\[G_{ij}^\ell = \sum_k F^{\ell}_{ki} F^{\ell}_{kj}\]
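In matrix form this is simply \(G^\ell = (F^\ell)^\top F^\ell\), which is what the tf.matmul call in the code below computes.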
Let \(G^\ell\) be the Gram matrix computed from the current image's feature map, \(A^\ell\) the Gram matrix computed from the source style image's feature map, and \(w_\ell\) the weight of this term. The style loss for layer \(\ell\) is then the squared Euclidean distance between the two Gram matrices:
\[L_s^\ell = w_\ell \sum_{i, j} \left(G^\ell_{ij} - A^\ell_{ij}\right)^2\]
The final style loss is the sum of the per-layer losses:
\[L_s = \sum_{\ell \in \mathcal{L}} L_s^\ell\]
```python
def gram_matrix(features, normalize=True):
    _, H, W, C = tf.shape(features)
    features = tf.reshape(features, [-1, C])
    gram = tf.matmul(tf.transpose(features), features)
    if normalize:
        gram /= tf.cast(H * W * C, tf.float32)
    return gram
```
```python
def style_loss(feats, style_layers, style_targets, style_weights):
    style_loss = 0
    for i, layer in enumerate(style_layers):
        style_loss += style_weights[i] * tf.reduce_sum((gram_matrix(feats[layer]) - style_targets[i])**2)
    return style_loss
```
Total-variation regularization
To encourage smoothness in the generated image, we add a regularization term that penalizes wiggles, also known as total variation.
It is computed by summing the squared differences between neighboring pixels (horizontally and vertically, over the 3 RGB channels):
\(L_{tv} = w_t \times \left(\sum_{c=1}^3\sum_{i=1}^{H-1}\sum_{j=1}^{W} (x_{i+1,j,c} - x_{i,j,c})^2 + \sum_{c=1}^3\sum_{i=1}^{H}\sum_{j=1}^{W - 1} (x_{i,j+1,c} - x_{i,j,c})^2\right)\)
```python
def tv_loss(img, tv_weight):
    horizontal_loss = tf.reduce_sum((img[:, 1:, :, :] - img[:, :-1, :, :])**2)
    vertical_loss = tf.reduce_sum((img[:, :, 1:, :] - img[:, :, :-1, :])**2)
    return tv_weight * (horizontal_loss + vertical_loss)
```
Results
Generative Adversarial Networks (GANs)
A generative adversarial network (GAN) consists of two neural networks: a discriminator and a generator.
The discriminator is simply a binary classifier whose goal is to decide whether an input image is real (drawn from the training set) or fake (not from the training set).
The generator's goal is to produce images similar to the training set and confuse the discriminator into labeling them as real.
We can picture this as a game played back and forth between the discriminator \((D)\) and the generator \((G)\). Early on, the discriminator easily tells whether an input image is real, but as the game progresses the generator gradually produces images the discriminator gets wrong, and the discriminator improves in turn. Repeating this process, once the discriminator can no longer tell whether an input image is real, the generator has learned to produce images that are very close to real.
\[\underset{G}{\text{minimize}}\; \underset{D}{\text{maximize}}\; \mathbb{E}_{x \sim p_\text{data}}\left[\log D(x)\right] + \mathbb{E}_{z \sim p(z)}\left[\log \left(1-D(G(z))\right)\right]\]
We can also view this process as a minimax game that alternates between two steps:
Gradient descent on the generator \((G)\) to minimize the probability of the discriminator making the correct decision
Gradient ascent on the discriminator \((D)\) to maximize the probability of the discriminator making the correct decision
In practice this does not work well, because once the discriminator becomes strong, the generator suffers from vanishing gradients.
We therefore usually tweak the generator's update: maximize the probability of the discriminator making the wrong decision.
The new game thus alternates between:
Gradient ascent on the generator \((G)\) to maximize the probability of the discriminator making the wrong decision
\[\underset{G}{\text{maximize}}\; \mathbb{E}_{z \sim p(z)}\left[\log D(G(z))\right]\]
Gradient ascent on the discriminator \((D)\) to maximize the probability of the discriminator making the correct decision
\[\underset{D}{\text{maximize}}\; \mathbb{E}_{x \sim p_\text{data}}\left[\log D(x)\right] + \mathbb{E}_{z \sim p(z)}\left[\log \left(1-D(G(z))\right)\right]\]
Vanilla GAN
Leaky ReLU
Leaky ReLU avoids the dying-ReLU problem and is commonly used in GAN implementations.
\[f(x)=\max(\alpha x, x)\]
```python
def leaky_relu(x, alpha=0.01):
    return tf.maximum(alpha * x, x)
```
Random Noise
Generate random noise uniformly distributed in \([-1,1]\).
```python
def sample_noise(batch_size, dim):
    return tf.random.uniform(shape=(batch_size, dim), minval=-1, maxval=1)
```
Discriminator
```python
def discriminator():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, input_shape=(784,), activation=leaky_relu),
        tf.keras.layers.Dense(256, activation=leaky_relu),
        tf.keras.layers.Dense(1)
    ])
    return model
```
Generator
```python
def generator(noise_dim=NOISE_DIM):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1024, input_shape=(noise_dim,), activation=tf.nn.relu),
        tf.keras.layers.Dense(1024, activation=tf.nn.relu),
        tf.keras.layers.Dense(784, activation=tf.nn.tanh)
    ])
    return model
```
GAN Loss
Generator Loss: \[\ell_G = -\mathbb{E}_{z \sim p(z)}\left[\log D(G(z))\right]\]
Discriminator loss: \[ \ell_D = -\mathbb{E}_{x \sim p_\text{data}}\left[\log D(x)\right] - \mathbb{E}_{z \sim p(z)}\left[\log \left(1-D(G(z))\right)\right]\]
```python
def discriminator_loss(logits_real, logits_fake):
    loss = None
    cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    real_loss = cross_entropy(tf.ones_like(logits_real), logits_real)
    fake_loss = cross_entropy(tf.zeros_like(logits_fake), logits_fake)
    loss = real_loss + fake_loss
    return loss


def generator_loss(logits_fake):
    loss = None
    cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    loss = cross_entropy(tf.ones_like(logits_fake), logits_fake)
    return loss
```
Optimizing the loss
```python
def get_solvers(learning_rate=1e-3, beta1=0.5):
    # use the passed-in hyperparameters instead of hardcoding them
    D_solver = tf.optimizers.Adam(learning_rate=learning_rate, beta_1=beta1)
    G_solver = tf.optimizers.Adam(learning_rate=learning_rate, beta_1=beta1)
    return D_solver, G_solver
```
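Putting the pieces together, here is a minimal sketch of one alternating update using the components above (D, G, sample_noise, the two losses, and the solvers); real_images is assumed to be a (batch_size, 784) batch scaled to match the generator's tanh output, and the outer training loop is omitted:

```python
def train_step(real_images, D, G, D_solver, G_solver, batch_size, noise_dim):
    # discriminator step: push D toward correct decisions on real and fake images
    with tf.GradientTape() as tape:
        z = sample_noise(batch_size, noise_dim)
        logits_real = D(real_images)
        logits_fake = D(G(z))
        d_loss = discriminator_loss(logits_real, logits_fake)
    d_grads = tape.gradient(d_loss, D.trainable_variables)
    D_solver.apply_gradients(zip(d_grads, D.trainable_variables))

    # generator step: push G toward making D label its samples as real
    with tf.GradientTape() as tape:
        z = sample_noise(batch_size, noise_dim)
        logits_fake = D(G(z))
        g_loss = generator_loss(logits_fake)
    g_grads = tape.gradient(g_loss, G.trainable_variables)
    G_solver.apply_gradients(zip(g_grads, G.trainable_variables))
    return d_loss, g_loss
```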
Results
Least Squares GAN
Least Squares GAN differs only in its loss functions.
Generator loss: \[\ell_G = \frac{1}{2}\mathbb{E}_{z \sim p(z)}\left[\left(D(G(z))-1\right)^2\right]\]
Discriminator loss: \[ \ell_D = \frac{1}{2}\mathbb{E}_{x \sim p_\text{data}}\left[\left(D(x)-1\right)^2\right] + \frac{1}{2}\mathbb{E}_{z \sim p(z)}\left[ \left(D(G(z))\right)^2\right]\]
```python
def ls_discriminator_loss(scores_real, scores_fake):
    loss = None
    real_loss = 0.5 * tf.reduce_mean((scores_real - tf.ones_like(scores_real)) ** 2)
    fake_loss = 0.5 * tf.reduce_mean(scores_fake ** 2)
    loss = real_loss + fake_loss
    return loss


def ls_generator_loss(scores_fake):
    loss = None
    loss = 0.5 * tf.reduce_mean((scores_fake - tf.ones_like(scores_fake)) ** 2)
    return loss
```
Results
Deep Convolutional GANs
DCGAN implements the generator and the discriminator with convolutional neural networks.
```python
def discriminator():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Reshape((28, 28, 1), input_shape=(784,)),
        tf.keras.layers.Conv2D(filters=32, kernel_size=(5, 5), strides=(1, 1), padding='valid'),
        tf.keras.layers.LeakyReLU(alpha=0.01),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        tf.keras.layers.Conv2D(filters=64, kernel_size=(5, 5), strides=(1, 1), padding='valid'),
        tf.keras.layers.LeakyReLU(alpha=0.01),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(4*4*64),
        tf.keras.layers.LeakyReLU(alpha=0.01),
        tf.keras.layers.Dense(1)
    ])
    return model

model = discriminator()
test_discriminator(1102721)
```
```python
def generator(noise_dim=NOISE_DIM):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1024, use_bias=True, input_shape=(noise_dim,), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(7*7*128, use_bias=True, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Reshape((7, 7, 128)),
        tf.keras.layers.Conv2DTranspose(filters=64, kernel_size=(4, 4), strides=(2, 2), padding='same',
                                        activation='relu', use_bias=True),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Conv2DTranspose(filters=1, kernel_size=(4, 4), strides=(2, 2), padding='same',
                                        activation='tanh', use_bias=True),
    ])
    return model

test_generator(6595521)
```
Results