CS231n assignment 3

2019-05-29
School coursework / Image recognition

Introduction

  1. Implement an RNN and apply it to image captioning
  2. Implement an LSTM and apply it to image captioning
  3. Neural network visualization
  4. Style transfer applied to artwork
  5. Implement a Generative Adversarial Network and apply it to MNIST

Image Captioning with RNNs

Dataset

The Microsoft COCO dataset (2014 version), with 80,000 training images and 40,000 validation images in total; each image comes with 5 captions.

Example captions (images omitted):

<START> a group of people fly their kites in a field of flowers <END>
<START> a dirt road a wooden bench some grass and trees <END>
<START> a bird sitting on a tree <UNK> in a <UNK> <END>

Vanilla RNN

step forward

The vanilla RNN step is computed as follows:

\(h_t=tanh(W_h\cdot h_{t-1}+W_x\cdot X_t+b)\)

\(tanh(x)=\dfrac{e^{2x}-1}{e^{2x}+1}\)

However, I use numpy's built-in tanh here, which is more stable in practice; I tried defining my own tanh function, and for some reason it caused exploding gradients.
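My guess (not verified) is that the closed-form expression \(\frac{e^{2x}-1}{e^{2x}+1}\) overflows to inf for large \(|x|\) and produces NaN, whereas np.tanh is evaluated in a numerically stable way. A small sketch of the difference:

import numpy as np

def naive_tanh(x):
    # Direct translation of the formula above; e^{2x} overflows for large x.
    e = np.exp(2 * x)
    return (e - 1) / (e + 1)

x = np.array([-1000.0, 0.0, 1000.0])
print(naive_tanh(x))   # [-1.  0. nan]  (inf/inf -> nan, plus an overflow warning)
print(np.tanh(x))      # [-1.  0.  1.]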

def rnn_step_forward(x, prev_h, Wx, Wh, b):
    next_h, cache = None, None

    next_h = np.tanh(prev_h.dot(Wh) + x.dot(Wx) + b)
    cache = x, prev_h, Wx, Wh, b, next_h

    return next_h, cache

step backward

def rnn_step_backward(dnext_h, cache):
    dx, dprev_h, dWx, dWh, db = None, None, None, None, None

    x, prev_h, Wx, Wh, b, next_h = cache

    dtanh = dnext_h * (1 - next_h ** 2)
    dx = dtanh.dot(Wx.T)
    dprev_h = dtanh.dot(Wh.T)
    dWx = x.T.dot(dtanh)
    dWh = prev_h.T.dot(dtanh)
    db = np.sum(dtanh, axis=0)

    return dx, dprev_h, dWx, dWh, db
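A quick sanity check for the backward pass is a centered numeric gradient. Below is a minimal self-contained sketch (the assignment ships its own gradient-check helpers; this is just an illustration using the two functions above):

import numpy as np

def num_grad(f, x, dout, h=1e-5):
    # Centered-difference numeric gradient of f at x, contracted with the upstream gradient dout.
    grad = np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        old = x[idx]
        x[idx] = old + h
        pos = f(x)
        x[idx] = old - h
        neg = f(x)
        x[idx] = old
        grad[idx] = np.sum((pos - neg) * dout) / (2 * h)
    return grad

np.random.seed(0)
N, D, H = 3, 4, 5
x, prev_h = np.random.randn(N, D), np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)
dnext_h = np.random.randn(N, H)

_, cache = rnn_step_forward(x, prev_h, Wx, Wh, b)
dx, dprev_h, dWx, dWh, db = rnn_step_backward(dnext_h, cache)

dx_num = num_grad(lambda v: rnn_step_forward(v, prev_h, Wx, Wh, b)[0], x, dnext_h)
print(np.max(np.abs(dx - dx_num)))  # should be very small, e.g. below 1e-8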

forward

Each example has T timesteps, so we loop over time and apply the RNN step once per timestep.

def rnn_forward(x, h0, Wx, Wh, b):
    h, cache = None, None

    N, T, D = x.shape
    H = h0.shape[1]
    h, cache = np.empty((N, T, H)), [None] * T
    h[:,0,:], cache[0] = rnn_step_forward(x[:,0,:], h0, Wx, Wh, b)

    for i in range(1, T):
        h[:,i,:], cache[i] = rnn_step_forward(x[:,i,:], h[:,i-1,:], Wx, Wh, b)

    return h, cache
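A quick usage check with made-up dimensions (assuming numpy is imported and the functions above are in scope):

np.random.seed(0)
N, T, D, H = 2, 3, 4, 5
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)

h, cache = rnn_forward(x, h0, Wx, Wh, b)
print(h.shape)     # (2, 3, 5): a hidden state for every example at every timestep
print(len(cache))  # 3: one cache entry per timestep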

backward

def rnn_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None

    N, T, H = dh.shape
    D = cache[0][0].shape[1]
    dx, dprev_h, dWx, dWh, db = (np.empty((N, T, D)), np.zeros((N, H)),
                                 np.zeros((D, H)), np.zeros((H, H)), np.zeros((H,)))

    for i in range(T-1, -1, -1):
        dx[:,i,:], dprev_h, dWx_temp, dWh_temp, db_temp = rnn_step_backward(dh[:,i,:] + dprev_h, cache[i])
        dWx, dWh, db = dWx + dWx_temp, dWh + dWh_temp, db + db_temp

    dh0 = dprev_h

    return dx, dh0, dWx, dWh, db

Word Embedding

forward

def word_embedding_forward(x, W):
    out, cache = None, None

    # Integer-array indexing: each word index in x picks out its row of W.
    out = W[x, :]
    cache = x, W

    return out, cache

backward

def word_embedding_backward(dout, cache):
    dW = None

    x, W = cache
    dW = np.zeros_like(W)
    # Scatter-add: accumulates gradients even when the same word index appears multiple times.
    np.add.at(dW, x, dout)

    return dW
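np.add.at is used rather than plain fancy-index assignment because it accumulates contributions when the same word index appears more than once; a small illustration:

dW = np.zeros((4, 2))
idx = np.array([0, 2, 0])   # word 0 appears twice
dout = np.ones((3, 2))

np.add.at(dW, idx, dout)
print(dW[0])                # [2. 2.]  both contributions are summed

dW2 = np.zeros((4, 2))
dW2[idx] += dout            # buffered: the second write to row 0 overwrites the first
print(dW2[0])               # [1. 1.]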

RNN for image captioning

def loss(self, features, captions):
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # You'll need this
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # Forward: image features -> initial hidden state -> RNN over embedded captions -> vocab scores
    feature_out, feature_cache = affine_forward(features, W_proj, b_proj)
    word_embed_out, word_embed_cache = word_embedding_forward(captions_in, W_embed)
    rnn_out, rnn_cache = rnn_forward(word_embed_out, feature_out, Wx, Wh, b)
    score, score_cache = temporal_affine_forward(rnn_out, W_vocab, b_vocab)
    loss, dout = temporal_softmax_loss(score, captions_out, mask)

    # Backward: propagate through the same layers in reverse order
    dscore, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dout, score_cache)
    drnn, dh0_rnn, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dscore, rnn_cache)
    grads['W_embed'] = word_embedding_backward(drnn, word_embed_cache)
    dx_feature, grads['W_proj'], grads['b_proj'] = affine_backward(dh0_rnn, feature_cache)

    return loss, grads

Overfit small data

Test-time sampling

def sample(self, features, max_length=30):
    N = features.shape[0]
    captions = self._null * np.ones((N, max_length), dtype=np.int32)

    # Unpack parameters
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    # The initial hidden state comes from the image features; every sequence starts with <START>.
    hidden, _ = affine_forward(features, W_proj, b_proj)
    cur_word = self._start * np.ones(N, dtype=np.int32)
    for n in range(max_length):
        word_embed, _ = word_embedding_forward(cur_word, W_embed)
        hidden, _ = rnn_step_forward(word_embed, hidden, Wx, Wh, b)
        score, _ = affine_forward(hidden, W_vocab, b_vocab)
        # Greedy decoding: pick the highest-scoring word at each step.
        cur_word = np.argmax(score, axis=1)
        captions[:, n] = cur_word

    return captions

Some results (images omitted):

Image Captioning with LSTMs

LSTM

step forward

Correction: the products on the right-hand side of the gate equations should be elementwise multiplications.
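For reference, the step computed by the code below, which splits the activation vector into the four gates \(i, f, o, g\) (\(\odot\) denotes elementwise multiplication and \(\sigma\) the sigmoid):

\[
\begin{aligned}
a &= W_h h_{t-1} + W_x x_t + b \in \mathbb{R}^{4H} \\
i &= \sigma(a_{1:H}),\quad f = \sigma(a_{H+1:2H}),\quad o = \sigma(a_{2H+1:3H}),\quad g = \tanh(a_{3H+1:4H}) \\
c_t &= f \odot c_{t-1} + i \odot g \\
h_t &= o \odot \tanh(c_t)
\end{aligned}
\]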
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    next_h, next_c, cache = None, None, None
    N, H = prev_h.shape

    # A single affine transform produces all four gates, stacked along the last axis.
    gates = prev_h.dot(Wh) + x.dot(Wx) + b
    gates[:,:3*H] = sigmoid(gates[:,:3*H])
    gates[:,3*H:] = np.tanh(gates[:,3*H:])
    i, f, o, g = gates[:,:H], gates[:,H:2*H], gates[:,2*H:3*H], gates[:,3*H:4*H]

    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)

    cache = x, prev_h, prev_c, Wx, Wh, b, next_h, next_c, i, f, o, g

    return next_h, next_c, cache

step backward

def lstm_step_backward(dnext_h, dnext_c, cache):
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None

    x, prev_h, prev_c, Wx, Wh, b, next_h, next_c, i, f, o, g = cache

    # The cell-state gradient gets contributions from both dnext_c and dnext_h.
    dnext_c = dnext_c + o * (1 - np.tanh(next_c) ** 2) * dnext_h
    df = dnext_c * prev_c
    di = dnext_c * g
    dg = dnext_c * i
    do = dnext_h * np.tanh(next_c)
    dgates = np.hstack((di*i*(1-i), df*f*(1-f), do*o*(1-o), dg*(1-g**2)))

    dx = dgates.dot(Wx.T)
    dprev_h = dgates.dot(Wh.T)
    dprev_c = dnext_c * f
    dWx = x.T.dot(dgates)
    dWh = prev_h.T.dot(dgates)
    db = np.sum(dgates, axis=0)

    return dx, dprev_h, dprev_c, dWx, dWh, db

forward

def lstm_forward(x, h0, Wx, Wh, b):
    h, cache = None, None

    N, T, D = x.shape
    H = h0.shape[1]
    h, c, cache = np.zeros((N, T, H)), np.zeros((N, H)), [None] * T
    h[:,0,:], c, cache[0] = lstm_step_forward(x[:,0,:], h0, c, Wx, Wh, b)

    for i in range(1, T):
        h[:,i,:], c, cache[i] = lstm_step_forward(x[:,i,:], h[:,i-1,:], c, Wx, Wh, b)

    return h, cache

backward

def lstm_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None

    N, D = cache[0][0].shape
    T, H = dh.shape[1:]
    dx, dprev_h, dWx, dWh, db, dc = (np.zeros((N, T, D)), np.zeros((N, H)), np.zeros((D, 4*H)),
                                     np.zeros((H, 4*H)), np.zeros((4*H,)), np.zeros((N, H)))

    for i in range(T-1, -1, -1):
        dx[:,i,:], dprev_h, dc, dWx_temp, dWh_temp, db_temp = lstm_step_backward(dh[:,i,:] + dprev_h, dc, cache[i])
        dWx, dWh, db = dWx + dWx_temp, dWh + dWh_temp, db + db_temp

    dh0 = dprev_h

    return dx, dh0, dWx, dWh, db

Overfit small data

LSTM test-time sampling

Network Visualization

Saliency Maps

The idea of a saliency map is to compute the gradient \(dx\) of the loss with respect to the input image \(X\), and observe which parts of the image have the most significant influence on the classification.

Assuming the image has 3 channels, the saliency map is obtained by taking the absolute value of \(dx\) and then the maximum over the 3 channels.
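Written out (restating the description above, with \(s_y\) the correct-class score that the code below differentiates):

\[ \text{saliency}_{ij} = \max_{c \in \{1,2,3\}} \left| \frac{\partial s_y}{\partial X_{i,j,c}} \right| \]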

def compute_saliency_maps(X, y, model):
    saliency = None

    x = tf.convert_to_tensor(X, dtype=X.dtype)
    y = tf.convert_to_tensor(y, dtype=tf.int32)
    N = x.shape[0]

    with tf.GradientTape() as t:
        t.watch(x)
        score = model.call(x)
        # Gather the score of the correct class for each image.
        correct_class = tf.gather_nd(score, tf.stack((tf.range(N), y), axis=1))

    # Gradient of the correct-class scores with respect to the input pixels.
    dx = t.gradient(correct_class, x)
    dx = tf.abs(dx)
    saliency = tf.reduce_max(dx, axis=3)

    return saliency

Fooling Images

Since this gradient highlights the parts of an image that matter most for classification, can we use it to update the original image and produce one that looks almost the same but is misclassified?

To do this, we treat an incorrect class as the correct answer when computing the loss (here, the target-class score), take the gradient of that loss with respect to the original image, and use the gradient to update the image, to see whether we can produce a convincing fake.

Incidentally, this update procedure is called gradient ascent.
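Written out, the update performed inside the loop below (with \(\alpha\) the learning rate and the gradient of the target-class score normalized to unit L2 norm):

\[ X \leftarrow X + \alpha \, \frac{\nabla_X s_{y_\text{target}}(X)}{\left\| \nabla_X s_{y_\text{target}}(X) \right\|_2} \]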

def make_fooling_image(X, target_y, model):
    # Make a copy of the input that we will modify
    X_fooling = X.copy()

    # Step size for the update
    learning_rate = 1

    X_fooling = tf.convert_to_tensor(X_fooling)

    for i in range(100):
        with tf.GradientTape() as t:
            t.watch(X_fooling)
            score = model.call(X_fooling)
            pred_y = np.argmax(score[0])
            loss = score[0, target_y]

        # Normalized gradient ascent step on the target-class score.
        g = t.gradient(loss, X_fooling)
        dx = learning_rate * g / tf.norm(g)
        X_fooling += dx

        # Stop once the model predicts the target class.
        if pred_y == target_y:
            break

    return X_fooling

Class visualization

Conversely, given a random image, can we use the same method to make the model classify it as a chosen class?

Let \(I\) be an image and \(y\) the target class; the model's score for \(I\) on class \(y\) is \(s_y(I)\).

We want to generate an image \(I^*\) whose score for class \(y\) is as high as possible:

\[ I^* = {\arg\max}_I (s_y(I) - R(I)) \]

where \(R\) is an L2 regularization term:

\[ R(I) = \lambda \|I\|_2^2 \]
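The corresponding gradient ascent update on the image pixels, which the snippet below performs:

\[ I \leftarrow I + \alpha \left( \nabla_I s_y(I) - 2\lambda I \right) \]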

with tf.GradientTape() as tape:
    tape.watch(X)
    score = model.call(X)
    loss = score[0, target_y]

# Ascent on s_y(I) - l2_reg * ||I||^2: the regularizer contributes -2 * l2_reg * X to the gradient.
g = tape.gradient(loss, X) - 2 * l2_reg * X
X += learning_rate * g

Results at iteration 1/100, 25/100, 50/100, 75/100, and 100/100 (images omitted).

Style Transfer

This part implements style transfer based on the ideas from the paper Image Style Transfer Using Convolutional Neural Networks.

The loss function is a weighted sum of three terms: the content loss, the style loss, and the total variation loss.

Note that gradient descent here does not update the model parameters; it updates the pixel values of the output image.

Content Loss

We only compute the content loss at a single layer; let that layer's feature map be \(A^\ell \in \mathbb{R}^{1 \times H_\ell \times W_\ell \times C_\ell}\).

\(C_\ell\) is the number of filters/channels in layer \(\ell\), and \(H_\ell\) and \(W_\ell\) are the height and width of the feature map.

Let \(F^\ell \in \mathbb{R}^{M_\ell \times C_\ell}\) be the feature map of the current image and \(P^\ell \in \mathbb{R}^{M_\ell \times C_\ell}\) the feature map of the content image, where \(M_\ell = H_\ell \times W_\ell\) is the number of pixels in each feature map; finally, \(w_c\) is the weight of the content loss term.

The content loss is defined as:

\(L_c = w_c \times \sum_{i,j} (F_{ij}^{\ell} - P_{ij}^{\ell})^2\)

def content_loss(content_weight, content_current, content_original):
    # Flatten the spatial dimensions: (1, H, W, C) -> (H*W, C)
    content_current = tf.reshape(content_current, [-1, tf.shape(content_current)[-1]])
    content_original = tf.reshape(content_original, [-1, tf.shape(content_original)[-1]])

    return content_weight * tf.reduce_sum((content_current - content_original)**2)

Style Loss

We build a Gram matrix to represent the texture of an image.

Given a feature map \(F^\ell\) of size \(M_\ell \times C_\ell\), the Gram matrix has size \(C_\ell \times C_\ell\) and is defined as:

\[G_{ij}^\ell = \sum_k F^{\ell}_{ki} F^{\ell}_{kj}\]

Let \(G^\ell\) be the Gram matrix computed from the current image's feature map and \(A^\ell\) the Gram matrix computed from the original style image's feature map, and let \(w_\ell\) be the weight of this term; the style loss for layer \(\ell\) is then the (squared) Euclidean distance between the two Gram matrices:

\[L_s^\ell = w_\ell \sum_{i, j} \left(G^\ell_{ij} - A^\ell_{ij}\right)^2\]

The final style loss is the sum of the per-layer losses over the chosen layers:

\[L_s = \sum_{\ell \in \mathcal{L}} L_s^\ell\]

def gram_matrix(features, normalize=True):
    _, H, W, C = tf.shape(features)

    # Flatten the spatial dimensions: (1, H, W, C) -> (H*W, C)
    features = tf.reshape(features, [-1, C])

    gram = tf.matmul(tf.transpose(features), features)

    if normalize:
        gram /= tf.cast(H * W * C, tf.float32)

    return gram

def style_loss(feats, style_layers, style_targets, style_weights):
    style_loss = 0

    for i, layer in enumerate(style_layers):
        style_loss += style_weights[i] * tf.reduce_sum((gram_matrix(feats[layer]) - style_targets[i])**2)

    return style_loss

Total-variation regularization

To make the image smoother, we add a regularization term that penalizes wiggles, i.e. the total variation.

This is done by summing the squared differences between neighboring pixels (in both the horizontal and vertical directions, over all 3 RGB channels).

\(L_{tv} = w_t \times \left(\sum_{c=1}^3\sum_{i=1}^{H-1}\sum_{j=1}^{W} (x_{i+1,j,c} - x_{i,j,c})^2 + \sum_{c=1}^3\sum_{i=1}^{H}\sum_{j=1}^{W - 1} (x_{i,j+1,c} - x_{i,j,c})^2\right)\)

def tv_loss(img, tv_weight):
    horizontal_loss = tf.reduce_sum((img[:,1:,:,:] - img[:,:-1,:,:])**2)
    vertical_loss = tf.reduce_sum((img[:,:,1:,:] - img[:,:,:-1,:])**2)
    return tv_weight * (horizontal_loss + vertical_loss)

Results

Results at iteration 0, 100, and 199 (images omitted).

Generative Adversarial Networks (GANs)

A generative adversarial network (GAN) consists of two neural networks: a discriminator and a generator.

The discriminator is a binary classifier whose goal is to decide whether an input image is real (from the training set) or fake (not from the training set).

The generator's goal is to produce images similar to those in the training set and fool the discriminator into judging them as real.

We can picture this as a game in which the discriminator \((D)\) and the generator \((G)\) compete back and forth. At the beginning, the discriminator can easily tell whether an input image is real; as the game proceeds, the generator gradually learns to produce images that the discriminator gets wrong, and the discriminator improves in response. Repeating this process, once the discriminator can no longer tell whether an input image is real, the generator is producing images that are very close to real ones.

\[\underset{G}{\text{minimize}}\; \underset{D}{\text{maximize}}\; \mathbb{E}_{x \sim p_\text{data}}\left[\log D(x)\right] + \mathbb{E}_{z \sim p(z)}\left[\log \left(1-D(G(z))\right)\right]\]

We can also think of this process as a minimax game that repeats the following two steps:

  1. Gradient descent on the generator \((G)\) to minimize the probability that the discriminator makes the correct decision

  2. Gradient ascent on the discriminator \((D)\) to maximize the probability that the discriminator makes the correct decision

In practice this does not work well, because once the discriminator becomes very strong, the generator suffers from vanishing gradients.

Therefore the generator update is usually modified slightly: maximize the probability that the discriminator makes the wrong decision.

The new version of the game is then:

  1. Gradient ascent on the generator \((G)\) to maximize the probability that the discriminator makes the wrong decision

\[\underset{G}{\text{maximize}}\; \mathbb{E}_{z \sim p(z)}\left[\log D(G(z))\right]\]

  2. Gradient ascent on the discriminator \((D)\) to maximize the probability that the discriminator makes the correct decision

\[\underset{D}{\text{maximize}}\; \mathbb{E}_{x \sim p_\text{data}}\left[\log D(x)\right] + \mathbb{E}_{z \sim p(z)}\left[\log \left(1-D(G(z))\right)\right]\]

Vanilla GAN

Leaky ReLU

Leaky ReLU alleviates the dying ReLU problem and is commonly used in GAN implementations.

\[f(x)=\max(\alpha x, x)\]

def leaky_relu(x, alpha=0.01):
    return tf.maximum(alpha * x, x)

Random Noise

Generate random noise uniformly in \([-1,1]\).

def sample_noise(batch_size, dim):
    return tf.random.uniform(shape=(batch_size, dim), minval=-1, maxval=1)

Discriminator

def discriminator():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, input_shape=(784,), activation=leaky_relu),
        tf.keras.layers.Dense(256, activation=leaky_relu),
        tf.keras.layers.Dense(1)
    ])
    return model

Generator

def generator(noise_dim=NOISE_DIM):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1024, input_shape=(noise_dim,), activation=tf.nn.relu),
        tf.keras.layers.Dense(1024, activation=tf.nn.relu),
        tf.keras.layers.Dense(784, activation=tf.nn.tanh)
    ])
    return model

GAN Loss

Generator loss: \[\ell_G = -\mathbb{E}_{z \sim p(z)}\left[\log D(G(z))\right]\]

Discriminator loss: \[ \ell_D = -\mathbb{E}_{x \sim p_\text{data}}\left[\log D(x)\right] - \mathbb{E}_{z \sim p(z)}\left[\log \left(1-D(G(z))\right)\right]\]
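These losses correspond exactly to binary cross-entropy on logits, which is how the code below implements them. Writing \(D(\cdot) = \sigma(s)\) for a logit \(s\):

\[ \text{BCE}(y, s) = -y\log\sigma(s) - (1-y)\log\left(1-\sigma(s)\right) \]

so labeling real logits with 1 and fake logits with 0 recovers \(\ell_D\), and labeling fake logits with 1 recovers \(\ell_G\); the expectations become averages over the batch.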

def discriminator_loss(logits_real, logits_fake):
    loss = None

    cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # Real images are labeled 1, fake images 0.
    real_loss = cross_entropy(tf.ones_like(logits_real), logits_real)
    fake_loss = cross_entropy(tf.zeros_like(logits_fake), logits_fake)

    loss = real_loss + fake_loss

    return loss

def generator_loss(logits_fake):
    loss = None

    cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # The generator wants fake images to be labeled as real (1).
    loss = cross_entropy(tf.ones_like(logits_fake), logits_fake)

    return loss

Optimizing the loss

def get_solvers(learning_rate=1e-3, beta1=0.5):
    D_solver = None
    G_solver = None

    # Use the function arguments rather than hard-coded values.
    D_solver = tf.optimizers.Adam(learning_rate=learning_rate, beta_1=beta1)
    G_solver = tf.optimizers.Adam(learning_rate=learning_rate, beta_1=beta1)

    return D_solver, G_solver
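For context, here is a minimal sketch of one alternating training iteration built from the pieces above (the assignment's own run_a_gan loop handles batching, logging, and image display differently; train_step, D, G, and batch_size are names introduced here for illustration):

D, G = discriminator(), generator()
D_solver, G_solver = get_solvers()
batch_size, noise_dim = 128, NOISE_DIM

def train_step(x):
    # x: a batch of real images flattened to shape (batch_size, 784), scaled to [-1, 1].
    # Discriminator update: maximize log D(x) + log(1 - D(G(z))).
    with tf.GradientTape() as tape:
        z = sample_noise(batch_size, noise_dim)
        logits_real = D(x)
        logits_fake = D(G(z))
        d_loss = discriminator_loss(logits_real, logits_fake)
    D_solver.apply_gradients(zip(tape.gradient(d_loss, D.trainable_variables),
                                 D.trainable_variables))

    # Generator update: maximize log D(G(z)).
    with tf.GradientTape() as tape:
        z = sample_noise(batch_size, noise_dim)
        logits_fake = D(G(z))
        g_loss = generator_loss(logits_fake)
    G_solver.apply_gradients(zip(tape.gradient(g_loss, G.trainable_variables),
                                 G.trainable_variables))

    return d_loss, g_loss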

Results

Least Squares GAN

The only difference in Least Squares GAN is the loss function.

Generator loss: \[\ell_G = \frac{1}{2}\mathbb{E}_{z \sim p(z)}\left[\left(D(G(z))-1\right)^2\right]\]

Discriminator loss: \[ \ell_D = \frac{1}{2}\mathbb{E}_{x \sim p_\text{data}}\left[\left(D(x)-1\right)^2\right] + \frac{1}{2}\mathbb{E}_{z \sim p(z)}\left[ \left(D(G(z))\right)^2\right]\]

def ls_discriminator_loss(scores_real, scores_fake):
    loss = None

    real_loss = 0.5 * tf.reduce_mean((scores_real - tf.ones_like(scores_real)) ** 2)
    fake_loss = 0.5 * tf.reduce_mean(scores_fake ** 2)

    loss = real_loss + fake_loss

    return loss

def ls_generator_loss(scores_fake):
    loss = None

    loss = 0.5 * tf.reduce_mean((scores_fake - tf.ones_like(scores_fake)) ** 2)

    return loss

Results

Deep Convolutional GANs

DCGAN implements the generator and discriminator with convolutional neural networks.

def discriminator():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Reshape((28, 28, 1), input_shape=(784,)),
        tf.keras.layers.Conv2D(filters=32, kernel_size=(5, 5), strides=(1, 1), padding='valid'),
        tf.keras.layers.LeakyReLU(alpha=0.01),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        tf.keras.layers.Conv2D(filters=64, kernel_size=(5, 5), strides=(1, 1), padding='valid'),
        tf.keras.layers.LeakyReLU(alpha=0.01),
        tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(4*4*64),
        tf.keras.layers.LeakyReLU(alpha=0.01),
        tf.keras.layers.Dense(1)
    ])
    return model

model = discriminator()
test_discriminator(1102721)
def generator(noise_dim=NOISE_DIM):
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(1024, use_bias=True, input_shape=(noise_dim,), activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(7*7*128, use_bias=True, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Reshape((7, 7, 128)),
        tf.keras.layers.Conv2DTranspose(filters=64, kernel_size=(4, 4), strides=(2, 2), padding='same', activation='relu', use_bias=True),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Conv2DTranspose(filters=1, kernel_size=(4, 4), strides=(2, 2), padding='same', activation='tanh', use_bias=True)
    ])
    return model

test_generator(6595521)

Results