# Ablation_Tests_MNIST.py

# -*- coding: utf-8 -*-
"""
Created on Dec 21 14:57:02 2019
@author: Learning Deep Kernels for Two-sample Test
@Implementation of MMD-D and baselines in our paper on MNIST dataset

BEFORE USING THIS CODE:
1. This code requires PyTorch 1.1.0, which can be found in
https://pytorch.org/get-started/previous-versions/ (CUDA version is 10.1).
2. This code also requires freqopttest repo (interpretable nonparametric two-sample test)
to implement ME and SCF tests, which can be installed by
   pip install git+https://github.com/wittawatj/interpretable-test
3. Pickle is required to load fake MNIST datasets (generated by DCGAN), which can be installed by
   pip install pickle-mixin
4. Numpy and Sklearn are also required. Users can install
Python via Anaconda (Python 3.7.3) to obtain both packages. Anaconda
can be found in https://www.anaconda.com/distribution/#download-section .
"""
import argparse
import os
import numpy as np
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision import datasets
from torch.autograd import Variable
import torch.nn as nn
import torch
import pickle
from utils_HD import MatConvert, Pdist2, MMDu, TST_MMD_adaptive_bandwidth, TST_MMD_u, MMDu_linear_kernel, TST_MMD_u_linear_kernel

# Setup seeds (global defaults; the per-trial loop re-seeds before each experiment)
os.makedirs("images", exist_ok=True)
np.random.seed(819)
torch.manual_seed(819)
torch.cuda.manual_seed(819)
torch.backends.cudnn.deterministic = True
is_cuda = True  # NOTE(review): not referenced anywhere in the visible part of this file

# parameters setting
parser = argparse.ArgumentParser()
parser.add_argument("--n_epochs", type=int, default=2000, help="number of epochs of training")
parser.add_argument("--batch_size", type=int, default=100, help="size of the batches")
parser.add_argument("--lr", type=float, default=0.0002, help="adam: learning rate")
parser.add_argument("--img_size", type=int, default=32, help="size of each image dimension")
parser.add_argument("--channels", type=int, default=1, help="number of image channels")
parser.add_argument("--n", type=int, default=200, help="number of samples in one set")
opt = parser.parse_args()
print(opt)
dtype = torch.float
# Derive the device from actual CUDA availability so that `device` always agrees
# with the `cuda` flag (previously `device` was hard-coded to "cuda:0" even when
# `cuda` was False, which made the two settings contradict each other).
cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda else "cpu")
N_per = 100 # permutation times
alpha = 0.05 # test threshold
N1 = opt.n # number of samples in one set
K = 10 # number of trials
N = 100 # number of test sets
N_f = float(N) # number of test sets (float)

# Classification loss used to train the discriminator network
adversarial_loss = nn.CrossEntropyLoss()

# Per-trial records (one slot per trial, K trials in total):
# the values of ep, sigma and sigma0_u stored at the end of each trial.
ep_OPT, s_OPT, s0_OPT = (np.zeros(K) for _ in range(3))
# Rejection rates: one row per method, one column per trial.
Results = np.zeros((4, K))

# Define the deep network for G+C and D+C
class Discriminator(nn.Module):
    """Convolutional classifier whose penultimate features are reused by G+C and D+C.

    The network stacks four stride-2 convolution blocks, flattens the result,
    maps it to a 100-dimensional ReLU feature vector and finally to a
    2-way softmax output.

    The layer sizes depend on the module-level ``opt`` namespace
    (``opt.channels`` and ``opt.img_size``).
    """

    def __init__(self):
        super().__init__()

        def discriminator_block(in_filters, out_filters, bn=True):
            """Return the layers of one stride-2 convolution block as a list."""
            # Dropout2d(0) never drops anything; the original note "#0.25" suggests
            # a non-zero rate was considered at some point.
            block = [nn.Conv2d(in_filters, out_filters, 3, 2, 1), nn.LeakyReLU(0.2, inplace=True), nn.Dropout2d(0)] #0.25
            if bn:
                # NOTE(review): the second positional argument of BatchNorm2d is
                # `eps`, so 0.8 is used as eps here (not as momentum). Left as-is
                # because changing it would change the numerical results.
                block.append(nn.BatchNorm2d(out_filters, 0.8))
            return block

        self.model = nn.Sequential(
            *discriminator_block(opt.channels, 16, bn=False),
            *discriminator_block(16, 32),
            *discriminator_block(32, 64),
            *discriminator_block(64, 128),
        )
        # The height and width of downsampled image (four stride-2 convolutions)
        ds_size = opt.img_size // 2 ** 4
        self.adv_layer = nn.Sequential(
            nn.Linear(128 * ds_size ** 2, 100),
            nn.ReLU())
        # The softmax axis is stated explicitly. The input to this layer is 2-D
        # (batch, 2), for which the deprecated implicit choice was also dim=1, so
        # outputs are unchanged but the per-call deprecation warning disappears.
        # NOTE(review): callers in this file feed these probabilities to
        # CrossEntropyLoss, which applies log-softmax again; kept for
        # reproducibility of the original experiments.
        self.output_layer = nn.Sequential(
            nn.Linear(100, 2),
            nn.Softmax(dim=1))

    def forward(self, img):
        """Return ``(validity, fea)`` for a batch of images.

        ``validity`` is the 2-way softmax output and ``fea`` is the
        100-dimensional feature vector produced by ``adv_layer``.
        """
        out = self.model(img)
        out = out.view(out.shape[0], -1)
        fea = self.adv_layer(out)
        validity = self.output_layer(fea)

        return validity, fea # It is different with Discriminator defined in "Deep_Baselines_MNIST.py"

# Define the deep network for L+J and G+J
class Featurizer(nn.Module):
    """Convolutional feature extractor producing a 100-dimensional vector per image.

    Four stride-2 convolution stages are followed by a single linear layer.
    Layer sizes are taken from the module-level ``opt`` namespace
    (``opt.channels`` and ``opt.img_size``).
    """

    def __init__(self):
        super().__init__()

        # Channel widths of consecutive stages; the first stage has no batch norm.
        widths = [opt.channels, 16, 32, 64, 128]
        layers = []
        for stage, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
            layers.append(nn.Dropout2d(0))  # rate 0: kept only to preserve layer indices
            if stage > 0:
                # 0.8 lands on BatchNorm2d's `eps` parameter (second positional slot).
                layers.append(nn.BatchNorm2d(c_out, eps=0.8))
        self.model = nn.Sequential(*layers)

        # Spatial side length after four halvings of the input image.
        side = opt.img_size // 16
        self.adv_layer = nn.Sequential(nn.Linear(128 * side * side, 100))

    def forward(self, img):
        """Return the feature matrix (one row per image) for a batch of images."""
        conv_maps = self.model(img)
        flattened = conv_maps.view(conv_maps.shape[0], -1)
        return self.adv_layer(flattened)

# Configure data loader
os.makedirs("./data/mnist", exist_ok=True)


def _mnist_split_loader(train, batch_size):
    """Build a shuffled DataLoader over one MNIST split stored in ./data/mnist.

    Images are resized to ``opt.img_size``, converted to tensors and normalized
    with mean 0.5 and std 0.5. The split is downloaded if it is missing.
    """
    preprocessing = transforms.Compose(
        [transforms.Resize(opt.img_size), transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
    )
    split = datasets.MNIST("./data/mnist", train=train, download=True, transform=preprocessing)
    return torch.utils.data.DataLoader(split, batch_size=batch_size, shuffle=True)


# Obtain real MNIST images.
# The batch sizes are presumably chosen so that each split arrives as a single
# batch; if a loader yields several batches, only the last one is kept.
dataloader_FULL = _mnist_split_loader(train=True, batch_size=60000)
for imgs, Labels in dataloader_FULL:
    data_all, label_all = imgs, Labels

dataloader_FULL_te = _mnist_split_loader(train=False, batch_size=10000)
for imgs, Labels in dataloader_FULL_te:
    data_all_te, label_all_te = imgs, Labels

# Load the fake MNIST images (generated by DCGAN) once: the file is the same for
# every trial, and the context manager makes sure the file handle is closed.
# NOTE(security): unpickling can execute arbitrary code; only load a trusted file.
with open('./Fake_MNIST_data_EP100_N10000.pckl', 'rb') as fake_mnist_file:
    Fake_MNIST = pickle.load(fake_mnist_file)

# Repeat experiments K times (K = 10) and report average test power (rejection rate)
for kk in range(K):
    torch.manual_seed(kk * 19 + N1)
    torch.cuda.manual_seed(kk * 19 + N1)
    np.random.seed(seed=1102 * (kk + 10) + N1)
    # Initialize deep networks for L+J and G+J (called featurizer), G+C and D+C (called discriminator)
    featurizer = Featurizer()
    discriminator = Discriminator()
    featurizer_linear_kernel = Featurizer()
    # Initialize parameters
    epsilonOPT = torch.log(MatConvert(np.random.rand(1) * 10 ** (-10), device, dtype))
    epsilonOPT.requires_grad = True
    sigmaOPT = MatConvert(np.ones(1) * np.sqrt(2*32*32), device, dtype)
    sigmaOPT.requires_grad = True
    sigma0OPT = MatConvert(np.ones(1) * np.sqrt(0.005), device, dtype)
    sigma0OPT.requires_grad = True
    print(epsilonOPT.item())
    if cuda:
        featurizer.cuda()
        discriminator.cuda()
        featurizer_linear_kernel.cuda()
        adversarial_loss.cuda()

    # Collect real MNIST images: N1 of the first 4000 images are used for
    # training, the remaining ones of those 4000 form the test pool.
    np.random.seed(seed=819 * (kk + 9) + N1)
    ind_M_all = np.arange(4000)
    ind_M_tr = np.random.choice(4000, N1, replace=False)
    ind_M_te = np.delete(ind_M_all, ind_M_tr)
    train_data = [[data_all[i], label_all[i]] for i in ind_M_tr]

    dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=opt.batch_size,
        shuffle=True,
    )

    # Collect fake MNIST images (same train/test split scheme as for the real images)
    ind_all = np.arange(4000)
    ind_tr = np.random.choice(4000, N1, replace=False)
    ind_te = np.delete(ind_all, ind_tr)
    Fake_MNIST_tr = torch.from_numpy(Fake_MNIST[0][ind_tr])
    Fake_MNIST_te = torch.from_numpy(Fake_MNIST[0][ind_te])
    # For validating type-I error, the original note suggests replacing the fake
    # images above with real ones, i.e.
    # Fake_MNIST_tr = data_all[ind_M_tr_all[N1:]]
    # Fake_MNIST_te = data_all[ind_M_te]
    # NOTE(review): `ind_M_tr_all` is not defined anywhere in this file; it would
    # have to be introduced (e.g. by drawing 2 * N1 training indices) first.

    # Initialize optimizers
    optimizer_F = torch.optim.Adam(list(featurizer.parameters()) + [epsilonOPT] + [sigmaOPT] + [sigma0OPT], lr=opt.lr)
    optimizer_F_linear_kernel = torch.optim.Adam(list(featurizer_linear_kernel.parameters()), lr=opt.lr)
    optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=opt.lr)
    Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

    # -------------------------------------------------------
    #  Training L+J and G+J and deep networks of G+C and D+C
    # -------------------------------------------------------
    np.random.seed(seed=1102)
    torch.manual_seed(1102)
    torch.cuda.manual_seed(1102)
    for epoch in range(opt.n_epochs):
        for i, (imgs, _) in enumerate(dataloader):
            # Pair every real mini-batch with an equally sized random fake mini-batch
            ind = np.random.choice(N1, imgs.shape[0], replace=False)
            Fake_imgs = Fake_MNIST_tr[ind]
            # Class labels: 1 for real images, 0 for fake images
            valid = Tensor(imgs.shape[0], 1).fill_(1.0)
            fake = Tensor(imgs.shape[0], 1).fill_(0.0)
            # Configure input
            real_imgs = imgs.type(Tensor)
            Fake_imgs = Fake_imgs.type(Tensor)
            X = torch.cat([real_imgs, Fake_imgs], 0)
            Y = torch.cat([valid, fake], 0).squeeze().long()

            # -----------
            #  Train G+J
            # -----------
            optimizer_F.zero_grad()
            modelu_output = featurizer(X)
            # Sigmoid of epsilonOPT keeps ep in (0, 1); squaring keeps the bandwidths non-negative
            ep = torch.exp(epsilonOPT) / (1 + torch.exp(epsilonOPT))
            sigma = sigmaOPT ** 2
            sigma0_u = sigma0OPT ** 2
            TEMP = MMDu(modelu_output, imgs.shape[0], X.view(X.shape[0],-1), sigma, sigma0_u, ep, is_smooth=False)
            # Minimizing -value/std maximizes the ratio that is logged as "Stat J"
            mmd_value_temp = -1 * (TEMP[0])
            mmd_std_temp = torch.sqrt(TEMP[1] + 10 ** (-8))
            if mmd_std_temp.item() == 0:
                print('error std!!')
            if np.isnan(mmd_std_temp.item()):
                print('error mmd!!')
            STAT_u = torch.div(mmd_value_temp, mmd_std_temp)
            STAT_u.backward()
            optimizer_F.step()

            # -----------
            #  Train L+J
            # -----------
            optimizer_F_linear_kernel.zero_grad()
            modelu_output_linear = featurizer_linear_kernel(X)
            TEMP_l = MMDu_linear_kernel(modelu_output_linear, imgs.shape[0])
            mmd_value_temp_l = -1 * (TEMP_l[0])
            mmd_std_temp_l = torch.sqrt(TEMP_l[1] + 10 ** (-8))
            if mmd_std_temp_l.item() == 0:
                print('error std!!')
            if np.isnan(mmd_std_temp_l.item()):
                print('error mmd!!')
            STAT_u_l = torch.div(mmd_value_temp_l, mmd_std_temp_l)
            STAT_u_l.backward()
            optimizer_F_linear_kernel.step()

            # ------------------------------------
            #  Train deep network for G+C and D+C
            # ------------------------------------
            optimizer_D.zero_grad()
            loss_C = adversarial_loss(discriminator(X)[0], Y)
            loss_C.backward()
            optimizer_D.step()
            if (epoch+1) % 100 == 0:
                # STAT_u_l belongs to L+J and STAT_u to G+J; the values are listed
                # in the same order as the labels of the format string.
                print(
                    "[Epoch %d/%d] [Batch %d/%d] [CE loss: %f] [Stat J of L+J: %f] [Stat J of G+J: %f]"
                    % (epoch, opt.n_epochs, i, len(dataloader), loss_C.item(), -STAT_u_l.item(), -STAT_u.item())
                )

    # Fetch training data
    s1 = data_all[ind_M_tr]
    s2 = Fake_MNIST_tr.type(Tensor)
    # Move to the configured device instead of calling .cuda() unconditionally
    S = torch.cat([s1.cpu(), s2.cpu()], 0).to(device)
    Sv = S.view(2*N1,-1)

    # Train G+C: only the bandwidth sigma0 is optimized on the discriminator features
    np.random.seed(seed=1102)
    torch.manual_seed(1102)
    torch.cuda.manual_seed(1102)
    S_m_v = discriminator(S)[1].view(2 * N1, -1)
    Dxy = Pdist2(S_m_v[:N1, :], S_m_v[N1:, :])  # NOTE(review): Dxy is not used below
    sigma0 = torch.tensor(2*100) * torch.ones([1]).to(device, dtype)
    sigma0.requires_grad = True
    optimizer_sigma0 = torch.optim.Adam([sigma0], lr=0.001)
    for t in range(2000):
        TEMPa = MMDu(S_m_v, N1, S_m_v, sigma, sigma0, is_smooth=False)
        mmd_value_tempa = -1 * (TEMPa[0] + 10 ** (-8))
        mmd_std_tempa = torch.sqrt(TEMPa[1] + 10 ** (-8))
        if mmd_std_tempa.item() == 0:
            print('error!!')
        if np.isnan(mmd_std_tempa.item()):
            print('error!!')
        STAT_adaptive = torch.div(mmd_value_tempa, mmd_std_tempa)
        optimizer_sigma0.zero_grad()
        # S_m_v is computed once outside this loop, so its graph must be retained
        STAT_adaptive.backward(retain_graph=True)
        optimizer_sigma0.step()
        if t % 100 == 0:
            print("mmd_value: ", -1 * mmd_value_tempa.item(), "mmd_std: ", mmd_std_tempa.item(), "Statistic: ",
                  -1 * STAT_adaptive.item())

    # Train D+C: epsilon and both bandwidths are optimized on the discriminator features
    np.random.seed(seed=1102)
    torch.manual_seed(1102)
    torch.cuda.manual_seed(1102)
    S_m_v = discriminator(S)[1].view(2 * N1, -1)
    epsilonOPT_dc = torch.log(MatConvert(np.random.rand(1) * 10 ** (-10), device, dtype))
    epsilonOPT_dc.requires_grad = True
    sigmaOPT_dc = MatConvert(np.ones(1) * np.sqrt(8 * 32 * 32), device, dtype)
    sigmaOPT_dc.requires_grad = True
    sigma0OPT_dc = MatConvert(np.ones(1) * np.sqrt(2*200), device, dtype)
    sigma0OPT_dc.requires_grad = True
    optimizer_sigma0_dc = torch.optim.Adam([epsilonOPT_dc] + [sigmaOPT_dc] + [sigma0OPT_dc], lr=0.001)
    for t in range(2000):
        ep_dc = torch.exp(epsilonOPT_dc) / (1 + torch.exp(epsilonOPT_dc))
        sigma_dc = sigmaOPT_dc ** 2
        sigma0_dc = sigma0OPT_dc ** 2
        TEMPa_dc = MMDu(S_m_v, N1, Sv, sigma_dc, sigma0_dc, ep_dc)
        mmd_value_tempa_dc = -1 * (TEMPa_dc[0] + 10 ** (-8))
        mmd_std_tempa_dc = torch.sqrt(TEMPa_dc[1] + 10 ** (-8))
        STAT_adaptive_dc = torch.div(mmd_value_tempa_dc, mmd_std_tempa_dc)
        optimizer_sigma0_dc.zero_grad()
        STAT_adaptive_dc.backward(retain_graph=True)
        optimizer_sigma0_dc.step()
        if t % 100 == 0:
            print("mmd_value: ", -1 * mmd_value_tempa_dc.item(), "mmd_std: ", mmd_std_tempa_dc.item(), "Statistic J: ",
                  -1 * STAT_adaptive_dc.item())

    # Run two-sample test on the training set
    # G+C
    h_adaptive, threshold_adaptive, mmd_value_adaptive = TST_MMD_adaptive_bandwidth(S_m_v, N_per, N1, S_m_v, sigma,
                                                                                    sigma0, alpha, device, dtype)
    # G+J
    h_u, threshold_u, mmd_value_u = TST_MMD_u(featurizer(S), N_per, N1, Sv, sigma, sigma0_u, ep, alpha, device, dtype,
                                              is_smooth=False)
    # L+J: must use the network that was trained with the linear-kernel objective
    h_u_l, threshold_u_l, mmd_value_u_l = TST_MMD_u_linear_kernel(featurizer_linear_kernel(S), N_per, N1, alpha,
                                                                  device, dtype)

    # D+C
    h_adaptive_dc, threshold_adaptive_dc, mmd_value_adaptive_dc = TST_MMD_u(S_m_v, N_per, N1, Sv, sigma_dc,
                                                                                    sigma0_dc, ep_dc, alpha,device, dtype)

    # Record best epsilon, sigma and sigma_0
    ep_OPT[kk] = ep.item()
    s_OPT[kk] = sigma.item()
    s0_OPT[kk] = sigma0_u.item()

    # Compute test power of MMD-D and baselines
    # H_*: reject decisions, T_*: thresholds, M_*: MMD values (one entry per test set)
    H_u = np.zeros(N)
    T_u = np.zeros(N)
    M_u = np.zeros(N)
    H_u_l = np.zeros(N)
    T_u_l = np.zeros(N)
    M_u_l = np.zeros(N)
    H_adaptive = np.zeros(N)
    T_adaptive = np.zeros(N)
    M_adaptive = np.zeros(N)
    H_adaptive_dc = np.zeros(N)
    T_adaptive_dc = np.zeros(N)
    M_adaptive_dc = np.zeros(N)
    np.random.seed(1102)
    count_u = 0
    count_adp = 0
    count_u_l = 0
    count_adp_dc = 0
    for k in range(N):
        # Fetch test data
        np.random.seed(seed=1102 * (k + 1) + N1)
        ind_M = np.random.choice(len(ind_M_te), N1, replace=False)
        s1 = data_all[ind_M_te[ind_M]]
        np.random.seed(seed=819 * (k + 3) + N1)
        ind_F = np.random.choice(len(Fake_MNIST_te), N1, replace=False)
        s2 = Fake_MNIST_te[ind_F].type(Tensor)
        S = torch.cat([s1.cpu(), s2.cpu()], 0).to(device)
        Sv = S.view(2 * N1, -1)
        S_m_v = discriminator(S)[1].view(2 * N1, -1)

        # Run two-sample test on the test set
        # G+C
        h_adaptive, threshold_adaptive, mmd_value_adaptive = TST_MMD_adaptive_bandwidth(S_m_v, N_per, N1, S_m_v, sigma,
                                                                                        sigma0, alpha, device, dtype)
        # G+J
        h_u, threshold_u, mmd_value_u = TST_MMD_u(featurizer(S), N_per, N1, Sv, sigma, sigma0_u, ep, alpha, device,
                                                  dtype, is_smooth=False)
        # L+J: evaluate the linear-kernel featurizer (not the G+J featurizer)
        h_u_l, threshold_u_l, mmd_value_u_l = TST_MMD_u_linear_kernel(featurizer_linear_kernel(S), N_per, N1, alpha,
                                                                      device, dtype)

        # D+C
        h_adaptive_dc, threshold_adaptive_dc, mmd_value_adaptive_dc = TST_MMD_u(S_m_v, N_per, N1, Sv, sigma_dc,
                                                                                sigma0_dc, ep_dc, alpha, device, dtype)
        # Gather results
        count_u = count_u + h_u
        count_adp = count_adp + h_adaptive
        count_u_l = count_u_l + h_u_l
        count_adp_dc = count_adp_dc + h_adaptive_dc
        print("L+J:", count_u_l,"G+J:", count_u,"G+C:", count_adp,"D+C:", count_adp_dc)
        H_u[k] = h_u
        T_u[k] = threshold_u
        M_u[k] = mmd_value_u
        H_u_l[k] = h_u_l
        T_u_l[k] = threshold_u_l
        M_u_l[k] = mmd_value_u_l
        H_adaptive[k] = h_adaptive
        T_adaptive[k] = threshold_adaptive
        M_adaptive[k] = mmd_value_adaptive
        H_adaptive_dc[k] = h_adaptive_dc
        T_adaptive_dc[k] = threshold_adaptive_dc
        M_adaptive_dc[k] = mmd_value_adaptive_dc
    print("Reject rate_LJ: ", H_u_l.sum() / N_f, "Reject rate_GJ: ", H_u.sum() / N_f, "Reject rate_GC:",
          H_adaptive.sum() / N_f,
          "Reject rate_DC: ", H_adaptive_dc.sum() / N_f)
    Results[0, kk] = H_u_l.sum() / N_f
    Results[1, kk] = H_u.sum() / N_f
    Results[2, kk] = H_adaptive.sum() / N_f
    Results[3, kk] = H_adaptive_dc.sum() / N_f
    print("Test Power of deep kernel based tests (K times): ")
    print(Results)
    print("Average Test Power of deep kernel based tests (K times): ")
    # Columns of trials that have not run yet are still zero, so dividing the
    # row sums by the number of finished trials gives the running average.
    print(Results.sum(1) / (kk + 1))