diff --git a/cleverhans/future/torch/attacks/__init__.py b/cleverhans/future/torch/attacks/__init__.py
index f411958ac..e10e47751 100644
--- a/cleverhans/future/torch/attacks/__init__.py
+++ b/cleverhans/future/torch/attacks/__init__.py
@@ -3,3 +3,4 @@
 from cleverhans.future.torch.attacks.projected_gradient_descent import projected_gradient_descent
 from cleverhans.future.torch.attacks.noise import noise
 from cleverhans.future.torch.attacks.semantic import semantic
+from cleverhans.future.torch.attacks.deepfool import deepfool
diff --git a/cleverhans/future/torch/attacks/deepfool.py b/cleverhans/future/torch/attacks/deepfool.py
new file mode 100644
index 000000000..abcceb2b9
--- /dev/null
+++ b/cleverhans/future/torch/attacks/deepfool.py
@@ -0,0 +1,171 @@
+"""The DeepFool attack."""
+import numpy as np
+import torch
+from torch.autograd.gradcheck import zero_gradients
+from cleverhans.future.torch.utils import clip_eta
+
+
+def deepfool(model_fn, x, clip_min=-np.inf, clip_max=np.inf,
+             y=None, targeted=False, eps=None, norm=None,
+             num_classes=10, overshoot=0.02, max_iter=50,
+             is_debug=False, sanity_checks=False):
+  """
+  PyTorch implementation of DeepFool (https://arxiv.org/pdf/1511.04599.pdf).
+  :param model_fn: A callable that takes an input tensor and returns the model logits.
+  :param x: Input tensor.
+  :param clip_min: If specified, the minimum input value.
+  :param clip_max: If specified, the maximum input value.
+  :param y: (optional) Tensor with true labels. If targeted is true, then provide the
+            target label. Otherwise, only provide this parameter if you'd like to use
+            true labels when crafting adversarial samples; if it is omitted, model
+            predictions are used as labels to avoid the "label leaking" effect
+            (explained in this paper: https://arxiv.org/abs/1611.01236). Default is None.
+  :param targeted: (optional) bool. Is the attack targeted or untargeted? Untargeted,
+            the default, will try to make the label incorrect. Targeted will instead
+            try to move in the direction of being more like y.
+  :param eps: The size of the maximum perturbation, or None if the perturbation
+            should not be constrained.
+  :param norm: Order of the norm used for eps (mimics NumPy). Possible values: np.inf, 1 or 2.
+  :param num_classes: the untargeted attack considers this many of the closest classes
+            as candidate targets.
+  :param overshoot: used as a termination criterion to prevent vanishing updates.
+  :param max_iter: maximum number of iterations for DeepFool.
+  :param is_debug: If True, print the success rate after each iteration.
+  :param sanity_checks: bool, if True, include asserts (turn them off to use less
+            runtime / memory, or for unit tests that intentionally pass strange input).
+  :return: a tensor for the adversarial example
+  """
+
+  if y is not None and len(x) != len(y):
+    raise ValueError('number of inputs {} is different from number of labels {}'
+                     .format(len(x), len(y)))
+  if y is None:
+    if targeted:
+      raise ValueError('cannot perform a targeted attack without specifying targets y')
+    y = torch.argmax(model_fn(x), dim=1)
+
+  if eps is not None:
+    if eps < 0:
+      raise ValueError(
+          "eps must be greater than or equal to 0, got {} instead".format(eps))
+    if norm not in [np.inf, 1, 2]:
+      raise ValueError('invalid norm')
+    if eps == 0:
+      return x
+
+  if clip_min is not None and clip_max is not None:
+    if clip_min > clip_max:
+      raise ValueError(
+          "clip_min must be less than or equal to clip_max, got clip_min={} and clip_max={}"
+          .format(clip_min, clip_max))
+
+  asserts = []
+
+  # If a data range was specified, check that the input was in that range
+  asserts.append(torch.all(x >= clip_min))
+  asserts.append(torch.all(x <= clip_max))
+
+  # Determine classes to target
+  if targeted:
+    target_classes = y[:, None]
+    y = torch.argmax(model_fn(x), dim=1)
+  else:
+    logits = model_fn(x)
+    logit_indices = torch.arange(
+        logits.size()[1],
+        dtype=y.dtype,
+        device=y.device,
+    )[None, :].expand(y.size()[0], -1)
+    # Number of target classes should be at most number of classes minus 1
+    num_classes = min(num_classes, logits.size()[1] - 1)
+    incorrect_logits = torch.where(
+        logit_indices == y[:, None],
+        torch.full_like(logits, -np.inf),
+        logits,
+    )
+    target_classes = incorrect_logits.argsort(
+        dim=1, descending=True)[:, :num_classes]
+
+  x = x.clone().detach().to(torch.float)
+  perturbations = torch.zeros_like(x)
+
+  if is_debug:
+    print("Starting DeepFool attack")
+
+  for i in range(max_iter):
+    x_adv = x + (1 + overshoot) * perturbations
+    x_adv.requires_grad_(True)
+    zero_gradients(x_adv)
+    logits = model_fn(x_adv)
+
+    # "Live" inputs are still being attacked; others have already achieved misclassification
+    if targeted:
+      live = torch.argmax(logits, dim=1) != target_classes[:, 0]
+    else:
+      live = torch.argmax(logits, dim=1) == y
+    if is_debug:
+      print('Iteration {}: {:.1f}% success'.format(
+          i, 100 * (1 - live.sum().float() / len(live)).item()))
+    if torch.all(~live):
+      # Stop early if all inputs are already misclassified
+      break
+
+    smallest_magnitudes = torch.full((int(live.sum()),), np.inf,
+                                     dtype=torch.float, device=perturbations.device)
+    smallest_perturbation_updates = torch.zeros_like(perturbations[live])
+
+    logits[live, y[live]].sum().backward(retain_graph=True)
+    grads_correct = x_adv.grad.data[live].clone().detach()
+
+    # For each candidate class, estimate the distance to the linearized decision
+    # boundary and keep the smallest update for each live example.
+    for k in range(target_classes.size()[1]):
+      zero_gradients(x_adv)
+
+      logits_target = logits[live, target_classes[live, k]]
+      logits_target.sum().backward(retain_graph=True)
+      grads_target = x_adv.grad.data[live].clone().detach()
+
+      grads_diff = (grads_target - grads_correct).detach()
+      logits_margin = (logits_target - logits[live, y[live]]).detach()
+
+      p = 1 if norm == np.inf else 2
+
+      grads_norm = (grads_diff ** p).abs().sum(dim=list(range(1, len(grads_diff.size())))) \
+          ** (1. / p)
+      magnitudes = logits_margin.abs() / grads_norm
+
+      magnitudes_expanded = magnitudes
+      for _ in range(len(grads_diff.size()) - 1):
+        grads_norm = grads_norm.unsqueeze(-1)
+        magnitudes_expanded = magnitudes_expanded.unsqueeze(-1)
+
+      # 1e-4 is added for numerical stability, as in the reference implementation.
+      if norm == np.inf:
+        perturbation_updates = ((magnitudes_expanded + 1e-4) *
+                                torch.sign(grads_diff))
+      else:
+        perturbation_updates = ((magnitudes_expanded + 1e-4) * grads_diff /
+                                grads_norm)
+
+      smaller = magnitudes < smallest_magnitudes
+      smallest_perturbation_updates[smaller] = perturbation_updates[smaller]
+      smallest_magnitudes[smaller] = magnitudes[smaller]
+
+    all_perturbation_updates = torch.zeros_like(perturbations)
+    all_perturbation_updates[live] = smallest_perturbation_updates
+    perturbations.add_(all_perturbation_updates)
+
+    # Apply the eps and clip constraints to the overshot perturbation, then undo
+    # the overshoot factor so the accumulated perturbation stays consistent.
+    perturbations *= (1 + overshoot)
+    if eps is not None:
+      perturbations = clip_eta(perturbations, norm, eps)
+    perturbations = torch.clamp(x + perturbations, clip_min, clip_max) - x
+    perturbations /= (1 + overshoot)
+
+  x_adv = x + perturbations * (1 + overshoot)
+
+  asserts.append(torch.all(x_adv >= clip_min))
+  asserts.append(torch.all(x_adv <= clip_max))
+
+  if sanity_checks:
+    assert np.all(asserts)
+
+  return x_adv
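As a quick way to exercise the new attack, here is a minimal usage sketch of the deepfool function added above. The tiny linear classifier and the random batch below are illustrative stand-ins, not part of this patch; any callable that returns logits works as model_fn.

import torch
from torch import nn
from cleverhans.future.torch.attacks.deepfool import deepfool

# Illustrative stand-in classifier and data (placeholders, not from this patch).
model_fn = nn.Linear(2, 5)
x = torch.clamp(torch.randn(8, 2), -1., 1.)

# Untargeted, unconstrained DeepFool; labels default to the model's predictions.
x_adv = deepfool(model_fn, x, clip_min=-1., clip_max=1.)

# L2-constrained variant: the accumulated perturbation is clipped to eps each iteration.
x_adv_l2 = deepfool(model_fn, x, eps=0.5, norm=2, clip_min=-1., clip_max=1.)

# Targeted variant: push every example towards class 0.
x_adv_t = deepfool(model_fn, x, y=torch.zeros(8, dtype=torch.long), targeted=True,
                   clip_min=-1., clip_max=1.)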
+ """ + + def __init__(self): + super(SimpleImageModel, self).__init__() + self.l1 = nn.Linear(2, 10) + self.l2 = nn.Linear(10, 5) + + def forward(self, x): + if len(x.size()) == 4: + x = x[:, 0, 0] + elif len(x.size()) == 3: + x = x[None, 0, 0] + x = self.l1(x) + x = F.relu(x) + x = self.l2(x) + return x + + +class TestDeepFool(CommonAttackProperties): + + def setUp(self): + super(TestDeepFool, self).setUp() + self.attack = deepfool + self.attack_param = { + 'clip_min' : -5, + 'clip_max' : 5, + } + + def test_invalid_input(self): + x = torch.tensor([[-2., 3.]]) + for norm in self.ord_list: + self.assertRaises( + AssertionError, self.attack, model_fn=self.model, x=x, eps=.1, + norm=norm, clip_min=-1., clip_max=1., sanity_checks=True) + + def test_invalid_eps(self): + for norm in self.ord_list: + self.assertRaises( + ValueError, self.attack, model_fn=self.model, + x=self.x, eps=-.1, norm=norm) + + def test_eps_equals_zero(self): + for norm in self.ord_list: + self.assertClose( + self.attack(model_fn=self.model, x=self.x, eps=0, norm=norm), + self.x) + + def test_max_iter_equals_zero(self): + for norm in self.ord_list: + self.assertClose( + self.attack( + model_fn=self.model, x=self.x, eps=.5, norm=norm, max_iter=0), + self.x) + + def test_invalid_clips(self): + clip_min = .5 + clip_max = -.5 + for norm in self.ord_list: + self.assertRaises( + ValueError, self.attack, model_fn=self.model, x=self.x, eps=.1, + norm=norm, clip_min=clip_min, clip_max=clip_max) + + def test_adv_example_success_rate_linf(self): + self.help_adv_examples_success_rate( + norm=np.inf, **self.attack_param) + + def test_targeted_adv_example_success_rate_linf(self): + self.help_targeted_adv_examples_success_rate( + norm=np.inf, **self.attack_param) + + def test_adv_example_success_rate_l1(self): + self.help_adv_examples_success_rate( + norm=1, **self.attack_param) + + def test_targeted_adv_example_success_rate_l1(self): + self.help_targeted_adv_examples_success_rate( + norm=1, **self.attack_param) + + def test_adv_example_success_rate_l2(self): + self.help_adv_examples_success_rate( + norm=2, **self.attack_param) + + def test_targeted_adv_example_success_rate_l2(self): + self.help_targeted_adv_examples_success_rate( + norm=2, **self.attack_param) + + def test_do_not_reach_lp_boundary(self): + for norm in self.ord_list: + x_adv = self.attack( + model_fn=self.model, x=self.normalized_x, eps=.5, norm=norm) + + if norm == np.inf: + delta, _ = torch.abs(x_adv - self.normalized_x).max(dim=1) + elif norm == 1: + delta = torch.abs(x_adv - self.normalized_x).sum(dim=1) + elif norm == 2: + delta = torch.pow(x_adv - self.normalized_x, 2).sum(dim=1).pow(.5) + diff = torch.max(.5 - delta) + self.assertTrue(diff > .25) + + def test_attack_strength(self): + x_adv = self.attack( + model_fn=self.model, x=self.normalized_x, eps=1., + norm=np.inf, clip_min=.5, clip_max=.7, + sanity_checks=False) + _, ori_label = self.model(self.normalized_x).max(1) + _, adv_label = self.model(x_adv).max(1) + adv_acc = ( + adv_label.eq(ori_label).sum().to(torch.float) + / self.normalized_x.size(0)) + self.assertLess(adv_acc, .1) + + def test_eps(self): + # test if the attack respects the norm constraint + # NOTE clip_eta makes sure that at each step, adv_x respects the eps + # norm constraint. 
+    # norm constraint. Therefore, this is essentially a test on clip_eta,
+    # which is covered separately by test_clip_eta
+    raise SkipTest()
+
+  def test_clip_eta(self):
+    # NOTE: this has been tested with test_clip_eta in test_utils
+    raise SkipTest()
+
+  def test_clips(self):
+    clip_min = -1.
+    clip_max = 1.
+    for norm in self.ord_list:
+      x_adv = self.attack(
+          model_fn=self.model, x=self.normalized_x, eps=.3,
+          norm=norm, clip_min=clip_min, clip_max=clip_max)
+      self.assertTrue(torch.all(x_adv <= clip_max))
+      self.assertTrue(torch.all(x_adv >= clip_min))
+
+  def test_multiple_initial_random_step(self):
+    _, ori_label = self.model(self.normalized_x).max(1)
+    new_label_multi = ori_label.clone().detach()
+
+    for _ in range(10):
+      x_adv = self.attack(
+          model_fn=self.model, x=self.normalized_x, eps=.5,
+          norm=np.inf, clip_min=.5, clip_max=.7, sanity_checks=False)
+      _, new_label = self.model(x_adv).max(1)
+
+      # examples for which we have not found adversarial examples
+      i = ori_label.eq(new_label_multi)
+      new_label_multi[i] = new_label[i]
+
+    failed_attack = (
+        ori_label.eq(new_label_multi).sum().to(torch.float)
+        / self.normalized_x.size(0))
+    self.assertLess(failed_attack, .5)
+
+  def test_matches_reference(self):
+    model = SimpleImageModel()
+    x_adv = self.attack(model_fn=model, x=self.x[:, None, None, :])
+    for image, adv_image in zip(self.x, x_adv):
+      image = image[None, None, :]
+      _, _, _, _, pert_image = TestDeepFool.reference_deepfool(
+          image, model, num_classes=5)
+      assert torch.norm(adv_image - pert_image) < 1e-4, (adv_image, pert_image)
+
+  @staticmethod
+  def reference_deepfool(image, net, num_classes=10, overshoot=0.02, max_iter=50):
+    """
+    Reference implementation of DeepFool from the original authors at
+    https://github.com/LTS4/DeepFool.
+    :param image: Image of size HxWx3
+    :param net: network (input: images, output: values of activation **BEFORE** softmax).
+    :param num_classes: num_classes (limits the number of classes to test against, by default = 10)
+    :param overshoot: used as a termination criterion to prevent vanishing updates (default = 0.02).
+    :param max_iter: maximum number of iterations for DeepFool (default = 50)
+    :return: minimal perturbation that fools the classifier, number of iterations that
+        it required, new estimated label and perturbed image
+    """
+    is_cuda = torch.cuda.is_available()
+
+    if is_cuda:
+      print("Using GPU")
+      image = image.cuda()
+      net = net.cuda()
+    else:
+      print("Using CPU")
+
+    f_image = net.forward(Variable(
+        image[None, :, :, :], requires_grad=True)).data.cpu().numpy().flatten()
+    I = (np.array(f_image)).flatten().argsort()[::-1]
+
+    I = I[0:num_classes]
+    label = I[0]
+
+    input_shape = image.cpu().numpy().shape
+    pert_image = copy.deepcopy(image)
+    w = np.zeros(input_shape)
+    r_tot = np.zeros(input_shape)
+
+    loop_i = 0
+
+    x = Variable(pert_image[None, :], requires_grad=True)
+    fs = net.forward(x)
+    fs_list = [fs[0, I[k]] for k in range(num_classes)]
+    k_i = label
+
+    while k_i == label and loop_i < max_iter:
+
+      pert = np.inf
+      fs[0, I[0]].backward(retain_graph=True)
+      grad_orig = x.grad.data.cpu().numpy().copy()
+
+      for k in range(1, num_classes):
+        zero_gradients(x)
+
+        fs[0, I[k]].backward(retain_graph=True)
+        cur_grad = x.grad.data.cpu().numpy().copy()
+
+        # set new w_k and new f_k
+        w_k = cur_grad - grad_orig
+        f_k = (fs[0, I[k]] - fs[0, I[0]]).data.cpu().numpy()
+
+        pert_k = abs(f_k) / np.linalg.norm(w_k.flatten())
+
+        # determine which w_k to use
+        if pert_k < pert:
+          pert = pert_k
+          w = w_k
+
+      # compute r_i and r_tot
+      # Added 1e-4 for numerical stability
+      r_i = (pert + 1e-4) * w / np.linalg.norm(w)
+      r_tot = np.float32(r_tot + r_i)
+
+      if is_cuda:
+        pert_image = image + (1 + overshoot) * \
+            torch.from_numpy(r_tot).cuda()
+      else:
+        pert_image = image + (1 + overshoot) * torch.from_numpy(r_tot)
+
+      x = Variable(pert_image, requires_grad=True)
+      fs = net.forward(x)
+      k_i = np.argmax(fs.data.cpu().numpy().flatten())
+
+      loop_i += 1
+
+    r_tot = (1 + overshoot) * r_tot
+
+    return r_tot, loop_i, label, k_i, pert_image
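Both the batched attack in deepfool.py and the reference implementation above rely on the same closed form: for a linear classifier, the smallest L2 perturbation that moves x onto the decision boundary between the current class y and a candidate class k has magnitude |f_k(x) - f_y(x)| / ||w_k - w_y|| and points along w_k - w_y. A small self-contained check of that formula on a hand-built linear model (the tensors below are illustrative only, not part of the patch):

import torch

# Hand-built two-class linear "classifier": logits = W x + b (illustrative only).
W = torch.tensor([[1., 0.], [0., 1.]])
b = torch.tensor([0.5, 0.])
x = torch.tensor([1., 0.])      # classified as class 0 (logits 1.5 vs 0.0)

logits = W @ x + b
y, k = 0, 1
w_diff = W[k] - W[y]            # gradient difference for a linear model
f_diff = logits[k] - logits[y]

# Closed-form minimal L2 perturbation onto the y/k decision boundary.
r = (f_diff.abs() / w_diff.norm() ** 2) * w_diff

new_logits = W @ (x + r) + b
assert torch.isclose(new_logits[0], new_logits[1])  # x + r sits on the boundary

This is the quantity the reference code computes as pert_k and r_i, and the batched code computes as magnitudes and perturbation_updates, up to the 1e-4 stabilizer and the overshoot factor.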
diff --git a/cleverhans/future/torch/tests/test_utils.py b/cleverhans/future/torch/tests/test_utils.py
index a4ee73150..f42b4cf43 100644
--- a/cleverhans/future/torch/tests/test_utils.py
+++ b/cleverhans/future/torch/tests/test_utils.py
@@ -94,13 +94,9 @@ def test_clip_eta_linf(self):
     self.assertTrue(torch.all(clipped >= -.5))
 
   def test_clip_eta_l1(self):
-    self.assertRaises(
-        NotImplementedError, self.clip_eta, eta=self.rand_eta, norm=1, eps=.5)
-
-    # TODO uncomment the actual test below after we have implemented the L1 attack
-    # clipped = self.clip_eta(eta=self.rand_eta, norm=1, eps=.5)
-    # norm = clipped.abs().sum(dim=self.red_ind)
-    # self.assertTrue(torch.all(norm <= .5001))
+    clipped = self.clip_eta(eta=self.rand_eta, norm=1, eps=.5)
+    norm = clipped.abs().sum(dim=self.red_ind)
+    self.assertTrue(torch.all(norm <= .5001))
 
   def test_clip_eta_l2(self):
     clipped = self.clip_eta(eta=self.rand_eta, norm=2, eps=.5)
diff --git a/cleverhans/future/torch/utils.py b/cleverhans/future/torch/utils.py
index 9e33abea5..dff5923ed 100644
--- a/cleverhans/future/torch/utils.py
+++ b/cleverhans/future/torch/utils.py
@@ -21,22 +21,7 @@ def clip_eta(eta, norm, eps):
   if norm == np.inf:
     eta = torch.clamp(eta, -eps, eps)
   else:
-    if norm == 1:
-      raise NotImplementedError("L1 clip is not implemented.")
-      norm = torch.max(
-          avoid_zero_div,
-          torch.sum(torch.abs(eta), dim=reduc_ind, keepdim=True)
-      )
-    elif norm == 2:
-      norm = torch.sqrt(torch.max(
-          avoid_zero_div,
-          torch.sum(eta ** 2, dim=reduc_ind, keepdim=True)
-      ))
-    factor = torch.min(
-        torch.tensor(1., dtype=eta.dtype, device=eta.device),
-        eps / norm
-    )
-    eta *= factor
+    eta = torch.renorm(eta, p=norm, dim=0, maxnorm=eps)
   return eta
 
 
 def get_or_guess_labels(model, x, **kwargs):
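The clip_eta change above leans on torch.renorm, which constrains the p-norm of each sub-tensor along dim=0 (here, each example's perturbation) to maxnorm and leaves sub-tensors already inside the ball untouched, so a single call now covers both the L1 and L2 cases. A quick sanity check of that behaviour (the tensors below are illustrative only):

import torch
from cleverhans.future.torch.utils import clip_eta

eta = torch.randn(4, 3, 8, 8)

for norm in (1, 2):
    clipped = clip_eta(eta, norm=norm, eps=0.5)
    per_example = clipped.flatten(1).norm(p=norm, dim=1)
    assert torch.all(per_example <= 0.5001)   # each example's eta respects eps

# Perturbations already inside the eps-ball are left unchanged by torch.renorm.
small = 0.01 * torch.randn(4, 3, 8, 8)
assert torch.allclose(clip_eta(small.clone(), norm=2, eps=0.5), small)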