Source code for ares.attack.deepfool

import torch
import numpy as np
from ares.utils.registry import registry

@registry.register_attack('deepfool')
class DeepFool(object):
    '''DeepFool. A white-box iterative optimization method. It needs to
    calculate the Jacobian of the logits with respect to the input, so it
    only applies to tasks with a small number of classes.

    Example:
        >>> from ares.utils.registry import registry
        >>> attacker_cls = registry.get_attack('deepfool')
        >>> attacker = attacker_cls(model)
        >>> adv_images = attacker(images, labels)

    - Supported distance metrics: 2, np.inf.
    - References: https://arxiv.org/abs/1511.04599.
    '''
    def __init__(self, model, device='cuda', norm=np.inf, overshoot=0.02, max_iter=50, target=False):
        '''
        Args:
            model (torch.nn.Module): The target model to be attacked.
            device (torch.device): The device to perform the attack. Defaults to 'cuda'.
            norm (float): The norm of distance calculation for the adversarial constraint. Defaults to np.inf.
            overshoot (float): The overshoot parameter. Defaults to 0.02.
            max_iter (int): The maximum number of iterations. Defaults to 50.
            target (bool): Conduct targeted/untargeted attack. Defaults to False.
        '''
        self.overshoot = overshoot
        self.max_iter = max_iter
        self.net = model
        self.norm = norm
        self.device = device
        self.target = target
        self.min_value = 0
        self.max_value = 1
        if self.target:
            raise AssertionError('DeepFool does not support targeted attack')
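    # A sketch of the math deepfool() implements (following the reference
    # paper): around the current point, the boundary between the predicted
    # class c and another class k is linearized as
    #     f_k(x) - f_c(x) + (grad f_k(x) - grad f_c(x))^T r = 0,
    # so the smallest step (measured in the attack norm) that reaches it is
    #     pert_k = |f_k(x) - f_c(x)| / ||grad f_k(x) - grad f_c(x)||_q,
    # where q is the dual norm (q=1 for the l_inf attack, q=2 for l_2). Each
    # iteration takes the minimal such step over all k != c and repeats until
    # the predicted class changes.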
    def deepfool(self, x, y):
        '''Run DeepFool on a single image (batch size 1).'''
        with torch.no_grad():
            logits = self.net(x)
            outputs = torch.argmax(logits, dim=1)
            # Already misclassified: nothing to do.
            if outputs != y:
                return x
        self.nb_classes = logits.size(-1)
        adv_x = x.clone().detach().requires_grad_()
        iteration = 0
        logits = self.net(adv_x)
        current = logits.max(1)[1].item()
        original = logits.max(1)[1].item()
        noise = torch.zeros(x.size()).to(self.device)
        w = torch.zeros(x.size()).to(self.device)

        while current == original and iteration < self.max_iter:
            gradients_0 = torch.autograd.grad(logits[0, current], [adv_x], retain_graph=True)[0].detach()
            # Initialize the minimal perturbation once per outer step, not per
            # class, so the minimum over k is actually tracked.
            pert = np.inf
            for k in range(self.nb_classes):
                if k == current:
                    continue
                gradients_1 = torch.autograd.grad(logits[0, k], [adv_x], retain_graph=True)[0].detach()
                w_k = gradients_1 - gradients_0
                f_k = logits[0, k] - logits[0, current]
                # Distance to the linearized boundary of class k, using the
                # dual norm (l_1 for the l_inf attack, l_2 for the l_2 attack);
                # the small constant guards against numerical instability.
                if self.norm == np.inf:
                    pert_k = (torch.abs(f_k) + 1e-5) / torch.norm(w_k.flatten(1), 1, -1)
                elif self.norm == 2:
                    pert_k = (torch.abs(f_k) + 1e-5) / torch.norm(w_k.flatten(1), 2, -1)
                if pert_k < pert:
                    pert = pert_k
                    w = w_k
            # Step just past the closest boundary.
            if self.norm == np.inf:
                r_i = (pert + 1e-4) * w.sign()
            elif self.norm == 2:
                r_i = (pert + 1e-4) * w / torch.norm(w.flatten(1), 2, -1)
            noise += r_i.clone()
            adv_x = torch.clamp(x + noise, self.min_value, self.max_value).requires_grad_()
            logits = self.net(adv_x)
            current = logits.max(1)[1].item()
            iteration += 1
        # Apply the overshoot to push the point clearly across the boundary.
        adv_x = torch.clamp((1 + self.overshoot) * noise + x, self.min_value, self.max_value)
        return adv_x
    def __call__(self, images=None, labels=None, target_labels=None):
        '''Perform the attack on target images with their corresponding labels.

        Args:
            images (torch.Tensor): The images to be attacked. The images should be torch.Tensor with shape [N, C, H, W] and range [0, 1].
            labels (torch.Tensor): The corresponding labels of the images. The labels should be torch.Tensor with shape [N, ].
            target_labels (torch.Tensor): Not used in DeepFool and should be None.

        Returns:
            torch.Tensor: Adversarial images with value range [0, 1].
        '''
        assert target_labels is None, 'Target attack is not supported for DeepFool.'
        adv_images = []
        # DeepFool operates on one image at a time, so iterate over the batch.
        for i in range(len(images)):
            adv_x = self.deepfool(images[i].unsqueeze(0), labels[i].unsqueeze(0))
            adv_images.append(adv_x)
        adv_images = torch.cat(adv_images, 0)
        return adv_images
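
# Minimal usage sketch (not part of the original module): the toy classifier
# below is a hypothetical stand-in; any torch.nn.Module producing logits for
# inputs in [0, 1] works the same way.
if __name__ == '__main__':
    import torch.nn as nn

    model = nn.Sequential(
        nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
        nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10),
    ).to('cuda').eval()

    attacker_cls = registry.get_attack('deepfool')
    attacker = attacker_cls(model, device='cuda', norm=2)

    images = torch.rand(4, 3, 32, 32, device='cuda')  # stand-in batch in [0, 1]
    with torch.no_grad():
        labels = model(images).argmax(dim=1)          # attack the model's own predictions
    adv_images = attacker(images, labels)
    print((adv_images - images).flatten(1).norm(dim=1))  # per-image l_2 perturbation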