# RPM_seg/mmseg/models/uda/dacs.py

# Obtained from: https://github.com/lhoyer/DAFormer
# Modifications:
# - Delete tensors after usage to free GPU memory
# - Add HRDA debug visualizations
# - Support ImageNet feature distance for LR and HR predictions of HRDA
# - Add masked image consistency
# - Update debug image system
# ---------------------------------------------------------------
# Copyright (c) 2021-2022 ETH Zurich, Lukas Hoyer. All rights reserved.
# Licensed under the Apache License, Version 2.0
# ---------------------------------------------------------------

# The ema model update and the domain-mixing are based on:
# https://github.com/vikolss/DACS
# Copyright (c) 2020 vikolss. Licensed under the MIT License.
# A copy of the license is available at resources/license_dacs

import math
import os
import random
from copy import deepcopy


import mmcv
import numpy as np
import torch
from matplotlib import pyplot as plt
from timm.models.layers import DropPath
from torch.nn import functional as F
from torch.nn.modules.dropout import _DropoutNd

from mmseg.core import add_prefix
from mmseg.models import UDA, HRDAEncoderDecoder, build_segmentor
from mmseg.models.segmentors.hrda_encoder_decoder import crop
from mmseg.models.uda.masking_consistency_module import \
    MaskingConsistencyModule
from mmseg.models.uda.uda_decorator import UDADecorator, get_module
from mmseg.models.utils.dacs_transforms import (denorm, get_class_masks,
                                                get_mean_std, strong_transform,get_rare_class_mask)
from mmseg.models.utils.visualization import prepare_debug_out, subplotimg
from mmseg.utils.utils import downscale_label_ratio
import os.path as osp
import json
from mmseg.ops import resize
from mmcv.runner import BaseModule, auto_fp16, force_fp32
from mmseg.models.uda.prototype_dist_estimator import prototype_dist_estimator
import gc

def _params_equal(ema_model, model):
    """Return True when both models hold identical parameter values.

    The parameters are paired positionally in ``named_parameters()`` order
    and compared with ``torch.equal``; the comparison stops at the first
    mismatch.
    """
    paired = zip(ema_model.named_parameters(), model.named_parameters())
    return all(
        torch.equal(ema_entry[1].data, entry[1].data)
        for ema_entry, entry in paired)

# Compute the magnitude of a set of gradients under the norm selected by
# ``norm_type``. Can be used to judge how significant the gradients are or for
# operations such as gradient regularization.
def calc_grad_magnitude(grads, norm_type=2.0):
    """Return the overall norm of an iterable of gradient tensors.

    Args:
        grads: Iterable of tensors.
        norm_type: Order of the norm. ``math.inf`` selects the largest
            absolute entry over all tensors; any other value computes the
            per-tensor norm first and then the norm of those norms.

    Returns:
        Tensor: The resulting scalar norm.
    """
    order = float(norm_type)
    if order == math.inf:
        return max(g.abs().max() for g in grads)

    per_tensor_norms = [torch.norm(g, order) for g in grads]
    return torch.norm(torch.stack(per_tensor_norms), order)

# Get the classes of the source dataset together with their (rare-class)
# sampling probabilities and pixel frequencies.
def get_rcs_class_probs(data_root, temperature):
    """Derive class sampling probabilities from ``sample_class_stats.json``.

    The file ``<data_root>/sample_class_stats.json`` is expected to hold a
    list of per-sample dicts with a ``'file'`` entry plus one count per class
    id. Counts are summed per class, the classes are ordered by ascending
    total count, and the normalized frequencies ``f`` are turned into
    probabilities via ``softmax((1 - f) / temperature)``.

    Args:
        data_root (str): Directory containing ``sample_class_stats.json``.
        temperature (float): Softmax temperature.

    Returns:
        tuple: ``(classes, probs, freq)`` where ``classes`` is a list of
            integer class ids, ``probs`` is a NumPy array of sampling
            probabilities and ``freq`` is a tensor with the normalized
            frequencies, all in the same order.
    """
    stats_path = osp.join(data_root, 'sample_class_stats.json')
    with open(stats_path, 'r') as stats_file:
        per_sample_stats = json.load(stats_file)

    # Accumulate the per-class counts over all samples.
    totals = {}
    for sample in per_sample_stats:
        sample.pop('file')
        for class_id, count in sample.items():
            class_id = int(class_id)
            if class_id in totals:
                totals[class_id] += count
            else:
                totals[class_id] = count

    # Order the classes from the least to the most frequent one.
    ordered = sorted(totals.items(), key=lambda entry: entry[1])
    classes = [class_id for class_id, _ in ordered]
    counts = [count for _, count in ordered]

    counts_tensor = torch.tensor(counts)
    original_freq = counts_tensor / torch.sum(counts_tensor)
    probs = torch.softmax((1 - original_freq) / temperature, dim=-1)

    return classes, probs.numpy(), original_freq


@UDA.register_module()
class DACS(UDADecorator):

    def __init__(self, **cfg):
        """Set up the DACS wrapper from its keyword configuration.

        ``cfg`` is forwarded to the parent constructor and is additionally
        read for the hyper-parameters stored on the instance and for the
        ``model`` config from which the auxiliary networks are built.

        Raises:
            KeyError: If a required entry is missing from ``cfg``.
            AssertionError: If ``dist_mode`` is not one of
                ``['global', 'local']``, ``['global']`` or ``['local']``, or
                if ``mix`` is not ``'class'``.
        """
        super(DACS, self).__init__(**cfg)
        self.local_iter = 0

        # Self-training and pseudo-label settings.
        self.max_iters = cfg['max_iters']
        self.source_only = cfg['source_only']
        self.alpha = cfg['alpha']
        self.pseudo_threshold = cfg['pseudo_threshold']
        self.psweight_ignore_top = cfg['pseudo_weight_ignore_top']
        self.psweight_ignore_bottom = cfg['pseudo_weight_ignore_bottom']

        # ImageNet feature distance; active only for a positive lambda.
        self.fdist_lambda = cfg['imnet_feature_dist_lambda']
        self.fdist_classes = cfg['imnet_feature_dist_classes']
        self.fdist_scale_min_ratio = cfg['imnet_feature_dist_scale_min_ratio']
        self.enable_fdist = self.fdist_lambda > 0

        # Mixing and strong-augmentation settings.
        self.mix = cfg['mix']
        self.blur = cfg['blur']
        self.color_jitter_s = cfg['color_jitter_strength']
        self.color_jitter_p = cfg['color_jitter_probability']

        # Masking settings; masking is active whenever a mask mode is given.
        self.mask_mode = cfg['mask_mode']
        self.enable_masking = self.mask_mode is not None

        self.print_grad_magnitude = cfg['print_grad_magnitude']
        self.rare_class_mix = cfg['rare_class_mix']
        self.class_num = cfg['class_num']
        self.mask_type = cfg['mask_type']
        self.dist_mode = cfg['dist_mode']
        self.topk = cfg['topk']

        assert self.dist_mode in (['global', 'local'], ['global'], ['local'])
        assert self.mix == 'class'

        self.debug_fdist_mask, self.debug_gt_rescale = None, None

        self.data_root = cfg['data_root']
        self.rcs_class_temp = cfg['rcs_class_temp']
        self.mask_block_size = cfg['mask_block_size']

        # Rare-class statistics are not known yet at construction time.
        self.rcs_classes, self.rcs_classprob, self.freq = None, None, None

        self.class_probs = {}

        # Auxiliary networks. The registration order is kept as
        # ema_model -> mic -> imnet_model -> feat_estimator.
        ema_cfg = deepcopy(cfg['model'])
        if not self.source_only:
            self.ema_model = build_segmentor(ema_cfg)
        self.mic = (
            MaskingConsistencyModule(require_teacher=False, cfg=cfg)
            if self.enable_masking else None)
        self.imnet_model = (
            build_segmentor(deepcopy(cfg['model']))
            if self.enable_fdist else None)

        # NOTE(review): this was guarded by
        # `self.mask_type == 'proto' or 'proto_prob'`, which is always truthy
        # because the non-empty string 'proto_prob' is truthy. The estimator
        # is therefore built for every mask_type, and that behaviour is kept
        # here since code outside this method may rely on the attribute.
        self.feat_estimator = prototype_dist_estimator(
            feature_num=256, cfg=cfg)

    def get_ema_model(self):
        """Return ``self.ema_model`` passed through ``get_module``."""
        ema_model = self.ema_model
        return get_module(ema_model)

    def get_imnet_model(self):
        """Return ``self.imnet_model`` passed through ``get_module``."""
        imnet_model = self.imnet_model
        return get_module(imnet_model)

    def _init_ema_weights(self):
        if self.source_only:
            return
        for param in self.get_ema_model().parameters():
            param.detach_()
        mp = list(self.get_model().parameters())
        mcp = list(self.get_ema_model().parameters())
        for i in range(0, len(mp)):
            if not mcp[i].data.shape:  # scalar tensor
                mcp[i].data = mp[i].data.clone()
            else:
                mcp[i].data[:] = mp[i].data[:].clone()

    def _update_ema(self, iter):
        if self.source_only:
            return
        alpha_teacher = min(1 - 1 / (iter + 1), self.alpha)
        for ema_param, param in zip(self.get_ema_model().parameters(),
                                    self.get_model().parameters()):
            if not param.data.shape:  # scalar tensor
                ema_param.data = \
                    alpha_teacher * ema_param.data + \
                    (1 - alpha_teacher) * param.data
            else:
                ema_param.data[:] = \
                    alpha_teacher * ema_param[:].data[:] + \
                    (1 - alpha_teacher) * param[:].data[:]

    def train_step(self, data_batch, optimizer, **kwargs):
        """The iteration step during training.

        This method defines an iteration step during training, except for the
        back propagation and optimizer updating, which are done in an optimizer
        hook. Note that in some complicated cases or models, the whole process
        including back propagation and optimizer updating is also defined in
        this method, such as GAN.

        Args:
            data (dict): The output of dataloader.
            optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
                runner is passed to ``train_step()``. This argument is unused
                and reserved.

        Returns:
            dict: It should contain at least 3 keys: ``loss``, ``log_vars``,
                ``num_samples``.
                ``loss`` is a tensor for back propagation, which can be a
                weighted sum of multiple losses.
                ``log_vars`` contains all the variables to be sent to the
                logger.
                ``num_samples`` indicates the batch size (when the model is
                DDP, it means the batch size on each GPU), which is used for
                averaging the logs.
        """

        optimizer.zero_grad()
        log_vars = self(**data_batch)
        optimizer.step()

        log_vars.pop('loss', None)  # remove the unnecessary 'loss'
        outputs = dict(
            log_vars=log_vars, num_samples=len(data_batch['img_metas']))
        return outputs

    def masked_feat_dist(self, f1, f2, mask=None):
        feat_diff = f1 - f2
        # mmcv.print_log(f'fdiff: {feat_diff.shape}', 'mmseg')
        pw_feat_dist = torch.norm(feat_diff, dim=1, p=2)
        # mmcv.print_log(f'pw_fdist: {pw_feat_dist.shape}', 'mmseg')
        if mask is not None:
            # mmcv.print_log(f'fd mask: {mask.shape}', 'mmseg')
            pw_feat_dist = pw_feat_dist[mask.squeeze(1)]
            # mmcv.print_log(f'fd masked: {pw_feat_dist.shape}', 'mmseg')
        # If the mask is empty, the mean will be NaN. However, as there is
        # no connection in the compute graph to the network weights, the
        # network gradients are zero and no weight update will happen.
        # This can be verified with print_grad_magnitude.
        return torch.mean(pw_feat_dist)

    def calc_feat_dist(self, img, gt, feat=None):
        """Compute the ImageNet feature-distance loss.

        The features of the trained model (``feat``) are compared against
        the features that the ImageNet model extracts from ``img``. If
        ``self.fdist_classes`` is set, the comparison is restricted to
        positions whose downscaled label belongs to one of those classes.
        The masks and rescaled labels are stored in ``debug_fdist_mask`` and
        ``debug_gt_rescale``.

        Args:
            img (Tensor): Input images fed to the ImageNet model.
            gt (Tensor): Segmentation labels belonging to ``img``.
            feat: Features of the trained model for ``img``.

        Returns:
            tuple: ``(feat_loss, feat_log)`` as produced by
                ``_parse_losses``, with the ``'loss'`` entry removed from
                the log.

        Raises:
            AssertionError: If the feature distance is disabled.
            NotImplementedError: In the multi-scale (HRDA) branch when
                ``self.fdist_classes`` is None.
        """
        assert self.enable_fdist
        # Features from multiple input scales (see HRDAEncoderDecoder)
        if isinstance(self.get_model(), HRDAEncoderDecoder) and \
                self.get_model().feature_scale in \
                self.get_model().feature_scale_all_strs:
            lay = -1
            feat = [f[lay] for f in feat]
            with torch.no_grad():
                self.get_imnet_model().eval()
                feat_imnet = self.get_imnet_model().extract_feat(img)
                feat_imnet = [f[lay].detach() for f in feat_imnet]
            feat_dist = 0
            for s in range(len(feat_imnet)):
                if self.fdist_classes is not None:
                    fdclasses = torch.tensor(
                        self.fdist_classes, device=gt.device)
                    gt_rescaled = gt.clone()
                    # Crop the labels with the crop box recorded for this
                    # scale, if there is one.
                    if s in HRDAEncoderDecoder.last_train_crop_box:
                        gt_rescaled = crop(
                            gt_rescaled,
                            HRDAEncoderDecoder.last_train_crop_box[s])
                    scale_factor = gt_rescaled.shape[-1] // feat[s].shape[-1]
                    gt_rescaled = downscale_label_ratio(
                        gt_rescaled, scale_factor, self.fdist_scale_min_ratio,
                        self.num_classes, 255).long().detach()
                    fdist_mask = torch.any(gt_rescaled[..., None] == fdclasses,
                                           -1)
                    # The former `n_feat_nonzero` counter was never read; its
                    # `fd_s != 0` test only forced the tensor to be evaluated
                    # on the host, so it has been removed.
                    feat_dist += self.masked_feat_dist(
                        feat[s], feat_imnet[s], fdist_mask)
                    if s == 0:
                        self.debug_fdist_mask = fdist_mask
                        self.debug_gt_rescale = gt_rescaled
                else:
                    raise NotImplementedError
        else:
            with torch.no_grad():
                self.get_imnet_model().eval()
                feat_imnet = self.get_imnet_model().extract_feat(img)
                feat_imnet = [f.detach() for f in feat_imnet]
            lay = -1
            if self.fdist_classes is not None:
                fdclasses = torch.tensor(self.fdist_classes, device=gt.device)
                scale_factor = gt.shape[-1] // feat[lay].shape[-1]
                gt_rescaled = downscale_label_ratio(gt, scale_factor,
                                                    self.fdist_scale_min_ratio,
                                                    self.num_classes,
                                                    255).long().detach()
                fdist_mask = torch.any(gt_rescaled[..., None] == fdclasses, -1)
                feat_dist = self.masked_feat_dist(feat[lay], feat_imnet[lay],
                                                  fdist_mask)
                self.debug_fdist_mask = fdist_mask
                self.debug_gt_rescale = gt_rescaled
            else:
                feat_dist = self.masked_feat_dist(feat[lay], feat_imnet[lay])
        feat_dist = self.fdist_lambda * feat_dist
        feat_loss, feat_log = self._parse_losses(
            {'loss_imnet_feat_dist': feat_dist})
        feat_log.pop('loss', None)
        return feat_loss, feat_log

    def update_debug_state(self):
        debug = self.local_iter % self.debug_img_interval == 0
        self.get_model().automatic_debug = False
        self.get_model().debug = debug
        if not self.source_only:
            self.get_ema_model().automatic_debug = False
            self.get_ema_model().debug = debug
        if self.mic is not None:
            self.mic.debug = debug

    def get_pseudo_label_and_weight(self, logits):
        ema_softmax = torch.softmax(logits.detach(), dim=1)
        pseudo_prob, pseudo_label = torch.max(ema_softmax, dim=1)
        ps_large_p = pseudo_prob.ge(self.pseudo_threshold).long() == 1
        ps_size = np.size(np.array(pseudo_label.cpu()))
        pseudo_weight = torch.sum(ps_large_p).item() / ps_size
        pseudo_weight = pseudo_weight * torch.ones(
            pseudo_prob.shape, device=logits.device)
        return pseudo_label, pseudo_weight

    def filter_valid_pseudo_region(self, pseudo_weight, valid_pseudo_mask):
        if self.psweight_ignore_top > 0:
            # Don't trust pseudo-labels in regions with potential
            # rectification artifacts. This can lead to a pseudo-label
            # drift from sky towards building or traffic light.
            assert valid_pseudo_mask is None
            pseudo_weight[:, :self.psweight_ignore_top, :] = 0
        if self.psweight_ignore_bottom > 0:
            assert valid_pseudo_mask is None
            pseudo_weight[:, -self.psweight_ignore_bottom:, :] = 0
        if valid_pseudo_mask is not None:
            pseudo_weight *= valid_pseudo_mask.squeeze(1)
        return pseudo_weight 
    
    @force_fp32(apply_to=('rep', ))
    def upSample(self, rep, img, align_corners):
        """Bilinearly resize ``rep`` to the spatial size of ``img``.

        The resize runs without gradient tracking. The ``align_corners``
        flag, the resulting shape and the resulting tensor are written to
        the 'mmseg' log.

        Args:
            rep (Tensor): Tensor to resize.
            img (Tensor): Tensor whose dimensions from index 2 onwards give
                the target size.
            align_corners (bool): Forwarded to ``resize``.

        Returns:
            Tensor: The resized tensor.
        """
        with torch.no_grad():
            target_size = img.shape[2:]
            rep = resize(
                input=rep,
                size=target_size,
                mode='bilinear',
                align_corners=align_corners)
        log_messages = (
            f'self.align_corners: {align_corners}',
            f'rep2.shape: {rep.shape}',
            f'rep2: {rep}',
        )
        for message in log_messages:
            mmcv.print_log(message, 'mmseg')
        return rep
    
    
    def forward_train(self,
                      img,
                      img_metas,
                      gt_semantic_seg,
                      target_img,
                      target_img_metas,
                      rare_class=None,
                      valid_pseudo_mask=None,
                      target_gt_semantic_seg=None,
                      cls_dist=None,):
        """Forward function for training.

        Runs, in this order and each with its own backward pass: supervised
        training on the source images, the optional ImageNet feature
        distance, training on class-mixed source/target images with
        pseudo-labels (skipped in source-only mode), the prototype
        statistics update and the optional masked training. On debug
        iterations, figures are written to ``<work_dir>/debug``.

        Args:
            img (Tensor): Input images.
            img_metas (list[dict]): List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                `mmseg/datasets/pipelines/formatting.py:Collect`.
            gt_semantic_seg (Tensor): Semantic segmentation masks
                used if the architecture supports semantic segmentation task.
            target_img (Tensor): Target-domain images.
            target_img_metas (list[dict]): Image info dicts of the target
                images.
            rare_class: Accepted for interface compatibility; not used in
                this method.
            valid_pseudo_mask (Tensor | None): Optional mask of positions
                whose pseudo-labels may be trusted.
            target_gt_semantic_seg (Tensor | None): Target labels; only
                drawn into the debug figures.
            cls_dist (dict | None): Batched class-distribution statistics
                with the entries 'prob', 'bin_edges' and 'relation'. Only
                the first batch entry is used and handed to
                ``strong_transform``.

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        log_vars = {}
        batch_size = img.shape[0]
        dev = img.device

        # Reduce cls_dist to its first batch entry on the CPU. It stays None
        # when no statistics are given, so the name is always bound.
        cls_dist_cpu = None
        if cls_dist is not None:
            cls_dist_cpu = {
                'prob': {k: v[0].cpu() for k, v in cls_dist['prob'].items()},
                'bin_edges': cls_dist['bin_edges'][0].cpu(),
                'relation': {
                    k: v[0].cpu()
                    for k, v in cls_dist['relation'].items()
                },
            }
        del cls_dist

        # Init/update ema model
        if self.local_iter == 0:
            self._init_ema_weights()
        if self.local_iter > 0:
            self._update_ema(self.local_iter)
        if self.mic is not None:
            self.mic.update_weights(self.get_model(), self.local_iter)

        self.update_debug_state()
        seg_debug = {}

        means, stds = get_mean_std(img_metas, dev)
        strong_parameters = {
            'mix': None,
            'color_jitter': random.uniform(0, 1),
            'color_jitter_s': self.color_jitter_s,
            'color_jitter_p': self.color_jitter_p,
            'blur': random.uniform(0, 1) if self.blur else 0,
            'mean': means[0].unsqueeze(0),  # assume same normalization
            'std': stds[0].unsqueeze(0)
        }

        # Train on source images
        clean_losses = self.get_model().forward_train(
            img, img_metas, gt_semantic_seg, return_feat=True)
        src_feat = clean_losses.pop('features')
        seg_debug['Source'] = self.get_model().debug_output
        clean_loss, clean_log_vars = self._parse_losses(clean_losses)
        log_vars.update(clean_log_vars)
        clean_loss.backward(retain_graph=self.enable_fdist)
        if self.print_grad_magnitude:
            params = self.get_model().backbone.parameters()
            seg_grads = [
                p.grad.detach().clone() for p in params if p.grad is not None
            ]
            grad_mag = calc_grad_magnitude(seg_grads)
            mmcv.print_log(f'Seg. Grad.: {grad_mag}', 'mmseg')

        # ImageNet feature distance
        if self.enable_fdist:
            feat_loss, feat_log = self.calc_feat_dist(img, gt_semantic_seg,
                                                      src_feat)
            log_vars.update(add_prefix(feat_log, 'src'))
            feat_loss.backward()
            if self.print_grad_magnitude:
                params = self.get_model().backbone.parameters()
                fd_grads = [
                    p.grad.detach() for p in params if p.grad is not None
                ]
                fd_grads = [g2 - g1 for g1, g2 in zip(seg_grads, fd_grads)]
                grad_mag = calc_grad_magnitude(fd_grads)
                mmcv.print_log(f'Fdist Grad.: {grad_mag}', 'mmseg')
        del src_feat, clean_loss
        if self.enable_fdist:
            del feat_loss

        # Bound up front so that the later stages and the debug code can
        # test for None in source-only mode instead of hitting unbound names.
        pseudo_label, pseudo_weight = None, None
        mixed_img, mixed_lbl, mixed_seg_weight = None, None, None
        if not self.source_only:
            # Generate pseudo-label
            for m in self.get_ema_model().modules():
                if isinstance(m, (_DropoutNd, DropPath)):
                    m.training = False
            ema_logits = self.get_ema_model().generate_pseudo_label(
                target_img, target_img_metas)
            seg_debug['Target'] = self.get_ema_model().debug_output

            pseudo_label, pseudo_weight = self.get_pseudo_label_and_weight(
                ema_logits)
            del ema_logits

            pseudo_weight = self.filter_valid_pseudo_region(
                pseudo_weight, valid_pseudo_mask)
            gt_pixel_weight = torch.ones((pseudo_weight.shape), device=dev)

            # Apply mixing
            mixed_img, mixed_lbl = [None] * batch_size, [None] * batch_size
            mixed_seg_weight = pseudo_weight.clone()
            # The class statistics come from a JSON file on disk; read it
            # once and reuse the result instead of parsing it every
            # iteration.
            if self.rcs_classes is None:
                self.rcs_classes, self.rcs_classprob, self.freq = \
                    get_rcs_class_probs(self.data_root, self.rcs_class_temp)
            if self.rare_class_mix:
                mix_masks = get_rare_class_mask(
                    gt_semantic_seg, self.rcs_classprob, self.rcs_classes)
            else:
                mix_masks = get_class_masks(gt_semantic_seg)

            # cdmix hyperparameter
            strong_parameters['dist_mode'] = self.dist_mode
            strong_parameters['topk'] = self.topk

            for i in range(batch_size):
                strong_parameters['mix'] = mix_masks[i]
                mixed_img[i], mixed_lbl[i], mixed_seg_weight[i] = \
                    strong_transform(
                        strong_parameters,
                        data=torch.stack((img[i], target_img[i])),
                        target=torch.stack(
                            (gt_semantic_seg[i][0], pseudo_label[i])),
                        weight=torch.stack(
                            (gt_pixel_weight[i], pseudo_weight[i])),
                        cls_dist=cls_dist_cpu,
                    )

            del gt_pixel_weight
            mixed_img = torch.cat(mixed_img)
            mixed_lbl = torch.cat(mixed_lbl)

            # Train on mixed images
            mix_losses = self.get_model().forward_train(
                mixed_img,
                img_metas,
                mixed_lbl,
                seg_weight=mixed_seg_weight,
                return_feat=False,
            )
            seg_debug['Mixed'] = self.get_model().debug_output
            mix_losses = add_prefix(mix_losses, 'mix')
            mix_loss, mix_log_vars = self._parse_losses(mix_losses)
            log_vars.update(mix_log_vars)
            mix_loss.backward()

            del mix_loss

        # The prototype statistics need the EMA model and the pseudo-labels,
        # both of which only exist outside source-only mode. (The previous
        # test `mask_type == 'proto' or 'proto_prob'` was always true.)
        use_proto = (self.mask_type in ('proto', 'proto_prob')
                     and not self.source_only)
        # Up to and including iteration 20000 the estimator is fed through
        # front_update and the masking module is called without prototypes.
        in_warmup = self.local_iter <= 20000
        target_rep = None
        if use_proto:
            target_rep = self._update_prototype_statistics(
                img, img_metas, gt_semantic_seg, target_img,
                target_img_metas, pseudo_label, in_warmup)

        # Masked Training
        if self.enable_masking and self.mask_mode.startswith('separate'):
            mic_args = [
                self.get_model(), img, img_metas, gt_semantic_seg,
                target_img, target_img_metas, valid_pseudo_mask,
                pseudo_label, pseudo_weight, self.local_iter
            ]
            if use_proto and not in_warmup:
                mic_args += [
                    self.feat_estimator.Proto.detach(),
                    target_rep.detach()
                ]
            masked_loss = self.mic(*mic_args)
            seg_debug.update(self.mic.debug_output)
            masked_loss = add_prefix(masked_loss, 'masked')
            masked_loss, masked_log_vars = self._parse_losses(masked_loss)
            log_vars.update(masked_log_vars)
            masked_loss.backward()

            del masked_loss, pseudo_weight
        del target_rep

        if self.local_iter % self.debug_img_interval == 0:
            out_dir = os.path.join(self.train_cfg['work_dir'], 'debug')
            os.makedirs(out_dir, exist_ok=True)
            if not self.source_only:
                self._save_mix_debug_figures(out_dir, img, target_img,
                                             mixed_img, gt_semantic_seg,
                                             pseudo_label, mixed_lbl, means,
                                             stds, batch_size)
            if seg_debug['Source'] is not None:
                self._save_seg_debug_figures(out_dir, seg_debug,
                                             gt_semantic_seg,
                                             target_gt_semantic_seg,
                                             mixed_lbl, means, stds,
                                             batch_size)
        del seg_debug, mixed_img, mixed_lbl, mixed_seg_weight, pseudo_label
        self.local_iter += 1

        gc.collect()
        torch.cuda.empty_cache()
        return log_vars

    def _update_prototype_statistics(self, img, img_metas, gt_semantic_seg,
                                     target_img, target_img_metas,
                                     pseudo_label, in_warmup):
        """Feed EMA representations of both domains to the estimator.

        Returns the flattened target representation (one row per position)
        so that the caller can hand it to the masking module.
        """
        with torch.no_grad():
            _, _, H, W = img.shape
            # Both representations are resized to the source image size
            # divided by the mask block size.
            block_grid = (round(H / self.mask_block_size),
                          round(W / self.mask_block_size))
            source_rep = resize(
                input=self.get_ema_model().getRep(img, img_metas),
                size=block_grid,
                mode='bilinear',
                align_corners=False)
            target_rep = resize(
                input=self.get_ema_model().getRep(target_img,
                                                  target_img_metas),
                size=block_grid,
                mode='bilinear',
                align_corners=False)

            # source mask: downsample the ground-truth label
            B, A, Hs, Ws = source_rep.size()
            src_mask = F.interpolate(
                gt_semantic_seg.float(), size=(Hs, Ws),
                mode='nearest').long()
            src_mask = src_mask.contiguous().view(B * Hs * Ws)
            assert not src_mask.requires_grad

            # target mask: downsample the pseudo-label
            B, A, Ht, Wt = target_rep.size()
            tgt_mask = F.interpolate(
                pseudo_label.detach().float().unsqueeze(1),
                size=(Ht, Wt),
                mode='nearest').long()
            tgt_mask = tgt_mask.contiguous().view(B * Ht * Wt)
            assert not tgt_mask.requires_grad

            # Flatten to one feature row per position.
            source_rep = source_rep.permute(0, 2, 3, 1).contiguous().view(
                B * Hs * Ws, A)
            target_rep = target_rep.permute(0, 2, 3, 1).contiguous().view(
                B * Ht * Wt, A)

            # update feature-level statistics
            if in_warmup:
                self.feat_estimator.front_update(
                    features=source_rep.detach(), labels=src_mask)
                self.feat_estimator.front_update(
                    features=target_rep.detach(), labels=tgt_mask)
            else:
                self.feat_estimator.later_update(
                    source_features=source_rep.detach(),
                    source_labels=src_mask,
                    target_features=target_rep.detach(),
                    target_labels=tgt_mask)
        return target_rep

    def _save_mix_debug_figures(self, out_dir, img, target_img, mixed_img,
                                gt_semantic_seg, pseudo_label, mixed_lbl,
                                means, stds, batch_size):
        """Save one 2x3 figure per sample with images, labels and the mix."""
        vis_img = torch.clamp(denorm(img, means, stds), 0, 1)
        vis_trg_img = torch.clamp(denorm(target_img, means, stds), 0, 1)
        vis_mixed_img = torch.clamp(denorm(mixed_img, means, stds), 0, 1)
        rows, cols = 2, 3
        for j in range(batch_size):
            _, axs = plt.subplots(
                rows,
                cols,
                figsize=(3 * cols, 3 * rows),
                gridspec_kw={
                    'hspace': 0.1,
                    'wspace': 0,
                    'top': 0.95,
                    'bottom': 0,
                    'right': 1,
                    'left': 0
                },
            )
            subplotimg(axs[0][0], vis_img[j], 'Source Image')
            subplotimg(axs[0][1], vis_trg_img[j], 'Target Image')
            subplotimg(axs[0][2], vis_mixed_img[j], 'Mixed Image')
            subplotimg(
                axs[1][0],
                gt_semantic_seg[j],
                'Source Label',
                cmap='cityscapes')
            subplotimg(
                axs[1][1],
                pseudo_label[j],
                'Target Pseudo Label',
                cmap='cityscapes')
            if mixed_lbl is not None:
                subplotimg(
                    axs[1][2], mixed_lbl[j], 'Mixed Label', cmap='cityscapes')
            for ax in axs.flat:
                ax.axis('off')
            plt.savefig(
                os.path.join(out_dir,
                             f'{(self.local_iter + 1):06d}_{j}.png'))
            plt.close()

    def _save_seg_debug_figures(self, out_dir, seg_debug, gt_semantic_seg,
                                target_gt_semantic_seg, mixed_lbl, means,
                                stds, batch_size):
        """Save one 3x4 figure per sample with the collected debug outputs."""
        rows, cols = 3, 4
        # The third and fourth debug groups are drawn in swapped columns.
        swapped_cols = {2: 3, 3: 2}
        for j in range(batch_size):
            _, axs = plt.subplots(
                rows,
                cols,
                figsize=(5 * cols, 5 * rows),
                gridspec_kw={
                    'hspace': 0.1,
                    'wspace': 0,
                    'top': 0.95,
                    'bottom': 0,
                    'right': 1,
                    'left': 0
                },
                squeeze=False,
            )

            for k1, (n1, outs) in enumerate(seg_debug.items()):
                col = swapped_cols.get(k1, k1)
                for k2, (n2, out) in enumerate(outs.items()):
                    title = f'{n1} {n2}'
                    subplotimg(
                        axs[k2][col],
                        **prepare_debug_out(title, out[j], means, stds))
                    # Set the title font size.
                    axs[k2][col].set_title(title, fontsize=20)

            # The bottom row shows the labels.
            subplotimg(
                axs[2][0],
                gt_semantic_seg[j],
                'Source Label',
                cmap='cityscapes')
            axs[2][0].set_title('Source Label', fontsize=20)
            if mixed_lbl is not None:
                subplotimg(
                    axs[2][3], mixed_lbl[j], 'Mixed Label', cmap='cityscapes')
                axs[2][3].set_title('Mixed Label', fontsize=20)
            # Target labels are optional; skip their panels when absent.
            if target_gt_semantic_seg is not None:
                for col in (1, 2):
                    subplotimg(
                        axs[2][col],
                        target_gt_semantic_seg[j],
                        'Target Label',
                        cmap='cityscapes')
                    axs[2][col].set_title('Target Label', fontsize=20)

            axs[1][1].set_title('Target Prediction', fontsize=20)

            for ax in axs.flat:
                ax.axis('off')
            plt.savefig(
                os.path.join(out_dir,
                             f'{(self.local_iter + 1):06d}_{j}_s.png'))
            plt.close()