added code for training with bgr #254

Open · wants to merge 1 commit into base: master
24 changes: 12 additions & 12 deletions dataset/videomatte.py
@@ -9,14 +9,14 @@
 class VideoMatteDataset(Dataset):
     def __init__(self,
                  videomatte_dir,
-                 background_image_dir,
+                 # background_image_dir,
                  background_video_dir,
                  size,
                  seq_length,
                  seq_sampler,
                  transform=None):
-        self.background_image_dir = background_image_dir
-        self.background_image_files = os.listdir(background_image_dir)
+        # self.background_image_dir = background_image_dir
+        # self.background_image_files = os.listdir(background_image_dir)
         self.background_video_dir = background_video_dir
         self.background_video_clips = sorted(os.listdir(background_video_dir))
         self.background_video_frames = [sorted(os.listdir(os.path.join(background_video_dir, clip)))
@@ -38,10 +38,10 @@ def __len__(self):
         return len(self.videomatte_idx)
 
     def __getitem__(self, idx):
-        if random.random() < 0.5:
-            bgrs = self._get_random_image_background()
-        else:
-            bgrs = self._get_random_video_background()
+        # if random.random() < 0.5:
+        #     bgrs = self._get_random_image_background()
+        # else:
+        bgrs = self._get_random_video_background()
 
         fgrs, phas = self._get_videomatte(idx)
 
@@ -50,11 +50,11 @@ def __getitem__(self, idx):
 
         return fgrs, phas, bgrs
 
-    def _get_random_image_background(self):
-        with Image.open(os.path.join(self.background_image_dir, random.choice(self.background_image_files))) as bgr:
-            bgr = self._downsample_if_needed(bgr.convert('RGB'))
-        bgrs = [bgr] * self.seq_length
-        return bgrs
+    # def _get_random_image_background(self):
+    #     with Image.open(os.path.join(self.background_image_dir, random.choice(self.background_image_files))) as bgr:
+    #         bgr = self._downsample_if_needed(bgr.convert('RGB'))
+    #     bgrs = [bgr] * self.seq_length
+    #     return bgrs
 
     def _get_random_video_background(self):
         clip_idx = random.choice(range(len(self.background_video_clips)))
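
Reviewer note: this change removes the 50/50 choice between still-image and video backgrounds, so every training sample is now composited over a background video clip. A minimal construction sketch under that reading (the paths, sampler, and transform here are placeholders, not values from this PR):

    # Hypothetical usage: background_image_dir is gone from the signature,
    # so only a background *video* directory is supplied.
    dataset = VideoMatteDataset(
        videomatte_dir='datasets/VideoMatte240K/train',          # placeholder path
        background_video_dir='datasets/BackgroundVideos/train',  # placeholder path
        size=512,
        seq_length=15,
        seq_sampler=seq_sampler,   # whatever frame sampler the training script already uses
        transform=transform)       # paired augmentation for fgr/pha/bgr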
4 changes: 4 additions & 0 deletions inference.py
@@ -120,6 +120,10 @@ def convert_video(model,
     rec = [None] * 4
     for src in reader:
 
+        if src.shape[-1] % 2 == 1:
+            src = src[:, :, :, :-1]
+        if src.shape[-2] % 2 == 1:
+            src = src[:, :, :-1, :]
         if downsample_ratio is None:
             downsample_ratio = auto_downsample_ratio(*src.shape[2:])
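
Reviewer note: the added lines crop an odd width or height down to even before inference. A plausible reason (my reading, not stated in the PR) is that the network halves the resolution at each encoder stage, and odd input dimensions make the upsampled decoder features mismatch the skip connections. A standalone sketch of the same guard:

    import torch

    def crop_to_even(src: torch.Tensor) -> torch.Tensor:
        # Drop the last column/row when width or height is odd so that
        # repeated stride-2 downsampling and upsampling stay aligned.
        if src.shape[-1] % 2 == 1:
            src = src[..., :-1]
        if src.shape[-2] % 2 == 1:
            src = src[..., :-1, :]
        return src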
19 changes: 12 additions & 7 deletions inference_utils.py
@@ -5,7 +5,7 @@
 from torch.utils.data import Dataset
 from torchvision.transforms.functional import to_pil_image
 from PIL import Image
-
+import torch
 
 class VideoReader(Dataset):
     def __init__(self, path, transform=None):
@@ -55,18 +55,23 @@ def close(self):
 class ImageSequenceReader(Dataset):
     def __init__(self, path, transform=None):
         self.path = path
-        self.files = sorted(os.listdir(path))
+        self.files_fgr = sorted(os.listdir(path + "fgr/"))
+        self.files_bgr = sorted(os.listdir(path + "bgr/"))
         self.transform = transform
 
     def __len__(self):
-        return len(self.files)
+        return len(self.files_fgr)
 
     def __getitem__(self, idx):
-        with Image.open(os.path.join(self.path, self.files[idx])) as img:
-            img.load()
+        with Image.open(os.path.join(self.path + "fgr/", self.files_fgr[idx])) as fgr_img:
+            fgr_img.load()
+
+        with Image.open(os.path.join(self.path + "bgr/", self.files_bgr[idx])) as bgr_img:
+            bgr_img.load()
+
         if self.transform is not None:
-            return self.transform(img)
-        return img
+            return torch.cat([self.transform(fgr_img), self.transform(bgr_img)], dim=0)
+        return fgr_img
 
 
 class ImageSequenceWriter:
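
Reviewer note: ImageSequenceReader now expects two parallel subfolders under path and stacks each foreground frame with its same-named background frame into one 6-channel tensor. Two caveats worth flagging: path + "fgr/" only resolves correctly when path ends with a slash (os.path.join would be safer), and when no transform is given only the foreground image is returned. An assumed layout and usage sketch (directory name is a placeholder):

    # input_dir/
    #   fgr/0000.png, 0001.png, ...   foreground frames
    #   bgr/0000.png, 0001.png, ...   background frames, same names and order
    from torchvision.transforms import ToTensor

    reader = ImageSequenceReader('input_dir/', transform=ToTensor())
    sample = reader[0]   # shape (6, H, W): fgr RGB stacked on bgr RGB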
10 changes: 5 additions & 5 deletions model/decoder.py
@@ -1,18 +1,18 @@
 import torch
 from torch import Tensor
 from torch import nn
-from torch.nn import functional as F
+# from torch.nn import functional as F
 from typing import Tuple, Optional
 
 class RecurrentDecoder(nn.Module):
     def __init__(self, feature_channels, decoder_channels):
         super().__init__()
         self.avgpool = AvgPool()
         self.decode4 = BottleneckBlock(feature_channels[3])
-        self.decode3 = UpsamplingBlock(feature_channels[3], feature_channels[2], 3, decoder_channels[0])
-        self.decode2 = UpsamplingBlock(decoder_channels[0], feature_channels[1], 3, decoder_channels[1])
-        self.decode1 = UpsamplingBlock(decoder_channels[1], feature_channels[0], 3, decoder_channels[2])
-        self.decode0 = OutputBlock(decoder_channels[2], 3, decoder_channels[3])
+        self.decode3 = UpsamplingBlock(feature_channels[3], feature_channels[2], 6, decoder_channels[0])
+        self.decode2 = UpsamplingBlock(decoder_channels[0], feature_channels[1], 6, decoder_channels[1])
+        self.decode1 = UpsamplingBlock(decoder_channels[1], feature_channels[0], 6, decoder_channels[2])
+        self.decode0 = OutputBlock(decoder_channels[2], 6, decoder_channels[3])
 
     def forward(self,
                 s0: Tensor, f1: Tensor, f2: Tensor, f3: Tensor, f4: Tensor,
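
Reviewer note: as I read the decoder, the changed argument is the channel count of the downsampled source that gets concatenated back in at each scale; it goes from 3 to 6 because the source now carries foreground RGB plus background RGB. A shape sketch of the new input (sizes are illustrative):

    import torch

    fgr = torch.rand(1, 3, 256, 256)    # foreground frame
    bgr = torch.rand(1, 3, 256, 256)    # known background frame
    src = torch.cat([fgr, bgr], dim=1)  # (1, 6, 256, 256): the source the decoder now expects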
31 changes: 28 additions & 3 deletions model/mobilenetv3.py
@@ -3,6 +3,21 @@
 from torchvision.models.mobilenetv3 import MobileNetV3, InvertedResidualConfig
 from torchvision.transforms.functional import normalize
 
+def load_matched_state_dict(model, state_dict, print_stats=True):
+    """
+    Only loads weights that match in key and shape. Ignores other weights.
+    """
+    num_matched, num_total = 0, 0
+    curr_state_dict = model.state_dict()
+    for key in curr_state_dict.keys():
+        num_total += 1
+        if key in state_dict and curr_state_dict[key].shape == state_dict[key].shape:
+            curr_state_dict[key] = state_dict[key]
+            num_matched += 1
+    model.load_state_dict(curr_state_dict)
+    if print_stats:
+        print(f'Loaded state_dict: {num_matched}/{num_total} matched')
+
 class MobileNetV3LargeEncoder(MobileNetV3):
     def __init__(self, pretrained: bool = False):
         super().__init__(
@@ -27,14 +42,24 @@ def __init__(self, pretrained: bool = False):
         )
 
         if pretrained:
-            self.load_state_dict(torch.hub.load_state_dict_from_url(
-                'https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth'))
+            pretrained_state_dict = torch.hub.load_state_dict_from_url(
+                'https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth')
+
+            load_matched_state_dict(self, pretrained_state_dict)
+
             del self.avgpool
             del self.classifier
 
     def forward_single_frame(self, x):
-        x = normalize(x, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+        x = torch.cat((normalize(x[:, :3, ...], [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), normalize(x[:, 3:, ...], [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])), dim=-3)
 
         x = self.features[0](x)
         x = self.features[1](x)
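
Reviewer note: two things happen here. The forward pass normalizes the foreground (first 3) and background (last 3) channels separately with the same ImageNet statistics before re-concatenating them. And load_matched_state_dict is what lets the widened encoder reuse the ImageNet checkpoint: any tensor whose shape no longer matches (presumably the stem convolution, whose input went from 3 to 6 channels) stays at its fresh initialization while everything else loads. A usage sketch with an illustrative, not actual, match count:

    encoder = MobileNetV3LargeEncoder(pretrained=True)
    # prints something like: Loaded state_dict: 310/312 matched   (illustrative numbers)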
5 changes: 3 additions & 2 deletions model/model.py
@@ -1,6 +1,7 @@
 import torch
 from torch import Tensor
 from torch import nn
+from torchsummary import summary
 from torch.nn import functional as F
 from typing import Optional, List
 
@@ -58,8 +59,8 @@ def forward(self,
         if not segmentation_pass:
             fgr_residual, pha = self.project_mat(hid).split([3, 1], dim=-3)
             if downsample_ratio != 1:
-                fgr_residual, pha = self.refiner(src, src_sm, fgr_residual, pha, hid)
-            fgr = fgr_residual + src
+                fgr_residual, pha = self.refiner(src[:, :, :3, ...], src_sm[:, :, :3, ...], fgr_residual, pha, hid)
+            fgr = fgr_residual + src[:, :, :3, ...]
             fgr = fgr.clamp(0., 1.)
             pha = pha.clamp(0., 1.)
             return [fgr, pha, *rec]
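
Reviewer note: with the source widened to fgr+bgr, the predicted foreground must be composed from the foreground channels only, hence the [:, :, :3, ...] slices; the torchsummary import is unused in the shown hunk and pairs with the new torchsummary requirement, presumably for inspecting model shapes. A shape sketch (dimensions illustrative):

    import torch

    src = torch.rand(2, 4, 6, 224, 224)   # (batch, time, fgr RGB + bgr RGB, H, W)
    fgr_rgb = src[:, :, :3, ...]          # (2, 4, 3, 224, 224): foreground channels only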
10 changes: 6 additions & 4 deletions requirements_training.txt
@@ -1,5 +1,7 @@
 easing_functions==1.0.4
-tensorboard==2.5.0
-torch==1.9.0
-torchvision==0.10.0
-tqdm==4.61.1
+tensorboard
+torch
+torchvision
+tqdm==4.61.1
+opencv-python==4.6.0.66
+torchsummary