lightglue.patch

diff --git a/lightglue/lightglue.py b/lightglue/lightglue.py
index 7a86214..7f8d690 100644
--- a/lightglue/lightglue.py
+++ b/lightglue/lightglue.py
@@ -5,7 +5,7 @@ import numpy as np
 import torch
 from torch import nn
 import torch.nn.functional as F
-from typing import Optional, List, Callable, Tuple
+from typing import Optional, List, Callable
 
 try:
     from flash_attn.modules.mha import FlashCrossAttention
@@ -23,10 +23,8 @@ torch.backends.cudnn.deterministic = True
 @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
 def normalize_keypoints(
         kpts: torch.Tensor,
-        size: Optional[torch.Tensor] = None) -> torch.Tensor:
-    if size is None:
-        size = 1 + kpts.max(-2).values - kpts.min(-2).values
-    elif not isinstance(size, torch.Tensor):
+        size: torch.Tensor) -> torch.Tensor:
+    if not isinstance(size, torch.Tensor):
         size = torch.tensor(size, device=kpts.device, dtype=kpts.dtype)
     size = size.to(kpts)
     shift = size / 2
@@ -35,17 +33,6 @@ def normalize_keypoints(
     return kpts
 
 
-def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor]:
-    if length <= x.shape[-2]:
-        return x, torch.ones_like(x[..., :1], dtype=torch.bool)
-    pad = torch.ones(*x.shape[:-2], length-x.shape[-2], x.shape[-1],
-                     device=x.device, dtype=x.dtype)
-    y = torch.cat([x, pad], dim=-2)
-    mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device)
-    mask[..., :x.shape[-2], :] = True
-    return y, mask
-
-
 def rotate_half(x: torch.Tensor) -> torch.Tensor:
     x = x.unflatten(-1, (-1, 2))
     x1, x2 = x.unbind(dim=-1)
@@ -85,8 +72,8 @@ class TokenConfidence(nn.Module):
     def forward(self, desc0: torch.Tensor, desc1: torch.Tensor):
         """ get confidence tokens """
         return (
-            self.token(desc0.detach()).squeeze(-1),
-            self.token(desc1.detach()).squeeze(-1))
+            self.token(desc0.detach().float()).squeeze(-1),
+            self.token(desc1.detach().float()).squeeze(-1))
 
 
 class Attention(nn.Module):
@@ -99,38 +86,29 @@ class Attention(nn.Module):
                 stacklevel=2,
             )
         self.enable_flash = allow_flash and FLASH_AVAILABLE
-        self.has_sdp = hasattr(F, 'scaled_dot_product_attention')
         if allow_flash and FlashCrossAttention:
             self.flash_ = FlashCrossAttention()
-        if self.has_sdp:
-            torch.backends.cuda.enable_flash_sdp(allow_flash)
 
-    def forward(self, q, k, v, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def forward(self, q, k, v) -> torch.Tensor:
         if self.enable_flash and q.device.type == 'cuda':
-            # use torch 2.0 scaled_dot_product_attention with flash
-            if self.has_sdp:
-                args = [x.half().contiguous() for x in [q, k, v]]
-                v = F.scaled_dot_product_attention(*args, attn_mask=mask).to(q.dtype)
-                return v if mask is None else v.nan_to_num()
-            else:
-                assert mask is None
-                q, k, v = [x.transpose(-2, -3).contiguous() for x in [q, k, v]]
+            if FlashCrossAttention:
+                q, k, v = [x.transpose(-2, -3) for x in [q, k, v]]
                 m = self.flash_(q.half(), torch.stack([k, v], 2).half())
-                return m.transpose(-2, -3).to(q.dtype).clone()
-        elif self.has_sdp:
+                return m.transpose(-2, -3).to(q.dtype)
+            else:  # use torch 2.0 scaled_dot_product_attention with flash
+                args = [x.half().contiguous() for x in [q, k, v]]
+                with torch.backends.cuda.sdp_kernel(enable_flash=True):
+                    return F.scaled_dot_product_attention(*args).to(q.dtype)
+        elif hasattr(F, 'scaled_dot_product_attention'):
             args = [x.contiguous() for x in [q, k, v]]
-            v = F.scaled_dot_product_attention(*args, attn_mask=mask)
-            return v if mask is None else v.nan_to_num()
+            return F.scaled_dot_product_attention(*args).to(q.dtype)
         else:
             s = q.shape[-1] ** -0.5
-            sim = torch.einsum('...id,...jd->...ij', q, k) * s
-            if mask is not None:
-                sim.masked_fill(~mask, -float('inf'))
-            attn = F.softmax(sim, -1)
+            attn = F.softmax(torch.einsum('...id,...jd->...ij', q, k) * s, -1)
             return torch.einsum('...ij,...jd->...id', attn, v)
 
 
-class SelfBlock(nn.Module):
+class Transformer(nn.Module):
     def __init__(self, embed_dim: int, num_heads: int,
                  flash: bool = False, bias: bool = True) -> None:
         super().__init__()
@@ -148,21 +126,24 @@ class SelfBlock(nn.Module):
             nn.Linear(2*embed_dim, embed_dim)
         )
 
-    def forward(self, x: torch.Tensor,
-                encoding: torch.Tensor,
-                mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+    def _forward(self, x: torch.Tensor,
+                 encoding: Optional[torch.Tensor] = None):
         qkv = self.Wqkv(x)
         qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
         q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2]
-        q = apply_cached_rotary_emb(encoding, q)
-        k = apply_cached_rotary_emb(encoding, k)
-        context = self.inner_attn(q, k, v, mask=mask)
+        if encoding is not None:
+            q = apply_cached_rotary_emb(encoding, q)
+            k = apply_cached_rotary_emb(encoding, k)
+        context = self.inner_attn(q, k, v)
         message = self.out_proj(
             context.transpose(1, 2).flatten(start_dim=-2))
         return x + self.ffn(torch.cat([x, message], -1))
 
+    def forward(self, x0, x1, encoding0=None, encoding1=None):
+        return self._forward(x0, encoding0), self._forward(x1, encoding1)
+
 
-class CrossBlock(nn.Module):
+class CrossTransformer(nn.Module):
     def __init__(self, embed_dim: int, num_heads: int,
                  flash: bool = False, bias: bool = True) -> None:
         super().__init__()
@@ -179,6 +160,7 @@ class CrossBlock(nn.Module):
             nn.GELU(),
             nn.Linear(2*embed_dim, embed_dim)
         )
+
         if flash and FLASH_AVAILABLE:
             self.flash = Attention(True)
         else:
@@ -187,27 +169,22 @@ class CrossBlock(nn.Module):
     def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor):
         return func(x0), func(x1)
 
-    def forward(self, x0: torch.Tensor, x1: torch.Tensor,
-                mask: Optional[torch.Tensor] = None) -> List[torch.Tensor]:
+    def forward(self, x0: torch.Tensor, x1: torch.Tensor) -> List[torch.Tensor]:
         qk0, qk1 = self.map_(self.to_qk, x0, x1)
         v0, v1 = self.map_(self.to_v, x0, x1)
         qk0, qk1, v0, v1 = map(
             lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2),
             (qk0, qk1, v0, v1))
-        if self.flash is not None and qk0.device.type == 'cuda':
-            m0 = self.flash(qk0, qk1, v1, mask)
-            m1 = self.flash(qk1, qk0, v0, mask.transpose(-1, -2) if mask is not None else None)
+        if self.flash is not None:
+            m0 = self.flash(qk0, qk1, v1)
+            m1 = self.flash(qk1, qk0, v0)
         else:
             qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5
-            sim = torch.einsum('bhid, bhjd -> bhij', qk0, qk1)
-            if mask is not None:
-                sim = sim.masked_fill(~mask, -float('inf'))
+            sim = torch.einsum('b h i d, b h j d -> b h i j', qk0, qk1)
             attn01 = F.softmax(sim, dim=-1)
             attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1)
             m0 = torch.einsum('bhij, bhjd -> bhid', attn01, v1)
             m1 = torch.einsum('bhji, bhjd -> bhid', attn10.transpose(-2, -1), v0)
-            if mask is not None:
-                m0, m1 = m0.nan_to_num(), m1.nan_to_num()
         m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2),
                            m0, m1)
         m0, m1 = self.map_(self.to_out, m0, m1)
@@ -216,40 +193,6 @@ class CrossBlock(nn.Module):
         return x0, x1
 
 
-class TransformerLayer(nn.Module):
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.self_attn = SelfBlock(*args, **kwargs)
-        self.cross_attn = CrossBlock(*args, **kwargs)
-
-    def forward(self,
-                desc0, desc1,
-                encoding0, encoding1,
-                mask0: Optional[torch.Tensor] = None,
-                mask1: Optional[torch.Tensor] = None):
-        if mask0 is not None and mask1 is not None:
-            return self.masked_forward(
-                desc0, desc1, encoding0, encoding1,
-                mask0, mask1
-            )
-        else:
-            desc0 = self.self_attn(desc0, encoding0)
-            desc1 = self.self_attn(desc1, encoding1)
-            return self.cross_attn(desc0, desc1)
-
-    # This part is compiled and allows padding inputs
-    def masked_forward(
-            self, desc0, desc1,
-            encoding0, encoding1,
-            mask0, mask1):
-        mask = mask0 & mask1.transpose(-1, -2)
-        mask0 = mask0 & mask0.transpose(-1, -2)
-        mask1 = mask1 & mask1.transpose(-1, -2)
-        desc0 = self.self_attn(desc0, encoding0, mask0)
-        desc1 = self.self_attn(desc1, encoding1, mask1)
-        return self.cross_attn(desc0, desc1, mask)
-
-
 def sigmoid_log_double_softmax(
         sim: torch.Tensor, z0: torch.Tensor, z1: torch.Tensor) -> torch.Tensor:
     """ create the log assignment matrix from logits and similarity"""
@@ -283,8 +226,10 @@ class MatchAssignment(nn.Module):
         scores = sigmoid_log_double_softmax(sim, z0, z1)
         return scores, sim
 
-    def get_matchability(self, desc: torch.Tensor):
-        return torch.sigmoid(self.matchability(desc)).squeeze(-1)
+    def scores(self, desc0: torch.Tensor, desc1: torch.Tensor):
+        m0 = torch.sigmoid(self.matchability(desc0)).squeeze(-1)
+        m1 = torch.sigmoid(self.matchability(desc1)).squeeze(-1)
+        return m0, m1
 
 
 def filter_matches(scores: torch.Tensor, th: float):
@@ -299,7 +244,10 @@ def filter_matches(scores: torch.Tensor, th: float):
     zero = max0_exp.new_tensor(0)
     mscores0 = torch.where(mutual0, max0_exp, zero)
     mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero)
-    valid0 = mutual0 & (mscores0 > th)
+    if th is not None:
+        valid0 = mutual0 & (mscores0 > th)
+    else:
+        valid0 = mutual0
     valid1 = mutual1 & valid0.gather(1, m1)
     m0 = torch.where(valid0, m0, -1)
     m1 = torch.where(valid1, m1, -1)
@@ -321,15 +269,6 @@ class LightGlue(nn.Module):
         'weights': None,
     }
 
-    # Point pruning involves an overhead (gather).
-    # Therefore, we only activate it if there are enough keypoints.
-    pruning_keypoint_thresholds = {
-        'cpu': -1,
-        'mps': -1,
-        'cuda': 1024,
-        'flash': 1536,
-    }
-
     required_data_keys = [
         'image0', 'image1']
 
@@ -360,19 +299,15 @@ class LightGlue(nn.Module):
         self.posenc = LearnableFourierPositionalEncoding(2, head_dim, head_dim)
 
         h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim
-
-        self.transformers = nn.ModuleList(
-            [TransformerLayer(d, h, conf.flash) for _ in range(n)]
-        )
-
+        self.self_attn = nn.ModuleList(
+            [Transformer(d, h, conf.flash) for _ in range(n)])
+        self.cross_attn = nn.ModuleList(
+            [CrossTransformer(d, h, conf.flash) for _ in range(n)])
         self.log_assignment = nn.ModuleList(
             [MatchAssignment(d) for _ in range(n)])
         self.token_confidence = nn.ModuleList([
             TokenConfidence(d) for _ in range(n-1)])
-        self.register_buffer('confidence_thresholds', torch.Tensor([
-            self.confidence_threshold(i) for i in range(self.conf.n_layers)]))
 
-        state_dict = None
         if features is not None:
             fname = f'{conf.weights}_{self.version}.pth'.replace('.', '-')
             state_dict = torch.hub.load_state_dict_from_url(
@@ -382,33 +317,8 @@ class LightGlue(nn.Module):
             path = Path(__file__).parent
             path = path / 'weights/{}.pth'.format(self.conf.weights)
             state_dict = torch.load(str(path), map_location='cpu')
-
-        if state_dict:
-            # rename old state dict entries
-            for i in range(self.conf.n_layers):
-                pattern = f'self_attn.{i}', f'transformers.{i}.self_attn'
-                state_dict = {k.replace(*pattern): v for k, v in state_dict.items()}
-                pattern = f'cross_attn.{i}', f'transformers.{i}.cross_attn'
-                state_dict = {k.replace(*pattern): v for k, v in state_dict.items()}
             self.load_state_dict(state_dict, strict=False)
 
-        # static lengths LightGlue is compiled for (only used with torch.compile)
-        self.static_lengths = None
-
-    def compile(self, mode='reduce-overhead',
-                static_lengths=[256, 512, 768, 1024, 1280, 1536]):
-        if self.conf.width_confidence != -1:
-            warnings.warn(
-                'Point pruning is partially disabled for compiled forward.',
-                stacklevel=2,
-            )
-
-        for i in range(self.conf.n_layers):
-            self.transformers[i].masked_forward = torch.compile(
-                self.transformers[i].masked_forward, mode=mode, fullgraph=True)
-
-        self.static_lengths = static_lengths
-
     def forward(self, data: dict) -> dict:
         """
         Match keypoints and descriptors between two images
@@ -437,16 +347,26 @@ class LightGlue(nn.Module):
         for key in self.required_data_keys:
             assert key in data, f'Missing key {key} in data'
         data0, data1 = data['image0'], data['image1']
-        kpts0, kpts1 = data0['keypoints'], data1['keypoints']
-        b, m, _ = kpts0.shape
-        b, n, _ = kpts1.shape
-        device = kpts0.device
+        kpts0_, kpts1_ = data0['keypoints'], data1['keypoints']
+        b, m, _ = kpts0_.shape
+        b, n, _ = kpts1_.shape
         size0, size1 = data0.get('image_size'), data1.get('image_size')
-        kpts0 = normalize_keypoints(kpts0, size0).clone()
-        kpts1 = normalize_keypoints(kpts1, size1).clone()
+        if size0 is None and 'image' in data0:
+            size0 = data0['image'].shape[-2:][::-1]
+        if size1 is None and 'image' in data1:
+            size1 = data1['image'].shape[-2:][::-1]
+
+        if size0 is not None and size1 is not None:
+            kpts0 = normalize_keypoints(kpts0_, size=size0)
+            kpts1 = normalize_keypoints(kpts1_, size=size1)
+        else:
+            kpts0, kpts1 = kpts0_, kpts1_
+
+        assert torch.all(kpts0 >= -1) and torch.all(kpts0 <= 1)
+        assert torch.all(kpts1 >= -1) and torch.all(kpts1 <= 1)
 
-        desc0 = data0['descriptors'].detach().contiguous()
-        desc1 = data1['descriptors'].detach().contiguous()
+        desc0 = data0['descriptors'].detach()
+        desc1 = data1['descriptors'].detach()
 
         assert desc0.shape[-1] == self.conf.input_dim
         assert desc1.shape[-1] == self.conf.input_dim
@@ -455,64 +375,51 @@ class LightGlue(nn.Module):
             desc0 = desc0.half()
             desc1 = desc1.half()
 
-        mask0, mask1 = None, None
-        c = max(m, n)
-        do_compile = self.static_lengths and c <= max(self.static_lengths)
-        if do_compile:
-            kn = min([k for k in self.static_lengths if k >= c])
-            desc0, mask0 = pad_to_length(desc0, kn)
-            desc1, mask1 = pad_to_length(desc1, kn)
-            kpts0, _ = pad_to_length(kpts0, kn)
-            kpts1, _ = pad_to_length(kpts1, kn)
         desc0 = self.input_proj(desc0)
         desc1 = self.input_proj(desc1)
+
         # cache positional embeddings
         encoding0 = self.posenc(kpts0)
         encoding1 = self.posenc(kpts1)
 
         # GNN + final_proj + assignment
         do_early_stop = self.conf.depth_confidence > 0
-        do_point_pruning = self.conf.width_confidence > 0 and not do_compile
-        pruning_th = self.pruning_min_kpts(device)
+        do_point_pruning = self.conf.width_confidence > 0
         if do_point_pruning:
-            ind0 = torch.arange(0, m, device=device)[None]
-            ind1 = torch.arange(0, n, device=device)[None]
+            ind0 = torch.arange(0, m, device=kpts0.device)[None]
+            ind1 = torch.arange(0, n, device=kpts0.device)[None]
             # We store the index of the layer at which pruning is detected.
             prune0 = torch.ones_like(ind0)
             prune1 = torch.ones_like(ind1)
         token0, token1 = None, None
         for i in range(self.conf.n_layers):
-            desc0, desc1 = self.transformers[i](desc0, desc1,
-                                                encoding0, encoding1,
-                                                mask0=mask0, mask1=mask1)
+            desc0, desc1 = self.self_attn[i](
+                desc0, desc1, encoding0, encoding1)
+            desc0, desc1 = self.cross_attn[i](desc0, desc1)
             if i == self.conf.n_layers - 1:
                 continue  # no early stopping or adaptive width at last layer
 
             if do_early_stop:
                 token0, token1 = self.token_confidence[i](desc0, desc1)
-                if self.check_if_stop(token0[..., :m, :], token1[..., :n, :], i, m+n):
+                if self.check_if_stop(token0, token1, i, m+n):
+                    break
+            if do_point_pruning:
+                scores0, scores1 = self.log_assignment[i].scores(desc0, desc1)
+                mask0 = self.get_pruning_mask(token0, scores0, i)
+                mask1 = self.get_pruning_mask(token1, scores1, i)
+                ind0, ind1 = ind0[mask0][None], ind1[mask1][None]
+                desc0, desc1 = desc0[mask0][None], desc1[mask1][None]
+                if desc0.shape[-2] == 0 or desc1.shape[-2] == 0:
                     break
-            if do_point_pruning and desc0.shape[-2] > pruning_th:
-                scores0 = self.log_assignment[i].get_matchability(desc0)
-                prunemask0 = self.get_pruning_mask(token0, scores0, i)
-                keep0 = torch.where(prunemask0)[1]
-                ind0 = ind0.index_select(1, keep0)
-                desc0 = desc0.index_select(1, keep0)
-                encoding0 = encoding0.index_select(-2, keep0)
+                encoding0 = encoding0[:, :, mask0][:, None]
+                encoding1 = encoding1[:, :, mask1][:, None]
                 prune0[:, ind0] += 1
-            if do_point_pruning and desc1.shape[-2] > pruning_th:
-                scores1 = self.log_assignment[i].get_matchability(desc1)
-                prunemask1 = self.get_pruning_mask(token1, scores1, i)
-                keep1 = torch.where(prunemask1)[1]
-                ind1 = ind1.index_select(1, keep1)
-                desc1 = desc1.index_select(1, keep1)
-                encoding1 = encoding1.index_select(-2, keep1)
                 prune1[:, ind1] += 1
 
-        desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :]
         scores, _ = self.log_assignment[i](desc0, desc1)
         m0, m1, mscores0, mscores1 = filter_matches(
             scores, self.conf.filter_threshold)
+
         matches, mscores = [], []
         for k in range(b):
             valid = m0[k] > -1
@@ -537,9 +444,6 @@ class LightGlue(nn.Module):
             mscores0_[:, ind0] = mscores0
             mscores1_[:, ind1] = mscores1
             m0, m1, mscores0, mscores1 = m0_, m1_, mscores0_, mscores1_
-        else:
-            prune0 = torch.ones_like(mscores0) * self.conf.n_layers
-            prune1 = torch.ones_like(mscores1) * self.conf.n_layers
 
         pred = {
             'matches0': m0,
@@ -549,10 +453,9 @@ class LightGlue(nn.Module):
             'stop': i+1,
             'matches': matches,
             'scores': mscores,
-            'prune0': prune0,
-            'prune1': prune1
         }
-
+        if do_point_pruning:
+            pred.update(dict(prune0=prune0, prune1=prune1))
         return pred
 
     def confidence_threshold(self, layer_index: int) -> float:
@@ -563,10 +466,11 @@ class LightGlue(nn.Module):
     def get_pruning_mask(self, confidences: torch.Tensor, scores: torch.Tensor,
                          layer_index: int) -> torch.Tensor:
         """ mask points which should be removed """
-        keep = scores > (1 - self.conf.width_confidence)
-        if confidences is not None:  # Low-confidence points are never pruned.
-            keep |= confidences <= self.confidence_thresholds[layer_index]
-        return keep
+        threshold = self.confidence_threshold(layer_index)
+        if confidences is not None:
+            scores = torch.where(
+                confidences > threshold, scores, scores.new_tensor(1.0))
+        return scores > (1 - self.conf.width_confidence)
 
     def check_if_stop(self,
                       confidences0: torch.Tensor,
@@ -574,12 +478,6 @@ class LightGlue(nn.Module):
                       layer_index: int, num_points: int) -> torch.Tensor:
         """ evaluate stopping condition"""
         confidences = torch.cat([confidences0, confidences1], -1)
-        threshold = self.confidence_thresholds[layer_index]
-        ratio_confident = 1.0 - (confidences < threshold).float().sum() / num_points
-        return ratio_confident > self.conf.depth_confidence
-
-    def pruning_min_kpts(self, device: torch.device):
-        if self.conf.flash and FLASH_AVAILABLE and device.type == 'cuda':
-            return self.pruning_keypoint_thresholds['flash']
-        else:
-            return self.pruning_keypoint_thresholds[device.type]
+        threshold = self.confidence_threshold(layer_index)
+        pos = 1.0 - (confidences < threshold).float().sum() / num_points
+        return pos > self.conf.depth_confidence