Refs

  1. GitHub repo: jacobmarks/awesome-clip-papers

1. The classic of classics: CLIP itself

Title: Learning Transferable Visual Models From Natural Language Supervision

The core of CLIP is Contrastive Language-Image Pre-training: an image encoder and a text encoder are trained jointly to predict the correct pairings within a batch of (image, text) training examples. The paper's numpy-style pseudocode:

```python
# image_encoder - ResNet or Vision Transformer
# text_encoder  - CBOW or Text Transformer
# I[n, h, w, c] - minibatch of aligned images
# T[n, l]       - minibatch of aligned texts
# W_i[d_i, d_e] - learned proj of image to embed
# W_t[d_t, d_e] - learned proj of text to embed
# t             - learned temperature parameter

# extract feature representations of each modality
I_f = image_encoder(I)  # [n, d_i]
T_f = text_encoder(T)   # [n, d_t]

# joint multimodal embedding [n, d_e]
I_e = l2_normalize(np.dot(I_f, W_i), axis=1)
T_e = l2_normalize(np.dot(T_f, W_t), axis=1)

# scaled pairwise cosine similarities [n, n]
logits = np.dot(I_e, T_e.T) * np.exp(t)

# symmetric loss function
labels = np.arange(n)
loss_i = cross_entropy_loss(logits, labels, axis=0)
loss_t = cross_entropy_loss(logits, labels, axis=1)
loss = (loss_i + loss_t) / 2
```
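
The pseudocode maps almost line-for-line onto real PyTorch. A minimal runnable sketch of the symmetric loss (my own; `clip_loss` and the toy shapes are illustrative, not from the paper):

```python
# Runnable PyTorch version of the symmetric contrastive loss above.
# image_features / text_features play the roles of I_f @ W_i and T_f @ W_t.
import torch
import torch.nn.functional as F

def clip_loss(image_features, text_features, t):
    # joint multimodal embeddings, L2-normalized
    I_e = F.normalize(image_features, dim=-1)
    T_e = F.normalize(text_features, dim=-1)

    # scaled pairwise cosine similarities [n, n]
    logits = I_e @ T_e.t() * t.exp()

    # the correct pairings sit on the diagonal
    labels = torch.arange(logits.size(0), device=logits.device)
    loss_i = F.cross_entropy(logits, labels)      # image -> text
    loss_t = F.cross_entropy(logits.t(), labels)  # text -> image
    return (loss_i + loss_t) / 2

# toy batch: 8 pairs, 512-d projected features
loss = clip_loss(torch.randn(8, 512), torch.randn(8, 512),
                 t=torch.tensor(2.6593))  # log(1/0.07), CLIP's init
print(loss)
```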

2. GroupViT: unsupervised semantic segmentation via group-token-to-text alignment

Title: GroupViT: Semantic Segmentation Emerges from Text Supervision

(Figure: the GroupViT workflow)
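
In short: learnable group tokens progressively merge patch tokens into arbitrary-shaped segments, and the pooled group embeddings are trained against text with a CLIP-style contrastive loss. A minimal sketch of the zero-shot segmentation step (my own reconstruction; names and shapes are illustrative, not the paper's code):

```python
# Sketch: turn group-text alignment into a segmentation map.
# Each group token is matched to its closest class-name embedding, and the
# soft pixel-to-group assignments from the grouping blocks carry that
# class label down to pixels.
import torch
import torch.nn.functional as F

def zero_shot_segment(group_embeds,  # [G, D] final group tokens
                      text_embeds,   # [K, D] class-name embeddings
                      assign_maps):  # [G, H, W] pixel-to-group assignment
    group_embeds = F.normalize(group_embeds, dim=-1)
    text_embeds = F.normalize(text_embeds, dim=-1)
    group_cls = (group_embeds @ text_embeds.t()).argmax(dim=-1)  # [G]
    pixel_group = assign_maps.argmax(dim=0)                      # [H, W]
    return group_cls[pixel_group]            # [H, W] per-pixel class ids
```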

3. YOLO-World: lightweight open-vocabulary inference via re-parameterization

Title: YOLO-World: Real-Time Open-Vocabulary Object Detection

(Figure: the RepVL-PAN structure)

Contrastive Text-Image Head:

```python
class ContrastiveHead(BaseModule):
    """Contrastive head for YOLO-World.

    Computes region-text scores from the similarity between
    image and text features.

    Args:
        embed_dims (int): embed dim of text and image features.
    """

    def __init__(self,
                 embed_dims: int,
                 init_cfg: OptConfigType = None,
                 use_einsum: bool = True) -> None:
        super().__init__(init_cfg=init_cfg)
        self.bias = nn.Parameter(torch.zeros([]))
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.use_einsum = use_einsum

    def forward(self, x: Tensor, w: Tensor) -> Tensor:
        """Forward function of contrastive learning."""
        x = F.normalize(x, dim=1, p=2)
        w = F.normalize(w, dim=-1, p=2)

        if self.use_einsum:
            x = torch.einsum('bchw,bkc->bkhw', x, w)
        else:
            batch, channel, height, width = x.shape
            _, k, _ = w.shape
            x = x.permute(0, 2, 3, 1)               # bchw -> bhwc
            x = x.reshape(batch, -1, channel)       # bhwc -> b(hw)c
            w = w.permute(0, 2, 1)                  # bkc -> bck
            x = torch.matmul(x, w)                  # b(hw)c @ bck -> b(hw)k
            x = x.reshape(batch, height, width, k)  # b(hw)k -> bhwk
            x = x.permute(0, 3, 1, 2)               # bhwk -> bkhw

        x = x * self.logit_scale.exp() + self.bias  # similarity score
        return x
```
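
Shape-wise, `x` is an image feature map `[B, C, H, W]` and `w` a batch of text embeddings `[B, K, C]`; the head returns region-text score maps `[B, K, H, W]`. A quick standalone check of the einsum path (a mimic with plain tensors, bias omitted since it initializes to zero):

```python
# Reproduce the head's forward with plain tensors to see the shapes.
import numpy as np
import torch
import torch.nn.functional as F

B, C, H, W, K = 2, 256, 20, 20, 80
x = torch.randn(B, C, H, W)  # image features from one pyramid level
w = torch.randn(B, K, C)     # one text embedding per vocabulary entry

x = F.normalize(x, dim=1, p=2)
w = F.normalize(w, dim=-1, p=2)
logit_scale = torch.tensor(np.log(1 / 0.07))
scores = torch.einsum('bchw,bkc->bkhw', x, w) * logit_scale.exp()
print(scores.shape)  # torch.Size([2, 80, 20, 20])
```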

Modality fusion at inference time:

The paper puts it this way:

> During inference, the offline vocabulary embeddings can be re-parameterized into weights of convolutional or linear layers for deployment.

(The `convert_head` snippet at the end of this section shows exactly this trick.) First, the training-time attention block, where the text guidance enters dynamically through `guide`:

```python
@MODELS.register_module()
class MaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block."""

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = embed_channels // num_heads
        self.use_einsum = use_einsum

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        self.guide_fc = Linear(guide_channels, embed_channels)
        self.bias = nn.Parameter(torch.zeros(num_heads))
        if with_scale:
            self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
        else:
            self.scale = 1.0

        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process."""
        B, _, H, W = x.shape

        guide = self.guide_fc(guide)
        guide = guide.reshape(B, -1, self.num_heads, self.head_channels)
        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)

        if self.use_einsum:
            attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
        else:
            batch, m, channel, height, width = embed.shape
            _, n, _, _ = guide.shape
            embed = embed.permute(0, 1, 3, 4, 2)
            embed = embed.reshape(batch, m, -1, channel)
            guide = guide.permute(0, 2, 3, 1)
            attn_weight = torch.matmul(embed, guide)
            attn_weight = attn_weight.reshape(batch, m, height, width, n)

        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid() * self.scale

        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
```
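
The `use_einsum=False` branch presumably exists for deployment backends that lack einsum support; a standalone check (mine) that the two branches agree:

```python
# Verify the einsum and permute/matmul paths compute the same weights.
import torch

B, M, C, H, W, N = 2, 4, 32, 10, 10, 80  # M heads, N vocabulary entries
embed = torch.randn(B, M, C, H, W)
guide = torch.randn(B, N, M, C)

a1 = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)

e = embed.permute(0, 1, 3, 4, 2).reshape(B, M, -1, C)  # b m (hw) c
g = guide.permute(0, 2, 3, 1)                          # b m c n
a2 = torch.matmul(e, g).reshape(B, M, H, W, N)

print(torch.allclose(a1, a2, atol=1e-5))  # True
```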

So how is the re-parameterization actually done?

```python
self.guide_weight = nn.Parameter(
    torch.zeros(guide_channels, embed_channels // num_heads,
                num_heads))
```

Unlike the base module, the guide_weight here is initialized directly as a learnable parameter instead of being projected from text features at runtime.
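
For a fixed offline vocabulary, the text side can be computed once and frozen. A sketch of the idea (mine; all names and shapes are illustrative, not the repo's API):

```python
# Sketch: precompute guide_fc(text_embeds) once for a fixed vocabulary,
# yielding a constant per-class guide that can be stored as a parameter
# like guide_weight above instead of being recomputed every forward pass.
import torch
import torch.nn as nn

num_classes, text_dim, embed_channels, num_heads = 80, 512, 128, 2
guide_fc = nn.Linear(text_dim, embed_channels)    # as in MaxSigmoidAttnBlock
text_embeds = torch.randn(num_classes, text_dim)  # offline vocabulary

with torch.no_grad():
    guide_weight = guide_fc(text_embeds).reshape(
        num_classes, num_heads, embed_channels // num_heads)
print(guide_weight.shape)  # torch.Size([80, 2, 64])
```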

The full re-parameterized attention block:

```python
class RepMatrixMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block."""

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0 and
                embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        self.bias = nn.Parameter(torch.zeros(num_heads))
        self.guide_weight = nn.Parameter(
            torch.zeros(guide_channels, embed_channels // num_heads,
                        num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Forward process."""
        B, _, H, W = x.shape

        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)

        batch, m, channel, height, width = embed.shape
        _, n, _, _ = self.guide_weight.shape
        # can be formulated to split conv
        embed = embed.permute(0, 1, 3, 4, 2)
        embed = embed.reshape(batch, m, -1, channel)
        attn_weight = torch.matmul(embed, self.guide_weight)
        attn_weight = attn_weight.reshape(batch, m, height, width, n)

        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid()

        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
```

Note that in forward, txt_feats is never actually used at all (not sure why it is even in the signature...).

Next, the text-guided CSPLayer is built by subclassing the CSPLayerWithTwoConv base class from mmyolo.

First, the CSP layer with max-sigmoid attention fused in:

```python
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers."""

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            guide_channels: int,
            embed_channels: int,
            num_heads: int = 1,
            expand_ratio: float = 0.5,
            num_blocks: int = 1,
            with_scale: bool = False,
            add_identity: bool = True,  # shortcut
            conv_cfg: OptConfigType = None,
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),
            init_cfg: OptMultiConfig = None,
            use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

        self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)

        self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg,
                                              use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process."""
        x_main = self.main_conv(x)
        x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
        x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
        # 2 split halves + num_blocks block outputs + 1 attn output
        # = (3 + num_blocks) chunks of mid_channels, matching final_conv
        x_main.append(self.attn_block(x_main[-1], guide))
        return self.final_conv(torch.cat(x_main, 1))
```
And its re-parameterized counterpart, identical except that it swaps in RepMatrixMaxSigmoidAttnBlock:

```python
@MODELS.register_module()
class RepMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers."""

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            guide_channels: int,
            embed_channels: int,
            num_heads: int = 1,
            expand_ratio: float = 0.5,
            num_blocks: int = 1,
            with_scale: bool = False,
            add_identity: bool = True,  # shortcut
            conv_cfg: OptConfigType = None,
            norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: ConfigType = dict(type='SiLU', inplace=True),
            init_cfg: OptMultiConfig = None,
            use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

        self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)

        self.attn_block = RepMatrixMaxSigmoidAttnBlock(
            self.mid_channels,
            self.mid_channels,
            embed_channels=embed_channels,
            guide_channels=guide_channels,
            num_heads=num_heads,
            with_scale=with_scale,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process."""
        x_main = self.main_conv(x)
        x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
        x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
        x_main.append(self.attn_block(x_main[-1], guide))
        return self.final_conv(torch.cat(x_main, 1))
```

The loss for unsupervised training, based on the coefficient of variation (std/mean):

```python
@MODELS.register_module()
class CoVMSELoss(nn.Module):

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        self.dim = dim
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.eps = eps

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Forward function of loss."""
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        # coefficient of variation (std / mean), regressed toward zero
        cov = pred.std(self.dim) / pred.mean(self.dim).clamp(min=self.eps)
        target = torch.zeros_like(cov)
        loss = self.loss_weight * mse_loss(
            cov, target, weight, reduction=reduction, avg_factor=avg_factor)
        return loss
```
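
The quantity pushed toward zero is the coefficient of variation `std/mean` along `dim`, so the loss rewards outputs that are uniform along that axis. A tiny standalone check (without the repo's `mse_loss` wrapper):

```python
# What CoVMSELoss penalizes: std/mean along a dimension, squared.
import torch

pred = torch.tensor([[1.0, 1.1, 0.9],
                     [5.0, 5.2, 4.8]])
cov = pred.std(1) / pred.mean(1).clamp(min=1e-6)
print(cov)                # near zero for nearly-uniform rows
print((cov ** 2).mean())  # mse against a zero target
```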

The re-parameterization procedure:

```python
def convert_head(scale, bias, text_embed):
    """Fold offline text embeddings into 1x1-conv weights and biases."""
    N, D = text_embed.shape
    # each class embedding, pre-scaled by the logit scale, becomes one
    # 1x1 convolution filter; the shared bias is broadcast per class
    weight = (text_embed * scale.exp()).view(N, D, 1, 1)
    bias = torch.ones(N) * bias
    return weight, bias
```
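
With those weights, the dynamic text branch collapses into an ordinary 1x1 convolution. A quick equivalence check (mine; assumes `convert_head` from above is in scope and that `scale`/`bias` are the learned `logit_scale`/`bias` of `ContrastiveHead`):

```python
# The re-parameterized 1x1 conv reproduces ContrastiveHead's scores.
import torch
import torch.nn.functional as F

B, C, H, W, N = 1, 64, 8, 8, 20
x = F.normalize(torch.randn(B, C, H, W), dim=1)      # image features
text_embed = F.normalize(torch.randn(N, C), dim=-1)  # offline vocabulary
scale, bias = torch.tensor(2.0), torch.tensor(-10.0)

# dynamic path: scaled cosine similarity per pixel
ref = torch.einsum('bchw,nc->bnhw', x, text_embed) * scale.exp() + bias

# re-parameterized path: text embeddings folded into a conv layer
weight, conv_bias = convert_head(scale, bias, text_embed)
out = F.conv2d(x, weight, conv_bias)
print(torch.allclose(ref, out, atol=1e-5))  # True
```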