transformer.py

# -------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2021 OpenAI
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Modified by Jiarui Xu
# -------------------------------------------------------------------------
# Modified by Jilan Xu
# -------------------------------------------------------------------------
import torch
import torch.utils.checkpoint as checkpoint
from torch import nn

from .builder import MODELS
from .misc import Result
from .utils import ResidualAttentionBlock

from ipdb import set_trace  # debugging helper; unused in this module
import clip
from transformers import AutoModel
from timm.models.layers import DropPath, to_2tuple, trunc_normal_


class Transformer(nn.Module):

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

        # CLIP-style scaled initialization for the attention and MLP projections.
        proj_std = (self.width**-0.5) * ((2 * self.layers)**-0.5)
        attn_std = self.width**-0.5
        fc_std = (2 * self.width)**-0.5
        for block in self.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        self.use_checkpoint = use_checkpoint

    def forward(self, x: torch.Tensor):
        for i, resblock in enumerate(self.resblocks):
            if self.use_checkpoint:
                # Trade compute for memory: recompute the block during the backward pass.
                x = checkpoint.checkpoint(resblock, x)
            else:
                x = resblock(x)
        return x
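

# Usage sketch (not part of the original file): the Transformer above consumes
# sequences in (seq_len, batch, width) layout, as in CLIP's text encoder. The
# sizes below (width=512, heads=8, layers=12, length=77) are illustrative
# assumptions only.
#
#   blocks = Transformer(width=512, layers=12, heads=8, attn_mask=None)
#   tokens = torch.randn(77, 4, 512)   # (L, N, D)
#   feats = blocks(tokens)             # same shape: (77, 4, 512)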


@MODELS.register_module()
class DistilBert(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
        pretrained=True,
        fixed=True,
    ):
        super().__init__()
        self.transformer = AutoModel.from_pretrained('distilbert-base-uncased', output_hidden_states=True)
        self.transformer.train()
        self.width = width

        if fixed is True:
            # Freeze the pretrained encoder.
            for p in self.transformer.parameters():
                p.requires_grad = False

        if pretrained is False:
            self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x, as_dict=True):
        outs = Result(as_dict=as_dict)
        out_x = self.transformer(**x)
        out_hidden = out_x.last_hidden_state[:, 0, :]  # [CLS] token embedding
        last_hidden = out_x.hidden_states[-1]          # full last-layer token sequence
        outs.append(out_hidden, name='x')
        outs.append(last_hidden, name='all_tokens')
        return outs.as_return()
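

# Usage sketch (illustrative, assuming the matching HuggingFace tokenizer):
# `forward` unpacks its argument with **x, so it expects the dict a tokenizer
# returns (input_ids, attention_mask, ...). The same pattern applies to the
# Bert, Roberta and BertMedium wrappers below.
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')
#   text_encoder = DistilBert(context_length=77, width=768, layers=6, vocab_size=30522)
#   batch = tok(["a photo of a dog"], padding=True, return_tensors='pt')
#   out = text_encoder(dict(batch))   # out['x']: (1, 768) [CLS] feature
#                                     # out['all_tokens']: (1, L, 768) token features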


@MODELS.register_module()
class Bert(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
        pretrained=True,
        fixed=True,
    ):
        super().__init__()
        self.transformer = AutoModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.transformer.train()
        self.width = width

        if fixed is True:
            for p in self.transformer.parameters():
                p.requires_grad = False

        if pretrained is False:
            self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x, as_dict=True):
        outs = Result(as_dict=as_dict)
        out_x = self.transformer(**x)
        out_hidden = out_x.last_hidden_state[:, 0, :]
        last_hidden = out_x.hidden_states[-1]
        outs.append(out_hidden, name='x')
        outs.append(last_hidden, name='all_tokens')
        return outs.as_return()


@MODELS.register_module()
class Roberta(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
        pretrained=True,
        fixed=True,
    ):
        super().__init__()
        # Note: cache_dir points to a machine-specific checkpoint directory.
        self.transformer = AutoModel.from_pretrained('roberta-base', output_hidden_states=True, cache_dir='/mnt/petrelfs/xujilan/checkpoints/')
        self.transformer.train()
        self.width = width

        if fixed is True:
            for p in self.transformer.parameters():
                p.requires_grad = False

        if pretrained is False:
            self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x, question=None, as_dict=True):
        # `question` is accepted for interface compatibility but not used here.
        outs = Result(as_dict=as_dict)
        out_x = self.transformer(**x)
        out_hidden = out_x.last_hidden_state[:, 0, :]
        last_hidden = out_x.hidden_states[-1]
        outs.append(out_hidden, name='x')
        outs.append(last_hidden, name='all_tokens')
        return outs.as_return()


@MODELS.register_module()
class BertMedium(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
        pretrained=True,
        fixed=True,
    ):
        super().__init__()
        self.transformer = AutoModel.from_pretrained('prajjwal1/bert-medium', output_hidden_states=True)
        self.transformer.train()
        self.width = width

        if fixed is True:
            for p in self.transformer.parameters():
                p.requires_grad = False

        if pretrained is False:
            self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x, as_dict=True):
        outs = Result(as_dict=as_dict)
        out_x = self.transformer(**x)
        out_hidden = out_x.last_hidden_state[:, 0, :]
        last_hidden = out_x.hidden_states[-1]
        outs.append(out_hidden, name='x')
        outs.append(last_hidden, name='all_tokens')
        return outs.as_return()


@MODELS.register_module()
class TextTransformer(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
        pretrained=True,
        fixed=True,
    ):
        super().__init__()
        heads = width // 64
        self.context_length = context_length
        self.width = width
        self.transformer = Transformer(
            width=width,
            layers=layers,
            heads=heads,
            attn_mask=self.build_attention_mask(),
            use_checkpoint=use_checkpoint)

        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
        self.ln_final = nn.LayerNorm(width)

        self.token_embedding = nn.Embedding(vocab_size, width)
        nn.init.normal_(self.token_embedding.weight, std=0.02)

        # Borrow the text projection shape (and, optionally, the weights) from CLIP ViT-B/16.
        clip_model, _ = clip.load('ViT-B/16', device='cuda', jit=False)
        self.text_projection = nn.Parameter(torch.empty(clip_model.text_projection.shape))
        nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

        # initialization
        nn.init.normal_(self.positional_embedding, std=0.01)

        if pretrained:
            print('loading clip weights for text encoder')
            self.reload_clip_weights(clip_model)
        if fixed:
            print('freezing text encoder')
            self.freeze_text_encoder()

    def freeze_text_encoder(self):
        for p in self.parameters():
            p.requires_grad = False

    def reload_clip_weights(self, clip_model):
        text_dict = clip_model.state_dict()
        msg = self.load_state_dict(text_dict, strict=False)

    def build_attention_mask(self):
        # lazily create the causal attention mask; pytorch uses an additive
        # attention mask, so fill the upper triangle with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float('-inf'))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    def forward(self, text, *, as_dict=True):
        x = self.token_embedding(text)
        outs = Result(as_dict=as_dict)

        x = x + self.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        ### w/o text projection ###
        # all_tokens = x.clone()
        # x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)]

        ### w/ text projection ###
        all_tokens = x.clone() @ self.text_projection
        # Pool at the EOT token (the highest token id in the CLIP vocabulary).
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        outs.append(x, name='x')
        outs.append(all_tokens, name='all_tokens')
        return outs.as_return()
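

# Usage sketch (not part of the original file): TextTransformer mirrors CLIP's text
# encoder, so it pairs with clip.tokenize. ViT-B/16 text-encoder sizes
# (context_length=77, width=512, layers=12, vocab_size=49408) are assumed here purely
# for illustration; a CUDA device is required because __init__ loads CLIP onto 'cuda'.
#
#   text_encoder = TextTransformer(context_length=77, width=512, layers=12,
#                                  vocab_size=49408, pretrained=True, fixed=True).cuda()
#   texts = clip.tokenize(["a photo of a dog", "a photo of a cat"]).cuda()   # (2, 77)
#   out = text_encoder(texts)
#   # out['x']: (2, 512) pooled EOT features; out['all_tokens']: (2, 77, 512)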