# -------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2021 OpenAI
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Modified by Jiarui Xu
# -------------------------------------------------------------------------
import torch
import torch.utils.checkpoint as checkpoint
from torch import nn

from .builder import MODELS
from .misc import Result
from .utils import ResidualAttentionBlock


class Transformer(nn.Module):

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
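        # Depth-scaled initialisation: output projections are drawn with
        # std ~ width**-0.5 * (2 * layers)**-0.5 so the residual stream keeps
        # roughly constant variance as more blocks are stacked (the same
        # scheme CLIP uses for its transformer weights).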
        proj_std = (self.width**-0.5) * ((2 * self.layers)**-0.5)
        attn_std = self.width**-0.5
        fc_std = (2 * self.width)**-0.5
        for block in self.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        self.use_checkpoint = use_checkpoint

    def forward(self, x: torch.Tensor):
        for resblock in self.resblocks:
            if self.use_checkpoint:
                # trade compute for memory: re-run the block during the
                # backward pass instead of storing its activations
                x = checkpoint.checkpoint(resblock, x)
            else:
                x = resblock(x)
        return x


@MODELS.register_module()
class TextTransformer(nn.Module):

    def __init__(
        self,
        context_length: int,
        width: int,
        layers: int,
        vocab_size,
        use_checkpoint=False,
    ):
        super().__init__()
        heads = width // 64  # one attention head per 64 channels
        self.context_length = context_length
        self.width = width
        self.transformer = Transformer(
            width=width,
            layers=layers,
            heads=heads,
            attn_mask=self.build_attention_mask(),
            use_checkpoint=use_checkpoint)

        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
        self.ln_final = nn.LayerNorm(width)

        self.token_embedding = nn.Embedding(vocab_size, width)
        nn.init.normal_(self.token_embedding.weight, std=0.02)

        # initialization
        nn.init.normal_(self.positional_embedding, std=0.01)

    def build_attention_mask(self):
        # lazily create the causal attention mask for the text tokens
        # pytorch uses an additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float('-inf'))
        mask.triu_(1)  # zero out the diagonal and below
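        # e.g. with context_length == 4 the resulting additive mask is
        #   [[0., -inf, -inf, -inf],
        #    [0.,   0., -inf, -inf],
        #    [0.,   0.,   0., -inf],
        #    [0.,   0.,   0.,   0.]]
        # so position i can only attend to positions j <= i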
        return mask

    def forward(self, text, *, as_dict=False):
        x = self.token_embedding(text)
        outs = Result(as_dict=as_dict)

        x = x + self.positional_embedding
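        # the attention blocks operate on (seq_len, batch, dim) tensors
        # (the default nn.MultiheadAttention layout, as suggested by the
        # in_proj/out_proj attributes initialised above), hence the permutes
        # around the transformer call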
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)]

        outs.append(x, name='x')

        return outs.as_return()
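

if __name__ == '__main__':
    # Minimal usage sketch with illustrative hyper-parameters (not the values
    # of any particular config). Run it as a module, e.g.
    # `python -m <package>.transformer`, so the relative imports above resolve;
    # the exact return format of forward() is defined by .misc.Result.
    model = TextTransformer(context_length=77, width=256, layers=4, vocab_size=49408)
    tokens = torch.randint(0, 49408, (2, 77))  # [batch_size, context_length]
    out = model(tokens, as_dict=True)
    print(type(out))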