clipmodel.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. from collections import OrderedDict
  2. from typing import Tuple, Union
  3. import numpy as np
  4. import torch
  5. import torch.nn.functional as F
  6. from torch import nn
  7. from ipdb import set_trace
  8. class Bottleneck(nn.Module):
  9. expansion = 4
  10. def __init__(self, inplanes, planes, stride=1):
  11. super().__init__()
  12. # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
  13. self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
  14. self.bn1 = nn.BatchNorm2d(planes)
  15. self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
  16. self.bn2 = nn.BatchNorm2d(planes)
  17. self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
  18. self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
  19. self.bn3 = nn.BatchNorm2d(planes * self.expansion)
  20. self.relu = nn.ReLU(inplace=True)
  21. self.downsample = None
  22. self.stride = stride
  23. if stride > 1 or inplanes != planes * Bottleneck.expansion:
  24. # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
  25. self.downsample = nn.Sequential(OrderedDict([
  26. ("-1", nn.AvgPool2d(stride)),
  27. ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
  28. ("1", nn.BatchNorm2d(planes * self.expansion))
  29. ]))
  30. def forward(self, x: torch.Tensor):
  31. identity = x
  32. out = self.relu(self.bn1(self.conv1(x)))
  33. out = self.relu(self.bn2(self.conv2(out)))
  34. out = self.avgpool(out)
  35. out = self.bn3(self.conv3(out))
  36. if self.downsample is not None:
  37. identity = self.downsample(x)
  38. out += identity
  39. out = self.relu(out)
  40. return out
  41. class AttentionPool2d(nn.Module):
  42. def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
  43. super().__init__()
  44. self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
  45. self.k_proj = nn.Linear(embed_dim, embed_dim)
  46. self.q_proj = nn.Linear(embed_dim, embed_dim)
  47. self.v_proj = nn.Linear(embed_dim, embed_dim)
  48. self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
  49. self.num_heads = num_heads
  50. def forward(self, x):
  51. x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
  52. x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
  53. x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
  54. x, _ = F.multi_head_attention_forward(
  55. query=x, key=x, value=x,
  56. embed_dim_to_check=x.shape[-1],
  57. num_heads=self.num_heads,
  58. q_proj_weight=self.q_proj.weight,
  59. k_proj_weight=self.k_proj.weight,
  60. v_proj_weight=self.v_proj.weight,
  61. in_proj_weight=None,
  62. in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
  63. bias_k=None,
  64. bias_v=None,
  65. add_zero_attn=False,
  66. dropout_p=0,
  67. out_proj_weight=self.c_proj.weight,
  68. out_proj_bias=self.c_proj.bias,
  69. use_separate_proj_weight=True,
  70. training=self.training,
  71. need_weights=False
  72. )
  73. return x[0]
  74. class ModifiedResNet(nn.Module):
  75. """
  76. A ResNet class that is similar to torchvision's but contains the following changes:
  77. - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
  78. - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
  79. - The final pooling layer is a QKV attention instead of an average pool
  80. """
  81. def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
  82. super().__init__()
  83. self.output_dim = output_dim
  84. self.input_resolution = input_resolution
  85. # the 3-layer stem
  86. self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
  87. self.bn1 = nn.BatchNorm2d(width // 2)
  88. self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
  89. self.bn2 = nn.BatchNorm2d(width // 2)
  90. self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
  91. self.bn3 = nn.BatchNorm2d(width)
  92. self.avgpool = nn.AvgPool2d(2)
  93. self.relu = nn.ReLU(inplace=True)
  94. # residual layers
  95. self._inplanes = width # this is a *mutable* variable used during construction
  96. self.layer1 = self._make_layer(width, layers[0])
  97. self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
  98. self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
  99. self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
  100. embed_dim = width * 32 # the ResNet feature dimension
  101. self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
  102. def _make_layer(self, planes, blocks, stride=1):
  103. layers = [Bottleneck(self._inplanes, planes, stride)]
  104. self._inplanes = planes * Bottleneck.expansion
  105. for _ in range(1, blocks):
  106. layers.append(Bottleneck(self._inplanes, planes))
  107. return nn.Sequential(*layers)
  108. def forward(self, x):
  109. def stem(x):
  110. for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
  111. x = self.relu(bn(conv(x)))
  112. x = self.avgpool(x)
  113. return x
  114. x = x.type(self.conv1.weight.dtype)
  115. x = stem(x)
  116. x = self.layer1(x)
  117. x = self.layer2(x)
  118. x = self.layer3(x)
  119. x = self.layer4(x)
  120. x = self.attnpool(x)
  121. return x
  122. class LayerNorm(nn.LayerNorm):
  123. """Subclass torch's LayerNorm to handle fp16."""
  124. def forward(self, x: torch.Tensor):
  125. orig_type = x.dtype
  126. ret = super().forward(x.type(torch.float32))
  127. return ret.type(orig_type)
  128. class QuickGELU(nn.Module):
  129. def forward(self, x: torch.Tensor):
  130. return x * torch.sigmoid(1.702 * x)
  131. class ResidualAttentionBlock(nn.Module):
  132. def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
  133. super().__init__()
  134. self.attn = nn.MultiheadAttention(d_model, n_head)
  135. self.ln_1 = LayerNorm(d_model)
  136. self.mlp = nn.Sequential(OrderedDict([
  137. ("c_fc", nn.Linear(d_model, d_model * 4)),
  138. ("gelu", QuickGELU()),
  139. ("c_proj", nn.Linear(d_model * 4, d_model))
  140. ]))
  141. self.ln_2 = LayerNorm(d_model)
  142. self.attn_mask = attn_mask
  143. def attention(self, x: torch.Tensor):
  144. self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
  145. return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
  146. def forward(self, x: torch.Tensor):
  147. x = x + self.attention(self.ln_1(x))
  148. x = x + self.mlp(self.ln_2(x))
  149. return x
  150. class Transformer(nn.Module):
  151. def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
  152. super().__init__()
  153. self.width = width
  154. self.layers = layers
  155. self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
  156. def forward(self, x: torch.Tensor):
  157. return self.resblocks(x)
  158. class VisionTransformer(nn.Module):
  159. def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
  160. super().__init__()
  161. self.input_resolution = input_resolution
  162. self.output_dim = output_dim
  163. self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
  164. scale = width ** -0.5
  165. self.class_embedding = nn.Parameter(scale * torch.randn(width))
  166. self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
  167. self.ln_pre = LayerNorm(width)
  168. self.transformer = Transformer(width, layers, heads)
  169. self.ln_post = LayerNorm(width)
  170. self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
  171. ### return all features ####
  172. def forward_features(self, x: torch.Tensor, pos_embedding: torch.Tensor):
  173. x = self.conv1(x) # shape = [*, width, grid, grid]
  174. x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
  175. x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
  176. ### training time or zero-shot eval time, keep class embedding (when pos and x have different token shapes
  177. # if pos_embedding.ndim == 2 and x.size(1) != pos_embedding.size(0) or \
  178. # pos_embedding.ndim == 3 and x.size(1) != pos_embedding.size(1):
  179. # x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
  180. # # pass
  181. # x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
  182. # set_trace()
  183. #### Note that we maintain the pos_embed size to match the input
  184. # x = x + self.positional_embedding.to(x.dtype)
  185. x = x + pos_embedding.to(x.dtype)
  186. x = self.ln_pre(x)
  187. x = x.permute(1, 0, 2) # NLD -> LND
  188. x = self.transformer(x)
  189. x = x.permute(1, 0, 2) # LND -> NLD
  190. # x = x[:,0,:]
  191. ### added ###
  192. x = self.ln_post(x)
  193. if self.proj is not None:
  194. x = x @ self.proj
  195. return x
  196. def forward_features_with_clstoken(self, x: torch.Tensor, pos_embedding: torch.Tensor):
  197. x = self.conv1(x) # shape = [*, width, grid, grid]
  198. x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
  199. x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
  200. x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
  201. x = x + self.positional_embedding.to(x.dtype)
  202. x = self.ln_pre(x)
  203. x = x.permute(1, 0, 2) # NLD -> LND
  204. x = self.transformer(x)
  205. x = x.permute(1, 0, 2) # LND -> NLD
  206. cls_token = self.ln_post(x[:, :1, :]) # cls token
  207. img_token = self.ln_post(x[:, 1:, :]) # img token
  208. ### without projection to image-text space ###
  209. return cls_token, img_token
  210. def forward_features_with_prompts(self, x: torch.Tensor, prompt_token: torch.Tensor, pos_embedding: torch.Tensor):
  211. x = self.conv1(x) # shape = [*, width, grid, grid]
  212. x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
  213. x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
  214. x = x + pos_embedding.to(x.dtype)
  215. # [1, hw, d] || [1, 16, d] == [1, hw+16, d]
  216. x = torch.cat((prompt_token, x), dim=1)
  217. x = self.ln_pre(x)
  218. x = x.permute(1, 0, 2) # NLD -> LND
  219. x = self.transformer(x)
  220. x = x.permute(1, 0, 2) # LND -> NLD
  221. # x = x[:,0,:]
  222. ### added ###
  223. x = self.ln_post(x)
  224. if self.proj is not None:
  225. x = x @ self.proj
  226. return x
  227. ### return cls token ####
  228. def forward_multiscale_features(self, x: torch.Tensor, pos_embedding: torch.Tensor):
  229. x = self.conv1(x) # shape = [*, width, grid, grid]
  230. x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
  231. x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
  232. x = x + pos_embedding.to(x.dtype)
  233. x = self.ln_pre(x)
  234. x = x.permute(1, 0, 2) # NLD -> LND
  235. all_x = []
  236. for i in range(12):
  237. x = self.transformer.resblocks[i]
  238. if (i and i % 3 == 0) or i == 11: # 3, 6, 9
  239. cur_x = x.permute(1, 0, 2)
  240. cur_x = self.ln_post(cur_x)
  241. cur_x = cur_x @ self.proj
  242. all_x.append(cur_x)
  243. # x = self.transformer(x)
  244. x = x.permute(1, 0, 2) # LND -> NLD
  245. # x = x[:,0,:]
  246. ### added ###
  247. x = self.ln_post(x)
  248. if self.proj is not None:
  249. x = x @ self.proj
  250. return x
  251. ### return cls token ####
  252. def forward(self, x: torch.Tensor):
  253. x = self.conv1(x) # shape = [*, width, grid, grid]
  254. x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
  255. x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
  256. x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
  257. x = x + self.positional_embedding.to(x.dtype)
  258. x = self.ln_pre(x)
  259. x = x.permute(1, 0, 2) # NLD -> LND
  260. x = self.transformer(x)
  261. x = x.permute(1, 0, 2) # LND -> NLD
  262. x = self.ln_post(x[:, 0, :])
  263. if self.proj is not None:
  264. x = x @ self.proj
  265. return x
  266. class CLIP(nn.Module):
  267. def __init__(self,
  268. embed_dim: int,
  269. # vision
  270. image_resolution: int,
  271. vision_layers: Union[Tuple[int, int, int, int], int],
  272. vision_width: int,
  273. vision_patch_size: int,
  274. # text
  275. context_length: int,
  276. vocab_size: int,
  277. transformer_width: int,
  278. transformer_heads: int,
  279. transformer_layers: int
  280. ):
  281. super().__init__()
  282. self.context_length = context_length
  283. if isinstance(vision_layers, (tuple, list)):
  284. vision_heads = vision_width * 32 // 64
  285. self.visual = ModifiedResNet(
  286. layers=vision_layers,
  287. output_dim=embed_dim,
  288. heads=vision_heads,
  289. input_resolution=image_resolution,
  290. width=vision_width
  291. )
  292. else:
  293. vision_heads = vision_width // 64
  294. self.visual = VisionTransformer(
  295. input_resolution=image_resolution,
  296. patch_size=vision_patch_size,
  297. width=vision_width,
  298. layers=vision_layers,
  299. heads=vision_heads,
  300. output_dim=embed_dim
  301. )
  302. self.transformer = Transformer(
  303. width=transformer_width,
  304. layers=transformer_layers,
  305. heads=transformer_heads,
  306. attn_mask=self.build_attention_mask()
  307. )
  308. self.vocab_size = vocab_size
  309. self.token_embedding = nn.Embedding(vocab_size, transformer_width)
  310. self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
  311. self.ln_final = LayerNorm(transformer_width)
  312. self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
  313. self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
  314. self.initialize_parameters()
  315. def initialize_parameters(self):
  316. nn.init.normal_(self.token_embedding.weight, std=0.02)
  317. nn.init.normal_(self.positional_embedding, std=0.01)
  318. if isinstance(self.visual, ModifiedResNet):
  319. if self.visual.attnpool is not None:
  320. std = self.visual.attnpool.c_proj.in_features ** -0.5
  321. nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
  322. nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
  323. nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
  324. nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
  325. for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
  326. for name, param in resnet_block.named_parameters():
  327. if name.endswith("bn3.weight"):
  328. nn.init.zeros_(param)
  329. proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
  330. attn_std = self.transformer.width ** -0.5
  331. fc_std = (2 * self.transformer.width) ** -0.5
  332. for block in self.transformer.resblocks:
  333. nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
  334. nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
  335. nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
  336. nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
  337. if self.text_projection is not None:
  338. nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
  339. def build_attention_mask(self):
  340. # lazily create causal attention mask, with full attention between the vision tokens
  341. # pytorch uses additive attention mask; fill with -inf
  342. mask = torch.empty(self.context_length, self.context_length)
  343. mask.fill_(float("-inf"))
  344. mask.triu_(1) # zero out the lower diagonal
  345. return mask
  346. @property
  347. def dtype(self):
  348. return self.visual.conv1.weight.dtype
  349. def encode_image(self, image):
  350. return self.visual(image.type(self.dtype))
  351. def encode_text(self, text):
  352. x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
  353. x = x + self.positional_embedding.type(self.dtype)
  354. x = x.permute(1, 0, 2) # NLD -> LND
  355. x = self.transformer(x)
  356. x = x.permute(1, 0, 2) # LND -> NLD
  357. x = self.ln_final(x).type(self.dtype)
  358. # x.shape = [batch_size, n_ctx, transformer.width]
  359. # take features from the eot embedding (eot_token is the highest number in each sequence)
  360. x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
  361. return x
  362. def forward(self, image, text):
  363. image_features = self.encode_image(image)
  364. text_features = self.encode_text(text)
  365. # normalized features
  366. image_features = image_features / image_features.norm(dim=1, keepdim=True)
  367. text_features = text_features / text_features.norm(dim=1, keepdim=True)
  368. # cosine similarity as logits
  369. logit_scale = self.logit_scale.exp()
  370. logits_per_image = logit_scale * image_features @ text_features.t()
  371. logits_per_text = logits_per_image.t()
  372. # shape = [global_batch_size, global_batch_size]
  373. return logits_per_image, logits_per_text
  374. def convert_weights(model: nn.Module):
  375. """Convert applicable model parameters to fp16"""
  376. def _convert_weights_to_fp16(l):
  377. if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
  378. l.weight.data = l.weight.data.half()
  379. if l.bias is not None:
  380. l.bias.data = l.bias.data.half()
  381. if isinstance(l, nn.MultiheadAttention):
  382. for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
  383. tensor = getattr(l, attr)
  384. if tensor is not None:
  385. tensor.data = tensor.data.half()
  386. for name in ["text_projection", "proj"]:
  387. if hasattr(l, name):
  388. attr = getattr(l, name)
  389. if attr is not None:
  390. attr.data = attr.data.half()
  391. model.apply(_convert_weights_to_fp16)
  392. def build_model(state_dict: dict):
  393. vit = "visual.proj" in state_dict
  394. if vit:
  395. vision_width = state_dict["visual.conv1.weight"].shape[0]
  396. vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
  397. vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
  398. grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
  399. image_resolution = vision_patch_size * grid_size
  400. else:
  401. counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
  402. vision_layers = tuple(counts)
  403. vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
  404. output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
  405. vision_patch_size = None
  406. assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
  407. image_resolution = output_width * 32
  408. embed_dim = state_dict["text_projection"].shape[1]
  409. context_length = state_dict["positional_embedding"].shape[0]
  410. vocab_size = state_dict["token_embedding.weight"].shape[0]
  411. transformer_width = state_dict["ln_final.weight"].shape[0]
  412. transformer_heads = transformer_width // 64
  413. transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
  414. model = CLIP(
  415. embed_dim,
  416. image_resolution, vision_layers, vision_width, vision_patch_size,
  417. context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
  418. )
  419. for key in ["input_resolution", "context_length", "vocab_size"]:
  420. if key in state_dict:
  421. del state_dict[key]
  422. convert_weights(model)
  423. model.load_state_dict(state_dict)
  424. return model.eval()