ViT的极简pytorch实现及其即插即用

先放一张ViT的网络图
(此处原为ViT网络结构示意图,图片未能保留)
可以看到,ViT把图像分割成小块(patch),像NLP中句子的token那样按顺序送入Transformer编码器,最后经过MLP分类头输出类别。每个小块大小为16x16,展平后经过Linear Projection of Flattened Patches得到patch embedding;然后在整个序列的开头拼接一个cls token,并给每个token加上位置信息,也就是position embedding。
去掉数据读取部分,直接上一个极简的ViT代码:

import torch
from torch import nn

from einops import rearrange, repeat
from einops.layers.torch import Rearrange


# helpers

def pair(t):
    """Return ``t`` unchanged when it is a tuple, otherwise ``(t, t)``."""
    return t if isinstance(t, tuple) else (t, t)


# classes

class PreNorm(nn.Module):
    """Apply ``nn.LayerNorm(dim)`` to the input, then call the wrapped ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        # Extra keyword arguments are forwarded to the wrapped module as-is.
        return self.fn(self.norm(x), **kwargs)


class FeedForward(nn.Module):
    """Two-layer MLP ``dim -> hidden_dim -> dim`` with GELU and dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        dim: size of the last dimension of the input and of the output.
        heads: number of attention heads.
        dim_head: width of each head; the q/k/v width is ``heads * dim_head``.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        # The output projection is skipped only for a single head whose width
        # already equals ``dim``.
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout),
        ) if project_out else nn.Identity()

    def forward(self, x):
        # Split the fused projection into a 3-tuple (q, k, v) along the last
        # dimension. In the demo below x is (1, 197, 1024) and each chunk is
        # (1, 197, 1024) as well, because heads * dim_head = 16 * 64 = 1024.
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(
            lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv
        )

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')  # merge the heads back
        return self.to_out(out)


class Transformer(nn.Module):
    """Stack of ``depth`` pre-norm (attention, feed-forward) residual layers."""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout)),
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x  # residual connection around attention
            x = ff(x) + x    # residual connection around the MLP
        return x


class ViT(nn.Module):
    """Minimal Vision Transformer classifier.

    All arguments are keyword-only.

    Args:
        image_size: int or ``(height, width)`` tuple of the input image.
        patch_size: int or ``(height, width)`` tuple of one patch.
        num_classes: number of output logits.
        dim: token embedding width.
        depth: number of transformer layers.
        heads: number of attention heads.
        mlp_dim: hidden width of each feed-forward block.
        pool: ``'cls'`` (use the cls token) or ``'mean'`` (average all tokens).
        channels: number of input image channels.
        dim_head: width of each attention head.
        dropout: dropout probability inside the transformer.
        emb_dropout: dropout probability applied to the embedded tokens.

    Raises:
        AssertionError: if the image size is not divisible by the patch size,
            or ``pool`` is neither ``'cls'`` nor ``'mean'``.
    """

    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads,
                 mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0.,
                 emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)   # demo: 224 x 224
        patch_height, patch_width = pair(patch_size)   # demo: 16 x 16

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            # demo: (b, 3, 224, 224) -> (b, 196, 768); 14*14 = 196, 16*16*3 = 768
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),    # demo: (b, 196, 1024)
        )

        # One extra position for the cls token that is prepended in forward().
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes),
        )

    def forward(self, img):
        """Return class logits of shape ``(batch, num_classes)`` for ``img``.

        Shape comments below refer to the demo configuration at the bottom of
        this script (a 1 x 3 x 224 x 224 input and ``dim = 1024``).
        """
        x = self.to_patch_embedding(img)        # (1, 3, 224, 224) -> (1, 196, 1024)
        b, n, _ = x.shape                       # 1, 196

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b)    # (1, 1, 1024)
        x = torch.cat((cls_tokens, x), dim=1)   # (1, 197, 1024)
        x += self.pos_embedding[:, :(n + 1)]    # (1, 197, 1024)
        x = self.dropout(x)                     # (1, 197, 1024)

        x = self.transformer(x)                 # (1, 197, 1024)

        # Either average every token or keep only the cls token.
        x = x.mean(dim=1) if self.pool == 'mean' else x[:, 0]     # (1, 1024)

        x = self.to_latent(x)      # (1, 1024)
        return self.mlp_head(x)    # (1, 1000)


if __name__ == '__main__':
    v = ViT(
        image_size=224,
        patch_size=16,
        num_classes=1000,
        dim=1024,
        depth=6,
        heads=16,
        mlp_dim=2048,
        dropout=0.1,
        emb_dropout=0.1,
    )

    img = torch.randn(1, 3, 224, 224)

    preds = v(img)        # (1, 1000)
    print(preds.shape)

去掉cls token和最后的全连接分类头,并把Transformer的输出重新排列回特征图,就变成输入输出形状相同的即插即用模块:

import torch
from torch import nn

from einops import rearrange
from einops.layers.torch import Rearrange


# helpers

def pair(t):
    """Return ``t`` unchanged when it is a tuple, otherwise ``(t, t)``."""
    return t if isinstance(t, tuple) else (t, t)


# classes

class PreNorm(nn.Module):
    """Apply ``nn.LayerNorm(dim)`` to the input, then call the wrapped ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        # Extra keyword arguments are forwarded to the wrapped module as-is.
        return self.fn(self.norm(x), **kwargs)


class FeedForward(nn.Module):
    """Two-layer MLP ``dim -> hidden_dim -> dim`` with GELU and dropout."""

    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        dim: size of the last dimension of the input and of the output.
        heads: number of attention heads.
        dim_head: width of each head; the q/k/v width is ``heads * dim_head``.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        # The output projection is skipped only for a single head whose width
        # already equals ``dim``.
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout),
        ) if project_out else nn.Identity()

    def forward(self, x):
        # Split the fused projection into a 3-tuple (q, k, v) along the last
        # dimension; each chunk is (batch, tokens, heads * dim_head).
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(
            lambda t: rearrange(t, 'b n (h d) -> b h n d', h=self.heads), qkv
        )

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')  # merge the heads back
        return self.to_out(out)


class Transformer(nn.Module):
    """Stack of ``depth`` pre-norm (attention, feed-forward) residual layers."""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout)),
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x  # residual connection around attention
            x = ff(x) + x    # residual connection around the MLP
        return x


class ViT(nn.Module):
    """Plug-and-play ViT block: takes a feature map, returns one of equal shape.

    The input is reduced to ``hidden_channels`` by a 1x1 convolution, cut into
    patches, run through a Transformer (no cls token, no classification head),
    rearranged back into a feature map and restored to the input channel
    count by a second 1x1 convolution.

    All arguments are keyword-only.

    Args:
        image_size: ``(channels, height, width)`` of the input feature map.
        patch_size: int or ``(height, width)`` tuple of one patch.
        dim: token embedding width.
        depth: number of transformer layers.
        heads: number of attention heads.
        mlp_dim: hidden width of each feed-forward block.
        dim_head: width of each attention head.
        dropout: dropout probability inside the transformer.
        emb_dropout: dropout probability applied to the embedded tokens.
        hidden_channels: channel count after the input 1x1 convolution.
            Defaults to 64, the value that used to be hard-coded.

    Raises:
        AssertionError: if height/width are not divisible by the patch size.
    """

    def __init__(self, *, image_size, patch_size, dim=1024, depth=3, heads=16,
                 mlp_dim=2048, dim_head=64, dropout=0.1, emb_dropout=0.1,
                 hidden_channels=64):
        super().__init__()
        channels, image_height, image_width = image_size   # demo: 256, 64, 80
        patch_height, patch_width = pair(patch_size)       # demo: 4 x 4

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        grid_height = image_height // patch_height         # demo: 16
        grid_width = image_width // patch_width            # demo: 20
        num_patches = grid_height * grid_width             # demo: 16 * 20 = 320
        patch_dim = hidden_channels * patch_height * patch_width    # demo: 64 * 4 * 4 = 1024

        # Use the channel count taken from image_size instead of a literal 256,
        # so inputs with a different number of channels work as well.
        self.conv1 = nn.Conv2d(channels, hidden_channels, 1)

        self.to_patch_embedding = nn.Sequential(
            # demo: (b, 64, 64, 80) -> (b, 320, 1024)
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),    # demo: (b, 320, 1024)
        )

        self.to_img = nn.Sequential(
            # Inverse of the patch rearrangement; demo: (b, 320, 1024) -> (b, 64, 64, 80)
            Rearrange('b (h w) (p1 p2 c) -> b c (h p1) (w p2)',
                      p1=patch_height, p2=patch_width, h=grid_height, w=grid_width),
            nn.Conv2d(hidden_channels, channels, 1),      # demo: (b, 64, 64, 80) -> (b, 256, 64, 80)
        )

        # Position embedding: one vector per patch (there is no cls token here).
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        # to_img needs tokens of width patch_dim. When dim differs, project
        # back first; when they match (as in the demo) this is a no-op and adds
        # no parameters, so existing checkpoints keep loading.
        self.to_patch_dim = nn.Identity() if dim == patch_dim else nn.Linear(dim, patch_dim)

    def forward(self, img):
        """Return a tensor with the same shape as ``img``.

        Shape comments refer to the demo at the bottom of this script
        (a 1 x 256 x 64 x 80 input with the default arguments).
        """
        x = self.conv1(img)                     # (1, 256, 64, 80) -> (1, 64, 64, 80)
        x = self.to_patch_embedding(x)          # (1, 320, 1024)
        n = x.shape[1]                          # 320

        x = x + self.pos_embedding[:, :n]       # (1, 320, 1024)
        x = self.dropout(x)                     # (1, 320, 1024)

        x = self.transformer(x)                 # (1, 320, 1024)

        x = self.to_patch_dim(x)                # (1, 320, 1024)
        return self.to_img(x)                   # (1, 256, 64, 80)


if __name__ == '__main__':
    v = ViT(image_size=(256, 64, 80), patch_size=4)

    img = torch.randn(1, 256, 64, 80)

    preds = v(img)         # (1, 256, 64, 80)
    print(preds.shape)

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.hqwc.cn/news/308799.html

如若内容造成侵权/违法违规/事实不符,请联系编程知识网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!

相关文章

数模混合SoC芯片中LEF2Milkyway的golden flow

在数模混合芯片中的项目中,特别是数字模块很少甚至只有一个简单的数字控制逻辑时,我们要做数字模块的后端实现时,通常模拟那边会问我们实现需要他们提供哪些数据。 通常来说,我们可以让模拟设计提供数字模块的GDS或LEF文件即可。…

tcp/ip实现两个手机之间连接同步显示

app主界面 选择一:TCP客户端 选择二:TCP服务端 点击下图item时进入曲线绘制页面 如果是服务器端它不需要连任何设备就可以直接进入绘制界面如果是TCP的话就不能直接进入,否则就会提示未连接网络连接不能放在主线程,页面去调方法&…

2023年末,软件测试面试题总结与分享

大家好,最近有不少小伙伴在后台留言,得准备年后面试了,又不知道从何下手!为了帮大家节约时间,特意准备了一份面试相关的资料,内容非常的全面,真的可以好好补一补,希望大家在都能拿到…

Oracle VirtualBox中Linux系统基本使用方法——备赛笔记——2024全国职业院校技能大赛“大数据应用开发”赛项

前言 小北的这篇博客介绍的是关于用VirtualBox中下载好的ubuntu系统中,为搭建Hadoop平台所做的命令操作准备,希望能帮助不会的人有指引作用。 没有安装好VirtualBox中的ubuntu系统以及创建Hadoop账户的请参考小北之前的三篇博客: ubuntu18…

探索大型预训练模型:解析人工智能的通用知识引擎

目录 前言1 大型预训练模型的演进与重要性1.1 Word2Vec1.2 Transformer1.3 GPT模型 2 大型预训练模型的发展趋势2.1 参数规模与速度的飞跃提升2.2 数据量的持续增长2.3 知识丰富性与少样本学习的突破 3 大型预训练模型的核心机制结语 前言 在当今迅猛发展的人工智能领域&#…

C++系列-第1章顺序结构-4-整型int

C++系列-第1章顺序结构-4-整型int 在线练习: http://noi.openjudge.cn/ https://www.luogu.com.cn/ 总结 本文是C++系列博客,主要讲述整型int的用法 整型int 在C++中,int 是一个关键字,用于声明整型变量。int 类型用于存储整数…

基于流程挖掘的保险理赔优化策略实践

引言 在当今日益竞争的商业环境中,保险公司面临着日益增长的业务量和客户期望的挑战。特别是在理赔领域,理赔是保险行业的重要环节,也是保险公司和客户之间最直接的联系点。然而,长周期和繁琐的理赔流程常常给保险公司和投保人带来困扰。因此,如何提供准确且高效的理赔处…

(self-supervised learning)Event Camera Data Pre-training

Publisher: ICCV 2023 MOTIVATION OF READING: 自监督学习、稀疏事件 NILM link: https://arxiv.org/pdf/2301.01928.pdf Code: GitHub - Yan98/Event-Camera-Data-Pre-training 1. Overview Contributions are summarized as follows: 1. A self-supervised framework f…

SONiC和ONL所依赖的Debian版本说明

Debian 的最新几个版本 下一代 Debian 正式发行版的代号为 trixie — 测试(testing)版 Debian 12 (bookworm) — 当前的稳定(stable)版 Debian 11 (bullseye) — 当前的旧的稳定(oldstable)版 Debian 10&a…

OR-NeRF论文笔记

OR-NeRF论文笔记 文章目录 OR-NeRF论文笔记论文概述Abstract1 Introduction2 Related Work3 Background4 Method4.1 Multiview Segmentation4.2 Scene Object Removal 5 ExperimentsDatasetsMetricsMultiview SegmentationScene Object Removal 6 Conclusion 论文概述 目的&am…

1panel使用指南(一)面板安装

一、1panel简介 1Panel是杭州飞致云信息科技有限公司推出的产品 [1],帮助用户实现快速建站。 [2]是一款现代化、开源的Linux服务器运维管理面板,于2023年3月推出,深度集成WordPress和Halo,一键完成域名绑定、SSL证书配置等操作&a…

懒加载的el-tree中没有了子节点之后还是有前面icon箭头的展示,如何取消没有子节点之后的箭头显示

没有特别多的数据 <template><el-tree:props"props":load"loadNode"lazyshow-checkbox></el-tree></template><script>export default {data() {return {props: {label: name,children: zones,isLeaf:"leaf",//关…