Week N4: Chinese Text Classification with PyTorch

> - **🍨 This article is a learning-record blog from the [🔗365天深度学习训练营](https://mp.weixin.qq.com/s/rbOOmire8OocQ90QM78DRA)**
> - **🍖 Original author: [K同学啊 | 接辅导、项目定制](https://mtyjkh.blog.csdn.net/)**

```python
# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, datasets
import os, PIL, pathlib, warnings

warnings.filterwarnings("ignore")  # suppress warnings

# Windows 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pandas as pd

# Load the custom Chinese dataset
train_data = pd.read_csv('./data/train.csv', sep='\t', header=None)
train_data.head()
```
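As a quick sanity check of the assumed file layout (tab-separated, no header, column 0 = text, column 1 = intent label), the sketch below prints the shape and the number of distinct labels; the expected values in the comments are my reading of this dataset, not output from the original post:

```python
# Illustrative check of the assumed ./data/train.csv layout ("text<TAB>label" per row)
print(train_data.shape)         # expected: (num_samples, 2)
print(train_data[1].nunique())  # expected: 12 distinct intent labels
```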
```python
# Build an iterator over the dataset
def coustom_data_iter(texts, labels):
    for x, y in zip(texts, labels):
        yield x, y

train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import jieba

# Chinese word segmentation
tokenizer = jieba.lcut

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # fall back to <unk> for out-of-vocabulary words
```
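A quick check (a sketch of mine; the sample word is made up) of what `set_default_index` does: looking up a word that never appeared in the training text returns the `<unk>` index instead of raising an error.

```python
print(vocab["<unk>"])         # 0 by default, since "<unk>" is the first special token
print(vocab(["不存在的词"]))  # an unseen word maps to the <unk> index, e.g. [0]
```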
```python
vocab(['我','想','看','和平','精英','上','战神','必备','技巧','的','游戏','视频'])

label_name = list(set(train_data[1].values[:]))  # note: set() order varies between runs
print(label_name)
# ['TVProgram-Play', 'Other', 'Radio-Listen', 'FilmTele-Play', 'Weather-Query',
#  'Calendar-Query', 'Audio-Play', 'Travel-Query', 'Video-Play',
#  'HomeAppliance-Control', 'Music-Play', 'Alarm-Update']

text_pipeline  = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: label_name.index(x)

print(text_pipeline('我想看和平精英上战神必备技巧的游戏视频'))
print(label_pipeline('Video-Play'))

from torch.utils.data import DataLoader

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        # label list
        label_list.append(label_pipeline(_label))
        # text list
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        # offsets: the number of tokens in each sentence
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list  = torch.cat(text_list)
    # cumulative sum along dim 0 turns the lengths into each bag's start position
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    return text_list.to(device), label_list.to(device), offsets.to(device)

# DataLoader call example (the loaders actually used for training are built below)
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
```
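To make the `offsets` bookkeeping concrete, here is a minimal sketch with two made-up token-id sequences (not from the real vocabulary), showing the tensors `collate_batch` would hand to `nn.EmbeddingBag`:

```python
import torch

# Two "sentences" of lengths 3 and 2, already mapped to made-up token ids
a = torch.tensor([4, 17, 9], dtype=torch.int64)
b = torch.tensor([2, 33], dtype=torch.int64)

text = torch.cat([a, b])  # tensor([ 4, 17,  9,  2, 33])
lengths = [a.size(0), b.size(0)]
offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)  # tensor([0, 3])
# EmbeddingBag reads text[0:3] as bag 0 and text[3:] as bag 1,
# producing one pooled embedding per sentence.
```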
```python
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size,  # vocabulary size
                                         embed_dim,   # embedding dimension
                                         sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)  # initialize weights
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()  # zero the bias

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

num_class  = len(label_name)
vocab_size = len(vocab)
em_size    = 64
```
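For intuition about the model's single `EmbeddingBag` layer: by default it mean-pools the embeddings of all tokens in a bag, so each variable-length sentence becomes one fixed-size vector before the linear classifier. A small sketch (made-up sizes, not the trained model) confirming the equivalence:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
bag = nn.EmbeddingBag(10, 4)          # default mode='mean'
ids = torch.tensor([1, 2, 3], dtype=torch.int64)
pooled = bag(ids, torch.tensor([0]))  # one bag covering all three tokens
manual = bag.weight[ids].mean(dim=0)  # mean of the three embedding rows
print(torch.allclose(pooled.squeeze(0), manual))  # True
```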
```python
model = TextClassificationModel(vocab_size, em_size, num_class).to(device)

import time

def train(dataloader):
    model.train()  # switch to training mode
    total_acc, train_loss, total_count = 0, 0, 0
    log_interval = 50
    start_time = time.time()

    for idx, (text, label, offsets) in enumerate(dataloader):
        predicted_label = model(text, offsets)

        optimizer.zero_grad()                     # reset gradients
        loss = criterion(predicted_label, label)  # loss between prediction and ground truth
        loss.backward()                           # backpropagation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # gradient clipping
        optimizer.step()                          # update parameters

        # track accuracy and loss
        total_acc   += (predicted_label.argmax(1) == label).sum().item()
        train_loss  += loss.item()
        total_count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:1d} | {:4d}/{:4d} batches '
                  '| train_acc {:4.3f} train_loss {:4.5f}'.format(
                      epoch, idx, len(dataloader),
                      total_acc / total_count, train_loss / total_count))
            total_acc, train_loss, total_count = 0, 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()  # switch to evaluation mode
    total_acc, train_loss, total_count = 0, 0, 0

    with torch.no_grad():
        for idx, (text, label, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)  # compute the loss

            # track metrics on the evaluation data
            total_acc   += (predicted_label.argmax(1) == label).sum().item()
            train_loss  += loss.item()
            total_count += label.size(0)

    return total_acc / total_count, train_loss / total_count

from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS     = 10  # number of epochs
LR         = 5   # learning rate
BATCH_SIZE = 64  # batch size for training

criterion  = torch.nn.CrossEntropyLoss()
optimizer  = torch.optim.SGD(model.parameters(), lr=LR)
scheduler  = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

# Rebuild the dataset iterator (the earlier one was exhausted while building the vocabulary)
train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
```
```python
train_dataset = to_map_style_dataset(train_iter)
split_train_, split_valid_ = random_split(train_dataset,
                                          [int(len(train_dataset) * 0.8),
                                           int(len(train_dataset) * 0.2)])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    val_acc, val_loss = evaluate(valid_dataloader)

    # current learning rate
    lr = optimizer.state_dict()['param_groups'][0]['lr']

    # decay the learning rate only when validation accuracy stops improving
    if total_accu is not None and total_accu > val_acc:
        scheduler.step()
    else:
        total_accu = val_acc

    print('-' * 69)
    print('| epoch {:1d} | time: {:4.2f}s | '
          'valid_acc {:4.3f} valid_loss {:4.3f} | lr {:4.6f}'.format(
              epoch, time.time() - epoch_start_time, val_acc, val_loss, lr))
    print('-' * 69)

test_acc, test_loss = evaluate(valid_dataloader)
print('模型准确率为:{:5.4f}'.format(test_acc))

def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))  # offsets [0]: the whole sequence is one bag
        return output.argmax(1).item()
```
```python
# ex_text_str = "随便播放一首专辑阁楼里的佛里的歌"
ex_text_str = "还有双鸭山到淮阴的汽车票吗13号的"

model = model.to("cpu")

print("该文本的类别是:%s" % label_name[predict(ex_text_str, text_pipeline)])
```

The run results are shown below:

```
| epoch 1 |   50/ 152 batches | train_acc 0.453 train_loss 0.03016
| epoch 1 |  100/ 152 batches | train_acc 0.696 train_loss 0.01937
| epoch 1 |  150/ 152 batches | train_acc 0.760 train_loss 0.01392
---------------------------------------------------------------------
| epoch 1 | time: 1.15s | valid_acc 0.795 valid_loss 0.012 | lr 5.000000
---------------------------------------------------------------------
| epoch 2 |   50/ 152 batches | train_acc 0.813 train_loss 0.01067
| epoch 2 |  100/ 152 batches | train_acc 0.836 train_loss 0.00929
| epoch 2 |  150/ 152 batches | train_acc 0.850 train_loss 0.00823
---------------------------------------------------------------------
| epoch 2 | time: 1.03s | valid_acc 0.847 valid_loss 0.008 | lr 5.000000
---------------------------------------------------------------------
| epoch 3 |   50/ 152 batches | train_acc 0.874 train_loss 0.00688
| epoch 3 |  100/ 152 batches | train_acc 0.882 train_loss 0.00648
| epoch 3 |  150/ 152 batches | train_acc 0.889 train_loss 0.00610
---------------------------------------------------------------------
| epoch 3 | time: 1.03s | valid_acc 0.865 valid_loss 0.007 | lr 5.000000
---------------------------------------------------------------------
| epoch 4 |   50/ 152 batches | train_acc 0.905 train_loss 0.00530
| epoch 4 |  100/ 152 batches | train_acc 0.914 train_loss 0.00464
| epoch 4 |  150/ 152 batches | train_acc 0.913 train_loss 0.00478
---------------------------------------------------------------------
| epoch 4 | time: 1.03s | valid_acc 0.882 valid_loss 0.006 | lr 5.000000
---------------------------------------------------------------------
| epoch 5 |   50/ 152 batches | train_acc 0.933 train_loss 0.00389
| epoch 5 |  100/ 152 batches | train_acc 0.940 train_loss 0.00346
| epoch 5 |  150/ 152 batches | train_acc 0.928 train_loss 0.00410
---------------------------------------------------------------------
| epoch 5 | time: 1.05s | valid_acc 0.889 valid_loss 0.006 | lr 5.000000
---------------------------------------------------------------------
| epoch 6 |   50/ 152 batches | train_acc 0.956 train_loss 0.00275
| epoch 6 |  100/ 152 batches | train_acc 0.945 train_loss 0.00306
| epoch 6 |  150/ 152 batches | train_acc 0.943 train_loss 0.00321
---------------------------------------------------------------------
| epoch 6 | time: 1.03s | valid_acc 0.893 valid_loss 0.006 | lr 5.000000
---------------------------------------------------------------------
| epoch 7 |   50/ 152 batches | train_acc 0.962 train_loss 0.00231
| epoch 7 |  100/ 152 batches | train_acc 0.962 train_loss 0.00240
| epoch 7 |  150/ 152 batches | train_acc 0.962 train_loss 0.00237
---------------------------------------------------------------------
| epoch 7 | time: 1.01s | valid_acc 0.898 valid_loss 0.005 | lr 5.000000
---------------------------------------------------------------------
| epoch 8 |   50/ 152 batches | train_acc 0.971 train_loss 0.00203
| epoch 8 |  100/ 152 batches | train_acc 0.978 train_loss 0.00170
| epoch 8 |  150/ 152 batches | train_acc 0.971 train_loss 0.00183
---------------------------------------------------------------------
| epoch 8 | time: 1.02s | valid_acc 0.898 valid_loss 0.005 | lr 5.000000
---------------------------------------------------------------------
| epoch 9 |   50/ 152 batches | train_acc 0.983 train_loss 0.00142
| epoch 9 |  100/ 152 batches | train_acc 0.980 train_loss 0.00145
| epoch 9 |  150/ 152 batches | train_acc 0.978 train_loss 0.00151
---------------------------------------------------------------------
| epoch 9 | time: 1.01s | valid_acc 0.900 valid_loss 0.005 | lr 5.000000
---------------------------------------------------------------------
| epoch 10 |   50/ 152 batches | train_acc 0.987 train_loss 0.00116
| epoch 10 |  100/ 152 batches | train_acc 0.985 train_loss 0.00117
| epoch 10 |  150/ 152 batches | train_acc 0.986 train_loss 0.00111
---------------------------------------------------------------------
| epoch 10 | time: 1.01s | valid_acc 0.903 valid_loss 0.005 | lr 5.000000
---------------------------------------------------------------------
模型准确率为:0.9033
该文本的类别是:Travel-Query
```
