Baidu Dianshi Cup Sentiment Polarity Analysis (modified: dual head, multi-label prediction of aspect and sentiment polarity)
Task Description
The original competition task is a three-way classification of sentiment polarity ("positive", "negative", "neutral"), and the comments cover five aspects: food and catering, travel and lodging, financial services, medical services, and logistics/express delivery. The task can therefore be recast as a multi-label, multi-class (fine-grained) sentiment analysis problem.
Related references:
- For comparison:
DataFountain weather and time classification: CNN multi-label classification, accuracy 0.92
- Dual head: a custom model with two heads, one predicting the aspect and the other the sentiment polarity.
Approach
1. Modify an existing pretrained model to output through two heads, turning the original sentiment classification task into a multi-label task that jointly predicts the aspect (5 classes) and the sentiment polarity (3 classes);
2. Try prompt learning (is_prompt=True) by constructing the template "[X]是[Y]方面的评论" ("[X] is a comment about the [Y] aspect"), see the sketch after this list (note: the improvement was not significant);
3. Try the Focal Loss loss function (unbalance = 'Focal_loss') (note: it did not help).
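A minimal sketch of the prompt construction used when is_prompt=True, assuming the aspect field is still the raw string (in the pipeline below the aspects are label-encoded first and the dataloaders are built with is_prompt=False). The record is hypothetical and only illustrates the template:
# Hypothetical record; illustrates the "[X]是[Y]方面的评论" template only.
example = {"comment": "快递很快,包装也完好", "types": "物流快递"}
prompt_text = example["comment"] + "是" + example["types"] + "方面的评论"
print(prompt_text)  # 快递很快,包装也完好是物流快递方面的评论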
!pip install -U paddlenlp
import os
import json
import random
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import paddle
import paddlenlp
import paddle.nn.functional as F
from functools import partial
from paddlenlp.data import Stack, Dict, Pad, Tuple
from paddlenlp.datasets import load_dataset
import paddle.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from paddlenlp.transformers import *
init_from_ckpt=None
# Choose which pretrained language model to load
# ernie-3.0-xbase-zh
# ernie-3.0-base-zh
# ernie-3.0-medium-zh
# ernie-3.0-mini-zh
# ernie-3.0-micro-zh
# ernie-3.0-nano-zh
MODEL_NAME = 'ernie-3.0-nano-zh'
max_seq_length = 96
batch_size=64
# Peak learning rate during training
learning_rate = 5e-5
# Number of training epochs
epochs = 10
# Whether to use prompt learning (note: the dataloaders below are built with is_prompt=False, so this flag is illustrative here)
is_prompt=True
# Loss function setting
unbalance = 'Focal_loss' # None , Focal_loss
focalloss_alpha = 0.5
focalloss_gamma = 2
# Warmup proportion of the learning-rate schedule
warmup_proportion = 0.1
# Learning-rate decay proportion (only referenced by the commented-out CosineAnnealingWithWarmupDecay scheduler below)
decay_proportion = 0.2
# Weight decay coefficient; a regularization strategy to reduce overfitting
weight_decay = 0.01
# Gradient clipping: if the global L2 norm of the gradients exceeds max_grad_norm, they are scaled down proportionally
max_grad_norm = 1.0
# Directory for saving model parameters after training
save_dir = "checkpoint/{}-{}".format(MODEL_NAME.replace('/','-'),int(time.time()))
1 Data Loading and EDA
1.1 Load the data and normalize the format
train_columns = ['id', 'types', 'comment', 'labels']
train = pd.read_csv('data/data173029/data_train.csv',sep='\t', names=train_columns, encoding='utf-8')
train
test_columns = ['id', 'types', 'comment']
test = pd.read_csv('data/data173029/data_test.csv',sep='\t', names=test_columns, encoding='utf-8')
test
# Strip whitespace and control characters from the comments.
def clean_text(s):
    for ch in ['\t', '\n', '\u3000', '\xa0', '\r', ' ']:
        s = s.replace(ch, '')
    return s

train['comment'] = train['comment'].astype(str).apply(clean_text)
test['comment'] = test['comment'].astype(str).apply(clean_text)
train['comment_len'] = [len(row) for row in train['comment']]
test['comment_len'] = [len(row) for row in test['comment']]
1.2 Quick data analysis
# Comment length analysis
for rate in [0.5,0.75,0.9,0.95,0.99]:
    print("In the training data, {:.0f}% of comments are no longer than {:.2f} characters".format(rate*100,train['comment_len'].quantile(rate)))
plt.title("text length")
sns.distplot(train['comment_len'],bins=10,color='r')
sns.distplot(test['comment_len'],bins=10,color='g')
plt.show()
type_list = train["types"].unique()
# Mapping between aspect categories and ids
id2type= {k: v for k, v in enumerate(type_list)}
type2id = {v: k for k, v in enumerate(type_list)}
train['types'] = train['types'].map(type2id)
test['types'] = test['types'].map(type2id)
# Plot the distribution of the aspect label (types)
# plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese characters render
# plt.rcParams['axes.unicode_minus'] = False    # avoid minus signs rendering as boxes in saved figures
# sns.set(font='SimHei')                        # make seaborn render Chinese text
plt.title("types distribution")
sns.countplot(y='types',data=train)
label_list = train["labels"].unique()
# Plot the distribution of the sentiment polarity label (labels)
plt.title("labels distribution")
sns.countplot(y='labels',data=train)
1.3 Findings
- The training set has 82,025 rows and the test set has 35,157.
- Most comment lengths cluster around 70 characters; only a tiny fraction exceed 250.
- There are 3 sentiment polarity labels with a highly imbalanced distribution, and 5 aspects with a roughly balanced distribution (a quick numeric check follows).
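A quick numeric check of the class balance (a sketch using the dataframes loaded above):
# Inspect the class distributions behind the findings above.
print(train['labels'].value_counts(normalize=True))  # sentiment polarity: highly imbalanced
print(train['types'].value_counts(normalize=True))   # aspect: roughly balanced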
2 Data Processing
2.1 Train/dev split
# Create data generators
def read(df,istrain=True):
if istrain:
for _,data in df.iterrows():
yield {
"comment":data['comment'],
"types":data['types'],
"labels":data['labels']
}
else:
for _,data in df.iterrows():
yield {
"comment":data['comment'],
"types":data['types']
}
# Pass the generators to load_dataset
train,dev = train_test_split(train,test_size=0.2,random_state=80471)
train_ds = load_dataset(read, df=train, lazy=False)
dev_ds = load_dataset(read, df=dev, lazy=False)
2.2 Tokenization and encoding
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Define the example conversion function
def convert_example(example, tokenizer, max_seq_length=512, is_test=False, is_prompt=False):
    if is_prompt:
        # Prompt template "[X]是[Y]方面的评论"; note this expects example["types"]
        # to still be the raw aspect string, not the label-encoded id.
        text = example["comment"]+"是"+example["types"]+"方面的评论"
    else:
        text = example["comment"]
encoded_inputs = tokenizer(text=text,
max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
if not is_test:
types = np.array(example["types"], dtype="float32")
labels = np.array(example["labels"], dtype="float32")
return input_ids, token_type_ids, types, labels
return input_ids, token_type_ids
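A hedged usage example of convert_example on a single hypothetical record (types and labels are already label-encoded integers; the tokenizer is the one loaded above):
# Hypothetical record, for illustration only.
sample = {"comment": "服务态度很好", "types": 0, "labels": 1}
input_ids, token_type_ids, types, labels = convert_example(
    sample, tokenizer, max_seq_length=max_seq_length, is_test=False, is_prompt=False)
print(len(input_ids), types, labels)  # number of tokens plus the two targets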
def create_dataloader(dataset,
mode='train',
batch_size=1,
batchify_fn=None,
trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == 'train' else False
if mode == 'train':
batch_sampler = paddle.io.DistributedBatchSampler(
dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(
dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(
dataset=dataset,
batch_sampler=batch_sampler,
collate_fn=batchify_fn,
return_list=True)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
is_test=False,
is_prompt=False)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
Stack(dtype="int32"), # types
Stack(dtype="int32") # labels
): [data for data in fn(samples)]
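batchify_fn pads input_ids and token_type_ids to the longest sequence in the batch and stacks the two targets into int32 arrays. A minimal sketch of what it produces for two hypothetical samples of different lengths:
# Collate two converted samples by hand (the dataloader does this automatically).
s1 = convert_example({"comment": "很好", "types": 0, "labels": 1},
                     tokenizer, max_seq_length=max_seq_length)
s2 = convert_example({"comment": "物流太慢了,不满意", "types": 1, "labels": 0},
                     tokenizer, max_seq_length=max_seq_length)
ids, segs, types, labels = batchify_fn([s1, s2])
print(ids.shape, segs.shape, types.shape, labels.shape)  # padded to the longer sequence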
# Build the training dataloader
train_data_loader = create_dataloader(
train_ds,
mode='train',
batch_size=batch_size,
batchify_fn=batchify_fn,
    trans_fn=trans_func)
# Build the validation dataloader
dev_data_loader=create_dataloader(
dev_ds,
mode='dev',
batch_size=batch_size,
batchify_fn=batchify_fn,
    trans_fn=trans_func)
3 Model Construction
Pretrained model + a dropout layer and two fully connected heads
pretrained_model = AutoModel.from_pretrained(MODEL_NAME)
class EmotionClassifier(nn.Layer):
def __init__(self, pretrained_model,num_types,num_labels,dropout=None):
super().__init__()
self.ptm = pretrained_model
self.num_types = num_types
self.num_labels = num_labels
self.dropout = nn.Dropout(dropout if dropout is not None else
self.ptm.config["hidden_dropout_prob"])
self.fc1 = nn.Linear(self.ptm.config["hidden_size"], self.num_types)
self.fc2 = nn.Linear(self.ptm.config["hidden_size"], self.num_labels)
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
_, pooled_output = self.ptm(
input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask)
pooled_output = self.dropout(pooled_output)
logit_types = self.fc1(pooled_output)
logit_labels = self.fc2(pooled_output)
return logit_types,logit_labels
model = EmotionClassifier(pretrained_model, num_types=len(type_list), num_labels=len(label_list))
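A small smoke test (a sketch with dummy token ids, not part of the original notebook) to confirm that the two heads produce logits of shape [batch, 5] and [batch, 3]:
# Forward a dummy batch through the dual-head model and check output shapes.
dummy_ids = paddle.to_tensor([[1, 2, 3, 4, 2]], dtype="int64")  # hypothetical token ids
dummy_segs = paddle.zeros_like(dummy_ids)
logit_types, logit_labels = model(dummy_ids, token_type_ids=dummy_segs)
print(logit_types.shape, logit_labels.shape)  # expected: [1, 5] and [1, 3]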
4 Model Configuration
# Resume from a checkpoint if one is provided
if init_from_ckpt and os.path.isfile(init_from_ckpt):
state_dict = paddle.load(init_from_ckpt)
model.set_dict(state_dict)
# Total number of training steps
max_steps = len(train_data_loader) * epochs
# Learning-rate schedule. Warmup schedulers available in paddlenlp.transformers:
'''
__all__ = [
    'LinearDecayWithWarmup',
    'ConstScheduleWithWarmup',
    'CosineDecayWithWarmup',
    'PolyDecayWithWarmup',
    'CosineAnnealingWithWarmupDecay',
]
'''
# lr_scheduler = paddlenlp.transformers.LinearDecayWithWarmup(learning_rate=learning_rate, total_steps=max_steps, warmup=warmup_proportion)
lr_scheduler = paddlenlp.transformers.CosineDecayWithWarmup(learning_rate=learning_rate, total_steps=max_steps, warmup=warmup_proportion)
# lr_scheduler = paddlenlp.transformers.CosineDecayWithWarmup(
# learning_rate=learning_rate, total_steps=max_steps, warmup=warmup_proportion,
# with_hard_restarts=False, num_cycles=3, last_epoch=- 1, verbose=False)
# warmup_step=max_steps*warmup_proportion
# decay_step=max_steps*(1-decay_proportion)
# lr_scheduler = paddlenlp.transformers.CosineAnnealingWithWarmupDecay(max_lr=learning_rate, min_lr=1e-7, warmup_step=warmup_step, decay_step=decay_step)
# Exclude biases and normalization-layer parameters from weight decay
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
# Define the optimizer (AdamW with global-norm gradient clipping)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=paddle.nn.ClipGradByGlobalNorm(max_grad_norm))
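To see which parameters the apply_decay_param_fun excludes from weight decay (biases and normalization weights), a quick inspection sketch:
# Parameters whose names contain "bias" or "norm" receive no weight decay.
excluded = [n for n, p in model.named_parameters()
            if any(nd in n for nd in ["bias", "norm"])]
print(len(excluded), excluded[:5])  # count plus a few example names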
# Cross-entropy loss for the aspect (types) head
criterion1 = paddle.nn.loss.CrossEntropyLoss()
class FocalLoss(paddle.nn.Layer):
def __init__(self, alpha=0.5, gamma=2, num_classes=3, weight=None, ignore_index=-100):
super().__init__()
self.alpha = alpha
self.gamma = gamma
self.weight = weight if weight is not None else paddle.to_tensor(np.array([1.] * num_classes), dtype='float32')
self.ce_fn = paddle.nn.CrossEntropyLoss(
weight=self.weight, soft_label=False, ignore_index=ignore_index)
def forward(self, preds, labels):
logpt = -self.ce_fn(preds, labels)
pt = paddle.exp(logpt)
loss = -((1 - pt) ** self.gamma) * self.alpha * logpt
return loss
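Focal loss down-weights easy examples: FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t), so with gamma = 0 it reduces to alpha times the cross-entropy. Note that this implementation applies the modulating factor to the batch-mean loss rather than per example. A minimal sanity check (a sketch, not part of the original notebook):
# With gamma=0 and alpha=1.0, FocalLoss should match plain cross-entropy.
logits = paddle.to_tensor([[2.0, 0.5, -1.0], [0.1, 1.5, 0.3]])
targets = paddle.to_tensor([0, 1], dtype="int64")
ce = paddle.nn.CrossEntropyLoss()(logits, targets)
fl0 = FocalLoss(alpha=1.0, gamma=0, num_classes=3)(logits, targets)
print(float(ce), float(fl0))  # should agree up to floating-point error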
# Select the loss for the polarity (labels) head
if unbalance == "Focal_loss":
criterion2 = FocalLoss(
alpha=focalloss_alpha,
gamma=focalloss_gamma,
num_classes=len(label_list))
else:
    # Cross-entropy loss
criterion2 = paddle.nn.loss.CrossEntropyLoss()
5 Model Training
# Define the evaluation function
@paddle.no_grad()
def evaluate(model, data_loader):
"""
Given a dataset, it evals model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
criterion(obj:`paddle.nn.Layer`): It can compute the loss.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
"""
model.eval()
pred_types = []
pred_labels = []
real_types = []
real_labels = []
for batch in tqdm(data_loader):
input_ids, token_type_ids, types, labels = batch
logit_types, logit_labels = model(input_ids, token_type_ids)
prob_types = F.softmax(logit_types,axis=1)
prob_labels = F.softmax(logit_labels,axis=1)
pred_types.extend(prob_types.argmax(axis=1).numpy())
pred_labels.extend(prob_labels.argmax(axis=1).numpy())
real_types.extend(types.reshape([-1]).numpy())
real_labels.extend(labels.reshape([-1]).numpy())
    accuracy_types = accuracy_score(real_types, pred_types)
    accuracy_labels = accuracy_score(real_labels, pred_labels)
    model.train()
    return accuracy_types,accuracy_labels  # return the accuracy of each head
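f1_score is imported above but never used; given the heavy polarity imbalance, macro-averaged F1 is often more informative than accuracy. A hedged illustration (not part of the original evaluation):
# Macro-F1 on dummy predictions; inside evaluate() one could likewise report
# f1_score(real_labels, pred_labels, average="macro").
y_true = [0, 0, 0, 1, 2]
y_pred = [0, 0, 1, 1, 2]
print(f1_score(y_true, y_pred, average="macro"))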
def do_train(model, train_data_loader, dev_data_loader, criterion1, criterion2, optimizer, lr_scheduler):
model.train()
max_accuracy=0
save_dir = "./checkpoint/" + MODEL_NAME
for epoch in range(1, epochs + 1):
with tqdm(total=len(train_data_loader)) as pbar:
for step, batch in enumerate(train_data_loader, start=1):
input_ids, token_type_ids, types, labels = batch
logit_types,logit_labels = model(input_ids, token_type_ids)
loss = criterion1(logit_types,types) + criterion2(logit_labels,labels)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
pbar.set_postfix({'loss' : '%.5f' % (loss.numpy())})
pbar.update(1)
accuracy_types,accuracy_labels = evaluate(model, dev_data_loader)
print("Epoch: %d, Types Accuracy: %.5f, Labels Accuracy: %.5f" % (epoch, accuracy_types,accuracy_labels))
if not os.path.exists(save_dir):
            os.makedirs(save_dir)  # create directories recursively
print("Epoch: %d, Types Accuracy: %.5f, Labels Accuracy: %.5f" % (epoch, accuracy_types,accuracy_labels),
file=open(save_dir +'/best_model_log.txt', 'a'))
if accuracy_types*accuracy_labels >= max_accuracy:
max_accuracy = accuracy_types*accuracy_labels
save_param_path = os.path.join(save_dir, 'best_model.pdparams')
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
save_param_path = os.path.join(save_dir, 'last_model.pdparams')
paddle.save(model.state_dict(), save_param_path)
do_train(model, train_data_loader, dev_data_loader, criterion1, criterion2, optimizer, lr_scheduler)
This article is a repost.
Original project link