Baidu Dianshi Cup Sentiment Polarity Analysis (modified: dual head, multi-label prediction of aspect and sentiment polarity)
Task Description
The original competition task is a three-way classification of sentiment polarity ("positive", "negative", "neutral"), and the comments cover five aspects: food and catering, travel and lodging, financial services, medical services, and logistics/express delivery. The task can therefore be recast as a multi-label, multi-class (fine-grained) sentiment analysis problem.
Related references:
- For comparison:
DataFountain weather and time classification: CNN multi-label classification, accuracy 0.92
- Dual head: a custom model with two heads, one predicting the aspect and the other the sentiment polarity.
Approach
1. Modify an existing pretrained model to output through two heads, turning the original sentiment classification task into a multi-label task that jointly predicts the aspect (5 classes) and the sentiment polarity (3 classes);
2. Try prompt learning (is_prompt=True) by constructing the template "[X]是[Y]方面的评论" ("[X] is a comment about the [Y] aspect"), see the sketch after this list (note: the improvement was not significant);
3. Try the Focal Loss loss function (unbalance = 'Focal_loss') (note: it did not help).
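A minimal sketch of the prompt construction used when is_prompt=True, assuming the aspect field is still the raw string (in the pipeline below the aspects are label-encoded first and the dataloaders are built with is_prompt=False). The record is hypothetical and only illustrates the template:
# Hypothetical record; illustrates the "[X]是[Y]方面的评论" template only.
example = {"comment": "快递很快,包装也完好", "types": "物流快递"}
prompt_text = example["comment"] + "是" + example["types"] + "方面的评论"
print(prompt_text)  # 快递很快,包装也完好是物流快递方面的评论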
!pip install -U paddlenlp
import os
import json
import random
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import paddle
import paddlenlp
import paddle.nn.functional as F
from functools import partial
from paddlenlp.data import Stack, Dict, Pad, Tuple
from paddlenlp.datasets import load_dataset
import paddle.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from paddlenlp.transformers import *
init_from_ckpt=None
# Choose which pretrained language model to load
# ernie-3.0-xbase-zh
# ernie-3.0-base-zh
# ernie-3.0-medium-zh
# ernie-3.0-mini-zh
# ernie-3.0-micro-zh
# ernie-3.0-nano-zh
MODEL_NAME = 'ernie-3.0-nano-zh'
max_seq_length = 96
batch_size=64
# Peak learning rate during training
learning_rate = 5e-5
# Number of training epochs
epochs = 10
# Whether to use prompt learning (note: the dataloaders below are built with is_prompt=False, so this flag is illustrative here)
is_prompt=True
# Loss function setting
unbalance = 'Focal_loss' # None , Focal_loss
focalloss_alpha = 0.5
focalloss_gamma = 2
# Warmup proportion of the learning-rate schedule
warmup_proportion = 0.1
# Learning-rate decay proportion (only referenced by the commented-out CosineAnnealingWithWarmupDecay scheduler below)
decay_proportion = 0.2
# Weight decay coefficient; a regularization strategy to reduce overfitting
weight_decay = 0.01
# Gradient clipping: if the global L2 norm of the gradients exceeds max_grad_norm, they are scaled down proportionally
max_grad_norm = 1.0
# Directory for saving model parameters after training
save_dir = "checkpoint/{}-{}".format(MODEL_NAME.replace('/','-'),int(time.time()))
1 Data Loading and EDA
1.1 Load the data and normalize the format
train_columns = ['id', 'types', 'comment', 'labels']
train = pd.read_csv('data/data173029/data_train.csv',sep='\t', names=train_columns, encoding='utf-8')
train
test_columns = ['id', 'types', 'comment']
test = pd.read_csv('data/data173029/data_test.csv',sep='\t', names=test_columns, encoding='utf-8')
test
# Strip whitespace and control characters from the comments.
def clean_text(s):
    for ch in ['\t', '\n', '\u3000', '\xa0', '\r', ' ']:
        s = s.replace(ch, '')
    return s

train['comment'] = train['comment'].astype(str).apply(clean_text)
test['comment'] = test['comment'].astype(str).apply(clean_text)
train['comment_len'] = [len(row) for row in train['comment']]
test['comment_len'] = [len(row) for row in test['comment']]
1.2 Quick data analysis
# Comment length analysis
for rate in [0.5,0.75,0.9,0.95,0.99]:
    print("In the training data, {:.0f}% of comments are no longer than {:.2f} characters".format(rate*100,train['comment_len'].quantile(rate)))
plt.title("text length")
sns.distplot(train['comment_len'],bins=10,color='r')
sns.distplot(test['comment_len'],bins=10,color='g')
plt.show()
type_list = train["types"].unique()
# Mapping between aspect categories and ids
id2type= {k: v for k, v in enumerate(type_list)}
type2id = {v: k for k, v in enumerate(type_list)}
train['types'] = train['types'].map(type2id)
test['types'] = test['types'].map(type2id)
# Plot the distribution of the aspect label (types)
# plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese characters render
# plt.rcParams['axes.unicode_minus'] = False    # avoid minus signs rendering as boxes in saved figures
# sns.set(font='SimHei')                        # make seaborn render Chinese text
plt.title("types distribution")
sns.countplot(y='types',data=train)
label_list = train["labels"].unique()
# Plot the distribution of the sentiment polarity label (labels)
plt.title("labels distribution")
sns.countplot(y='labels',data=train)
1.3 Findings
- The training set has 82,025 rows and the test set has 35,157.
- Most comment lengths cluster around 70 characters; only a tiny fraction exceed 250.
- There are 3 sentiment polarity labels with a highly imbalanced distribution, and 5 aspects with a roughly balanced distribution (a quick numeric check follows).
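A quick numeric check of the class balance (a sketch using the dataframes loaded above):
# Inspect the class distributions behind the findings above.
print(train['labels'].value_counts(normalize=True))  # sentiment polarity: highly imbalanced
print(train['types'].value_counts(normalize=True))   # aspect: roughly balanced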
2 Data Processing
2.1 Train/dev split
# Create data generators
def read(df,istrain=True):
if istrain:
for _,data in df.iterrows():
yield {
"comment":data['comment'],
"types":data['types'],
"labels":data['labels']
}
else:
for _,data in df.iterrows():
yield {
"comment":data['comment'],
"types":data['types']
}
# Pass the generators to load_dataset
train,dev = train_test_split(train,test_size=0.2,random_state=80471)
train_ds = load_dataset(read, df=train, lazy=False)
dev_ds = load_dataset(read, df=dev, lazy=False)
2.2 Tokenization and encoding
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Define the example conversion function
def convert_example(example, tokenizer, max_seq_length=512, is_test=False, is_prompt=False):
    if is_prompt:
        # Prompt template "[X]是[Y]方面的评论"; note this expects example["types"]
        # to still be the raw aspect string, not the label-encoded id.
        text = example["comment"]+"是"+example["types"]+"方面的评论"
    else:
        text = example["comment"]
encoded_inputs = tokenizer(text=text,
max_seq_len=max_seq_length)
input_ids = encoded_inputs["input_ids"]
token_type_ids = encoded_inputs["token_type_ids"]
if not is_test:
types = np.array(example["types"], dtype="float32")
labels = np.array(example["labels"], dtype="float32")
return input_ids, token_type_ids, types, labels
return input_ids, token_type_ids
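A hedged usage example of convert_example on a single hypothetical record (types and labels are already label-encoded integers; the tokenizer is the one loaded above):
# Hypothetical record, for illustration only.
sample = {"comment": "服务态度很好", "types": 0, "labels": 1}
input_ids, token_type_ids, types, labels = convert_example(
    sample, tokenizer, max_seq_length=max_seq_length, is_test=False, is_prompt=False)
print(len(input_ids), types, labels)  # number of tokens plus the two targets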
def create_dataloader(dataset,
mode='train',
batch_size=1,
batchify_fn=None,
trans_fn=None):
if trans_fn:
dataset = dataset.map(trans_fn)
shuffle = True if mode == 'train' else False
if mode == 'train':
batch_sampler = paddle.io.DistributedBatchSampler(
dataset, batch_size=batch_size, shuffle=shuffle)
else:
batch_sampler = paddle.io.BatchSampler(
dataset, batch_size=batch_size, shuffle=shuffle)
return paddle.io.DataLoader(
dataset=dataset,
batch_sampler=batch_sampler,
collate_fn=batchify_fn,
return_list=True)
trans_func = partial(
convert_example,
tokenizer=tokenizer,
max_seq_length=max_seq_length,
is_test=False,
is_prompt=False)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
Stack(dtype="int32"), # types
Stack(dtype="int32") # labels
): [data for data in fn(samples)]
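batchify_fn pads input_ids and token_type_ids to the longest sequence in the batch and stacks the two targets into int32 arrays. A minimal sketch of what it produces for two hypothetical samples of different lengths:
# Collate two converted samples by hand (the dataloader does this automatically).
s1 = convert_example({"comment": "很好", "types": 0, "labels": 1},
                     tokenizer, max_seq_length=max_seq_length)
s2 = convert_example({"comment": "物流太慢了,不满意", "types": 1, "labels": 0},
                     tokenizer, max_seq_length=max_seq_length)
ids, segs, types, labels = batchify_fn([s1, s2])
print(ids.shape, segs.shape, types.shape, labels.shape)  # padded to the longer sequence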
# Build the training dataloader
train_data_loader = create_dataloader(
train_ds,
mode='train',
batch_size=batch_size,
batchify_fn=batchify_fn,
    trans_fn=trans_func)
# Build the validation dataloader
dev_data_loader=create_dataloader(
dev_ds,
mode='dev',
batch_size=batch_size,
batchify_fn=batchify_fn,
    trans_fn=trans_func)
3 Model Construction
Pretrained model + a dropout layer and two fully connected heads
pretrained_model = AutoModel.from_pretrained(MODEL_NAME)
class EmotionClassifier(nn.Layer):
def __init__(self, pretrained_model,num_types,num_labels,dropout=None):
super().__init__()
self.ptm = pretrained_model
self.num_types = num_types
self.num_labels = num_labels
self.dropout = nn.Dropout(dropout if dropout is not None else
self.ptm.config["hidden_dropout_prob"])
self.fc1 = nn.Linear(self.ptm.config["hidden_size"], self.num_types)
self.fc2 = nn.Linear(self.ptm.config["hidden_size"], self.num_labels)
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
_, pooled_output = self.ptm(
input_ids,
token_type_ids=token_type_ids,
position_ids=position_ids,
attention_mask=attention_mask)
pooled_output = self.dropout(pooled_output)
logit_types = self.fc1(pooled_output)
logit_labels = self.fc2(pooled_output)
return logit_types,logit_labels
model = EmotionClassifier(pretrained_model, num_types=len(type_list), num_labels=len(label_list))
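A small smoke test (a sketch with dummy token ids, not part of the original notebook) to confirm that the two heads produce logits of shape [batch, 5] and [batch, 3]:
# Forward a dummy batch through the dual-head model and check output shapes.
dummy_ids = paddle.to_tensor([[1, 2, 3, 4, 2]], dtype="int64")  # hypothetical token ids
dummy_segs = paddle.zeros_like(dummy_ids)
logit_types, logit_labels = model(dummy_ids, token_type_ids=dummy_segs)
print(logit_types.shape, logit_labels.shape)  # expected: [1, 5] and [1, 3]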
4 Model Configuration
# Resume from a checkpoint if one is provided
if init_from_ckpt and os.path.isfile(init_from_ckpt):
state_dict = paddle.load(init_from_ckpt)
model.set_dict(state_dict)
# Total number of training steps
max_steps = len(train_data_loader) * epochs
# Learning-rate schedule. Warmup schedulers available in paddlenlp.transformers:
'''
__all__ = [
    'LinearDecayWithWarmup',
    'ConstScheduleWithWarmup',
    'CosineDecayWithWarmup',
    'PolyDecayWithWarmup',
    'CosineAnnealingWithWarmupDecay',
]
'''
# lr_scheduler = paddlenlp.transformers.LinearDecayWithWarmup(learning_rate=learning_rate, total_steps=max_steps, warmup=warmup_proportion)
lr_scheduler = paddlenlp.transformers.CosineDecayWithWarmup(learning_rate=learning_rate, total_steps=max_steps, warmup=warmup_proportion)
# lr_scheduler = paddlenlp.transformers.CosineDecayWithWarmup(
# learning_rate=learning_rate, total_steps=max_steps, warmup=warmup_proportion,
# with_hard_restarts=False, num_cycles=3, last_epoch=- 1, verbose=False)
# warmup_step=max_steps*warmup_proportion
# decay_step=max_steps*(1-decay_proportion)
# lr_scheduler = paddlenlp.transformers.CosineAnnealingWithWarmupDecay(max_lr=learning_rate, min_lr=1e-7, warmup_step=warmup_step, decay_step=decay_step)
# Exclude biases and normalization-layer parameters from weight decay
decay_params = [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
# Define the optimizer (AdamW with global-norm gradient clipping)
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
parameters=model.parameters(),
weight_decay=weight_decay,
apply_decay_param_fun=lambda x: x in decay_params,
grad_clip=paddle.nn.ClipGradByGlobalNorm(max_grad_norm))
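To see which parameters the apply_decay_param_fun excludes from weight decay (biases and normalization weights), a quick inspection sketch:
# Parameters whose names contain "bias" or "norm" receive no weight decay.
excluded = [n for n, p in model.named_parameters()
            if any(nd in n for nd in ["bias", "norm"])]
print(len(excluded), excluded[:5])  # count plus a few example names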
# Cross-entropy loss for the aspect (types) head
criterion1 = paddle.nn.loss.CrossEntropyLoss()
class FocalLoss(paddle.nn.Layer):
def __init__(self, alpha=0.5, gamma=2, num_classes=3, weight=None, ignore_index=-100):
super().__init__()
self.alpha = alpha
self.gamma = gamma
self.weight = weight if weight is not None else paddle.to_tensor(np.array([1.] * num_classes), dtype='float32')
self.ce_fn = paddle.nn.CrossEntropyLoss(
weight=self.weight, soft_label=False, ignore_index=ignore_index)
def forward(self, preds, labels):
logpt = -self.ce_fn(preds, labels)
pt = paddle.exp(logpt)
loss = -((1 - pt) ** self.gamma) * self.alpha * logpt
return loss
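Focal loss down-weights easy examples: FL(p_t) = -alpha * (1 - p_t)^gamma * log(p_t), so with gamma = 0 it reduces to alpha times the cross-entropy. Note that this implementation applies the modulating factor to the batch-mean loss rather than per example. A minimal sanity check (a sketch, not part of the original notebook):
# With gamma=0 and alpha=1.0, FocalLoss should match plain cross-entropy.
logits = paddle.to_tensor([[2.0, 0.5, -1.0], [0.1, 1.5, 0.3]])
targets = paddle.to_tensor([0, 1], dtype="int64")
ce = paddle.nn.CrossEntropyLoss()(logits, targets)
fl0 = FocalLoss(alpha=1.0, gamma=0, num_classes=3)(logits, targets)
print(float(ce), float(fl0))  # should agree up to floating-point error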
# Select the loss for the polarity (labels) head
if unbalance == "Focal_loss":
criterion2 = FocalLoss(
alpha=focalloss_alpha,
gamma=focalloss_gamma,
num_classes=len(label_list))
else:
    # Cross-entropy loss
criterion2 = paddle.nn.loss.CrossEntropyLoss()
5 Model Training
# Define the evaluation function
@paddle.no_grad()
def evaluate(model, data_loader):
"""
Given a dataset, it evals model and computes the metric.
Args:
model(obj:`paddle.nn.Layer`): A model to classify texts.
data_loader(obj:`paddle.io.DataLoader`): The dataset loader which generates batches.
criterion(obj:`paddle.nn.Layer`): It can compute the loss.
metric(obj:`paddle.metric.Metric`): The evaluation metric.
"""
model.eval()
pred_types = []
pred_labels = []
real_types = []
real_labels = []
for batch in tqdm(data_loader):
input_ids, token_type_ids, types, labels = batch
logit_types, logit_labels = model(input_ids, token_type_ids)
prob_types = F.softmax(logit_types,axis=1)
prob_labels = F.softmax(logit_labels,axis=1)
pred_types.extend(prob_types.argmax(axis=1).numpy())
pred_labels.extend(prob_labels.argmax(axis=1).numpy())
real_types.extend(types.reshape([-1]).numpy())
real_labels.extend(labels.reshape([-1]).numpy())
    accuracy_types = accuracy_score(real_types, pred_types)
    accuracy_labels = accuracy_score(real_labels, pred_labels)
    model.train()
    return accuracy_types,accuracy_labels  # return the accuracy of each head
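f1_score is imported above but never used; given the heavy polarity imbalance, macro-averaged F1 is often more informative than accuracy. A hedged illustration (not part of the original evaluation):
# Macro-F1 on dummy predictions; inside evaluate() one could likewise report
# f1_score(real_labels, pred_labels, average="macro").
y_true = [0, 0, 0, 1, 2]
y_pred = [0, 0, 1, 1, 2]
print(f1_score(y_true, y_pred, average="macro"))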
def do_train(model, train_data_loader, dev_data_loader, criterion1, criterion2, optimizer, lr_scheduler):
model.train()
max_accuracy=0
save_dir = "./checkpoint/" + MODEL_NAME
for epoch in range(1, epochs + 1):
with tqdm(total=len(train_data_loader)) as pbar:
for step, batch in enumerate(train_data_loader, start=1):
input_ids, token_type_ids, types, labels = batch
logit_types,logit_labels = model(input_ids, token_type_ids)
loss = criterion1(logit_types,types) + criterion2(logit_labels,labels)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_grad()
pbar.set_postfix({'loss' : '%.5f' % (loss.numpy())})
pbar.update(1)
accuracy_types,accuracy_labels = evaluate(model, dev_data_loader)
print("Epoch: %d, Types Accuracy: %.5f, Labels Accuracy: %.5f" % (epoch, accuracy_types,accuracy_labels))
if not os.path.exists(save_dir):
            os.makedirs(save_dir)  # create directories recursively
print("Epoch: %d, Types Accuracy: %.5f, Labels Accuracy: %.5f" % (epoch, accuracy_types,accuracy_labels),
file=open(save_dir +'/best_model_log.txt', 'a'))
if accuracy_types*accuracy_labels >= max_accuracy:
max_accuracy = accuracy_types*accuracy_labels
save_param_path = os.path.join(save_dir, 'best_model.pdparams')
paddle.save(model.state_dict(), save_param_path)
tokenizer.save_pretrained(save_dir)
save_param_path = os.path.join(save_dir, 'last_model.pdparams')
paddle.save(model.state_dict(), save_param_path)
do_train(model, train_data_loader, dev_data_loader, criterion1, criterion2, optimizer, lr_scheduler)
This article is a repost.
Original project link