Regular Season Competition: Chinese News Title Classification (PaddleNLP)

1. Competition Overview

Text classification uses computers to automatically assign labels to a collection of texts (or other entities) according to a given taxonomy or standard. This competition is about news title classification: contestants train a classification model on the provided news titles and their category labels, then predict the category of each title in the test set. The evaluation metric is Accuracy = number of correctly classified samples / total number of samples. Contestants must use the PaddlePaddle framework and PaddleNLP, Paddle's core NLP development library, which offers easy-to-use full-pipeline text APIs, application examples for many scenarios, a rich collection of pretrained models, and deep adaptation to PaddlePaddle 2.x.

2. Data Description

THUCNews was generated by filtering the historical data of the Sina News RSS feeds from 2005 to 2011; it contains 740,000 news documents (2.19 GB), all in UTF-8 plain text. The competition dataset re-organizes the original Sina news taxonomy into 14 candidate categories: 财经, 彩票, 房产, 股票, 家居, 教育, 科技, 社会, 时尚, 时政, 体育, 星座, 游戏, 娱乐 (finance, lottery, real estate, stocks, home, education, technology, society, fashion, politics, sports, horoscope, games, entertainment). A total of 832,471 training samples are provided.

Dataset format: each line of the training and dev sets is the original title + \t + label; each line of the test set contains only the title.
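For illustration, a training or dev line would look like the following (the title shown is a hypothetical example, not taken from the dataset), while a test line contains only the title part:

某球队主场三球完胜晋级决赛\t体育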

The overall structure of this project largely follows the project by 大神炼丹师.

Here I mainly share some approaches that I consider important and that could potentially improve the score. I tried several of them without much success, but I hope they can serve as a starting point; they are for reference only.

1. Class imbalance

Among the 14 categories, the largest has about 140,000 samples while the smallest has only about 3,000. I previously tried oversampling and undersampling, but the results were mediocre.

Oversampling can cause overfitting on the low-sample categories, while undersampling wastes a large amount of data from the other categories.

Other possible remedies include:

(1) Adjust the classification threshold: a classifier trained directly on imbalanced data is biased toward the majority classes, so instead of using 0.5 as the decision threshold, assign a sample to a minority class even when the model is only moderately confident about it.

(2) Cost-sensitive learning: for example, set the class_weight parameter of the LR (logistic regression) algorithm. Both remedies are sketched right after this list.
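A minimal sketch of both ideas on synthetic binary data (scikit-learn is used here purely for illustration and is not part of the competition pipeline; the 0.3 threshold is a hypothetical value to be tuned on a validation set):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Imbalanced toy data: roughly 95% majority class, 5% minority class.
X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=0)

# (2) Cost-sensitive learning: class_weight="balanced" re-weights the loss
#     inversely to the class frequencies.
clf = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X, y)

# (1) Threshold adjustment: predict the minority class whenever its probability
#     exceeds a threshold lower than the default 0.5.
minority_threshold = 0.3  # hypothetical; tune on a held-out validation set
probs = clf.predict_proba(X)[:, 1]
pred = (probs >= minority_threshold).astype(int)
print("minority-class recall:", (pred[y == 1] == 1).mean())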

2. R-Drop regularization

According to the R-Drop paper and related reports, applying R-Drop regularization noticeably improves model generalization and accuracy on many tasks, so it is worth trying in this text classification task to see whether it raises the score; the objective is sketched below.
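Concretely, matching the RDropLoss class and the rdrop_coef branch of the training loop further down: each batch is passed through the model twice so that two different dropout masks are sampled, giving two output distributions p1 and p2, and the training objective becomes

loss = (CE(p1, y) + CE(p2, y)) / 2 + rdrop_coef * (KL(p1 || p2) + KL(p2 || p1)) / 2

where CE is the cross-entropy against the label y, KL is the Kullback-Leibler divergence, and rdrop_coef controls the strength of the regularization.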

3. Data augmentation

Use the methods in nlpcda (a Chinese NLP data-augmentation toolkit) to generate additional data for the low-sample categories; a rough sketch follows.
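A rough sketch of the idea, assuming nlpcda's Similarword interface (a create_num / change_rate constructor and a replace() method that returns a list of augmented sentences, usually including the original; check the nlpcda documentation for the exact signatures). The 10000-sample cutoff and the number of variants are hypothetical choices:

import pandas as pd
from nlpcda import Similarword

train = pd.read_table('data/data103654/train.txt', sep='\t', header=None,
                      names=['text_a', 'label'])

# Categories with fewer samples than this (hypothetical) cutoff get augmented
counts = train['label'].value_counts()
low_freq_labels = counts[counts < 10000].index

smw = Similarword(create_num=3, change_rate=0.3)  # up to 3 synonym-replaced variants per title

augmented = []
for _, row in train[train['label'].isin(low_freq_labels)].iterrows():
    for new_title in smw.replace(row['text_a']):
        if new_title != row['text_a']:          # keep only genuinely new titles
            augmented.append({'text_a': new_title, 'label': row['label']})

train_aug = pd.concat([train, pd.DataFrame(augmented)], ignore_index=True)
print('before:', len(train), 'after:', len(train_aug))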

import pandas as pd

train = pd.read_table('data/data103654/train.txt', sep='\t', header=None)  # training set
dev = pd.read_table('data/data103654/dev.txt', sep='\t', header=None)      # dev set
test = pd.read_table('data/data103654/test.txt', sep='\t', header=None)    # test set

# Add column names so the data is easier to work with
train.columns = ["text_a", "label"]
dev.columns = ["text_a", "label"]
test.columns = ["text_a"]
train.to_csv('data/train.csv', sep='\t', index=False)  # save training set as text_a \t label
dev.to_csv('data/dev.csv', sep='\t', index=False)      # save dev set as text_a \t label
test.to_csv('data/test.csv', sep='\t', index=False)    # save test set as text_a
import math
import numpy as np
import os
import collections
from functools import partial
import random
import time
import inspect
import importlib
from tqdm import tqdm
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.io import IterableDataset
from paddle.utils.download import get_path_from_url
import paddlenlp as ppnlp
from paddlenlp.data import JiebaTokenizer, Pad, Stack, Tuple, Vocab
from paddlenlp.datasets import MapDataset
from paddle.dataset.common import md5file
from paddlenlp.datasets import DatasetBuilder
# Load the tokenizer and the pretrained model with a 14-class classification head
MODEL_NAME = "roberta-wwm-ext-large"
tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained(MODEL_NAME)
model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=14, dropout=0.3)

[2021-11-18 17:41:09,006] [    INFO] - Downloading vocab.txt from https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/vocab.txt
100%|██████████| 107/107 [00:00<00:00, 3890.58it/s]
[2021-11-18 17:41:09,247] [    INFO] - Downloading https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/roberta_chn_large.pdparams and saved to /home/aistudio/.paddlenlp/models/roberta-wwm-ext-large
[2021-11-18 17:41:09,249] [    INFO] - Downloading roberta_chn_large.pdparams from https://paddlenlp.bj.bcebos.com/models/transformers/roberta_large/roberta_chn_large.pdparams
100%|██████████| 1271615/1271615 [00:26<00:00, 48469.13it/s]
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py:1301: UserWarning: Skip loading for classifier.weight. classifier.weight is not found in the provided dict.
  warnings.warn(("Skip loading for {}. ".format(key) + str(err)))
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py:1301: UserWarning: Skip loading for classifier.bias. classifier.bias is not found in the provided dict.
  warnings.warn(("Skip loading for {}. ".format(key) + str(err)))
label_list=list(train.label.unique())
class NewsData(DatasetBuilder):
    SPLITS = {
        'train': 'data/train.csv',  # training set
        'dev': 'data/dev.csv',      # dev set
    }

    def _get_data(self, mode, **kwargs):
        filename = self.SPLITS[mode]
        return filename

    def _read(self, filename):
        """读取数据"""
        with open(filename, 'r', encoding='utf-8') as f:
            head = None
            for line in f:
                data = line.strip().split("\t")    # columns are separated by '\t'
                if not head:
                    head = data
                else:
                    text_a, label = data
                    yield {"text_a": text_a, "label": label}  # 此次设置数据的格式为:text_a,label,可以根据具体情况进行修改

    def get_labels(self):
        return label_list   # category labels
def load_dataset(name=None,
                 data_files=None,
                 splits=None,
                 lazy=None,
                 **kwargs):
   
    reader_cls = NewsData  # use the NewsData dataset format defined above
    print(reader_cls)
    if not name:
        reader_instance = reader_cls(lazy=lazy, **kwargs)
    else:
        reader_instance = reader_cls(lazy=lazy, name=name, **kwargs)

    datasets = reader_instance.read_datasets(data_files=data_files, splits=splits)
    return datasets
train_ds, dev_ds = load_dataset(splits=["train", "dev"])
def convert_example(example, tokenizer, max_seq_length=128, is_test=False):
    qtconcat = example["text_a"]
    encoded_inputs = tokenizer(text=qtconcat, max_seq_len=max_seq_length)  # the tokenizer converts the text into the model's input format
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]

    if not is_test:
        label = np.array([example["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    else:
        return input_ids, token_type_ids
def create_dataloader(dataset,
                      mode='train',
                      batch_size=1,
                      batchify_fn=None,
                      trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)

    shuffle = True if mode == 'train' else False
    # shuffle the training set; keep the dev/test order fixed
    if mode == 'train':
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)
<class '__main__.NewsData'>
# Parameter settings:
# batch size; reduce this value if GPU memory is insufficient
batch_size = 300
# Maximum sequence length after truncation, chosen from the actual text lengths (at most 512).
# Length analysis shows the longest title has 48 characters, so 48 is used here.
max_seq_length = 48
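# How the value 48 was determined (a sketch, assuming the `train` DataFrame from the
# preprocessing cell above is still in memory): inspect the title lengths directly.
print("longest title:", train["text_a"].str.len().max())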
# Convert raw samples into the input format the model can read
trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length)

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack()  # labels
): [data for data in fn(samples)]

# training set data loader
train_data_loader = create_dataloader(
    train_ds,
    mode='train',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)

# dev set data loader
dev_data_loader = create_dataloader(
    dev_ds,
    mode='dev',
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)
print("参数done")
参数done
class RDropLoss(nn.Layer):
    """
    R-Drop Loss implementation
    For more information about R-Drop, please refer to the paper: https://arxiv.org/abs/2106.14448
    For the original implementation, please refer to: https://github.com/dropreg/R-Drop

    Args:
        reduction(str, optional):
            Indicate how to average the loss; the candidates are ``'none'``, ``'batchmean'``, ``'mean'``, ``'sum'``.
            If `reduction` is ``'mean'``, the reduced mean loss is returned;
            If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
            If `reduction` is ``'sum'``, the reduced sum loss is returned;
            If `reduction` is ``'none'``, no reduction will be applied.
            Defaults to ``'none'``.
    """

    def __init__(self, reduction='none'):
        super(RDropLoss, self).__init__()
        if reduction not in ['sum', 'mean', 'none', 'batchmean']:
            raise ValueError(
                "'reduction' in 'RDropLoss' should be 'sum', 'mean' 'batchmean', or 'none', "
                "but received {}.".format(reduction))
        self.reduction = reduction

    def forward(self, p, q, pad_mask=None):
        """
        Args:
            p(Tensor): the first forward logits of training examples.
            q(Tensor): the second forward logits of training examples.
            pad_mask(Tensor, optional): The Tensor containing the binary mask to index with; its data type is bool.

        Returns:
            Tensor: Returns tensor `loss`, the rdrop loss of p and q.
        """
        p_loss = F.kl_div(
            F.log_softmax(
                p, axis=-1),
            F.softmax(
                q, axis=-1),
            reduction=self.reduction)
        q_loss = F.kl_div(
            F.log_softmax(
                q, axis=-1),
            F.softmax(
                p, axis=-1),
            reduction=self.reduction)

        # pad_mask is for seq-level tasks
        if pad_mask is not None:
            p_loss = paddle.masked_select(p_loss, pad_mask)
            q_loss = paddle.masked_select(q_loss, pad_mask)

        # Choose "sum" or "mean" here depending on your task
        p_loss = p_loss.sum()
        q_loss = q_loss.sum()
        loss = (p_loss + q_loss) / 2
        return loss

# Fix the random seeds so results are reproducible
seed = 1024
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
<paddle.fluid.core_avx.Generator at 0x7fece4bad670>
from paddlenlp.transformers import LinearDecayWithWarmup

# Training configuration:
# peak learning rate during training
learning_rate = 4e-5
# number of training epochs
epochs = 10
# proportion of steps used for learning-rate warmup
warmup_proportion = 0.1
# weight decay coefficient, a regularization strategy that helps avoid overfitting
weight_decay = 0.01

num_training_steps = len(train_data_loader) * epochs
lr_scheduler = LinearDecayWithWarmup(learning_rate, num_training_steps, warmup_proportion)

# AdamW optimizer
optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ])
from collections import Counter
c = Counter(train['label'])
total = sum(c.values())
# Per-class weights: inverse class frequency, capped at 5 to avoid over-weighting the rarest categories
weight = paddle.to_tensor([min(5, total/(len(label_list)*c[x])) for x in label_list], dtype='float64')
# weight = paddle.to_tensor([1]*14, dtype='float64')  # uniform weights (disabled)
print(weight)
criterion = paddle.nn.loss.CrossEntropyLoss(weight=weight)  # weighted cross-entropy loss
rdrop_loss = RDropLoss()
metric = paddle.metric.Accuracy()              # accuracy metric
Tensor(shape=[14], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
       [0.36653729, 0.45379879, 0.94663300, 0.38678983, 0.64470694, 1.42405025, 1.83264896, 1.60974958, 2.97854966, 1.17443305, 2.45021556, 5.        , 5.        , 4.46708183])
# Evaluation function: compute loss and accuracy on the dev set
@paddle.no_grad()
def evaluate(model, criterion, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    for batch in data_loader:
        input_ids, token_type_ids, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = criterion(logits, labels)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
        accu = metric.accumulate()
    print("eval loss: %.5f, accu: %.5f" % (np.mean(losses), accu))  # 输出验证集上评估效果
    metric.reset()
    model.train()
    return accu  # return the dev accuracy
import paddle.nn.functional as F

save_dir = "checkpoint"
if not  os.path.exists(save_dir):
    os.makedirs(save_dir)

print('start')

rdrop_coef = 0
pre_accu=0
accu=0
global_step = 0
for epoch in range(1, epochs + 1):
    for step, batch in enumerate(train_data_loader, start=1):
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        if rdrop_coef>0:
            logits_2 = model(
                input_ids, segment_ids)
            ce_loss = (criterion(logits, labels) + criterion(logits_2, labels)) * 0.5
            kl_loss = rdrop_loss(logits, logits_2)
            loss = ce_loss + kl_loss * rdrop_coef
        else:
            loss = criterion(logits, labels)
        probs = F.softmax(logits, axis=1)
        correct = metric.compute(probs, labels)
        metric.update(correct)
        acc = metric.accumulate()
        
        global_step += 1
        if global_step % 10 == 0 :
            print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f" % (global_step, epoch, step, loss, acc))
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()
    # Evaluate on the dev set at the end of each epoch
    accu = evaluate(model, criterion, metric, dev_data_loader)
    print(accu)
    if accu > pre_accu:
        # Save the model parameters whenever dev accuracy improves over the best so far
        save_param_path = os.path.join(save_dir, 'model_state.pdparams')
        paddle.save(model.state_dict(), save_param_path)
        pre_accu = accu
    # Also checkpoint the scheduler, optimizer and model so training can be resumed
    paddle.save(lr_scheduler.state_dict(), "lr")
    paddle.save(optimizer.state_dict(), "opt")
    paddle.save(model.state_dict(), "para")
start


# Resume from the saved scheduler / optimizer / model checkpoints if training was interrupted
lr_scheduler.set_state_dict(paddle.load("lr"))
optimizer.set_state_dict(paddle.load("opt"))
model.set_state_dict(paddle.load("para"))

import os
import paddle

params_path = 'checkpoint/model_state.pdparams'
if params_path and os.path.isfile(params_path):
    # Load the best saved model parameters
    state_dict = paddle.load(params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % params_path)
# Prediction function: run the model over a list of examples and return the predicted labels
def predict(model, data, tokenizer, label_map, batch_size=1):
    examples = []
    # Convert the input data (a list of dicts) into the format the model accepts
    for text in data:
        input_ids, segment_ids = convert_example(
            text,
            tokenizer,
            max_seq_length=128,
            is_test=True)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
    ): fn(samples)

    # Separate the data into batches.
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        # The last batch whose size is less than the config batch_size setting.
        batches.append(one_batch)

    results = []
    model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
    return results  # return the predicted labels
label_list=list(train.label.unique())
label_map = { 
    idx: label_text for idx, label_text in enumerate(label_list)
}
test = pd.read_csv('data/test.csv',sep='\t')  

# Preprocessing function: wrap each title into the list-of-dicts format expected by predict()
def preprocess_prediction_data(data):
    examples = []
    for text_a in data:
        examples.append({"text_a": text_a})
    return examples

# Format the test set for prediction
data1 = list(test.text_a)
examples = preprocess_prediction_data(data1)

# Run prediction on the test set
results = predict(model, examples, tokenizer, label_map, batch_size=16)   

# Write the predictions to a txt file; the submission format requires one category per line
def write_results(labels, file_path):
    with open(file_path, "w", encoding="utf8") as f:
        f.writelines("\n".join(labels))

write_results(results, "./result.txt")


# The submission must be a zip file, so compress result.txt into submission.zip
!zip 'submission.zip' 'result.txt'

References

  1. What are the common remedies for class-imbalanced data? (数据类别分布不均衡, 有哪些应对方法?)

  2. R-Drop: Regularized Dropout for Neural Networks (https://arxiv.org/abs/2106.14448)
