Reposted from AI Studio
Project link: https://aistudio.baidu.com/aistudio/projectdetail/3169455

Weather and Time Classification

Competition Name

Weather and Time Classification

https://www.datafountain.cn/competitions/555

Competition Background

In autonomous-driving scenarios, the weather and the time of day (dawn, morning, afternoon, dusk, night) affect sensor accuracy; rain and night-time conditions, for example, significantly degrade vision sensors. The competition asks participants to classify the weather and time of day in captured photos (the official task is framed around the OneFlow framework) so that different driving strategies can be applied under different conditions. This project implements the task with PaddlePaddle.

Competition Task

The dataset for this competition is provided by Testin (云测数据). It contains 3,000 images captured by dashboard cameras in real driving scenes: the training set has 2,600 images labeled with weather and time of day, and the test set has 400 unlabeled images.

The 2,600 training images carry manually annotated weather and time labels. The weather labels cover five classes (cloudy, sunny, rainy, snowy, foggy) and the time labels cover five classes (dawn, morning, afternoon, dusk, night).
Some sample images and their labels are shown below:


Data Description

The dataset contains two folders, anno and image. The anno folder holds 2,600 JSON label files, and the image folder holds 3,000 JPEG-encoded photos taken by dashboard cameras. Each label is a dictionary serialized to JSON:

Column  | Values                                | Meaning
--------|---------------------------------------|--------------------------
Period  | Dawn, Morning, Afternoon, Dusk, Night | Time the photo was taken
Weather | Cloudy, Sunny, Rainy, Snowy, Foggy    | Weather in the photo
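
For reference, the aggregated train.json read further below appears to wrap all records in an 'annotations' list (inferred from the code and the DataFrame output in the data-loading section; the exact schema is an assumption, not the official specification). A minimal sketch of that assumed structure:

import json

# Assumed structure of train.json: a dict whose 'annotations' key holds one
# record per image (filename, period, weather). The values below are illustrative only.
sample = {
    'annotations': [
        {'filename': 'train_images\\00001.jpg', 'period': 'Morning', 'weather': 'Cloudy'},
    ]
}
print(json.dumps(sample, indent=2))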
# Unzip the datasets; -o overwrites existing files without prompting, so unzip never stalls waiting for input
!unzip -o test_dataset.zip > log.log
!unzip -o train_dataset.zip > log.log
import io
import math, json
import numpy as np
import pandas as pd
from PIL import Image

import matplotlib.pyplot as plt
import paddle
import paddle.nn.functional as F
import paddle.vision.transforms as T
from paddle.io import DataLoader, Dataset

import warnings
warnings.filterwarnings("ignore")

paddle.__version__
'2.2.0'

Data Loading

# Read the dataset annotations and extract the key fields from each record
train_json = pd.read_json('train.json')
train_json['filename'] = train_json['annotations'].apply(lambda x: x['filename'].replace('\\', '/'))
train_json['period'] = train_json['annotations'].apply(lambda x: x['period'])
train_json['weather'] = train_json['annotations'].apply(lambda x: x['weather'])

train_json.head()
                                         annotations                filename     period  weather
0  {'filename': 'train_images\00001.jpg', 'period...  train_images/00001.jpg    Morning   Cloudy
1  {'filename': 'train_images\00002.jpg', 'period...  train_images/00002.jpg  Afternoon   Cloudy
2  {'filename': 'train_images\00003.jpg', 'period...  train_images/00003.jpg    Morning   Cloudy
3  {'filename': 'train_images\00004.jpg', 'period...  train_images/00004.jpg    Morning    Sunny
4  {'filename': 'train_images\00005.jpg', 'period...  train_images/00005.jpg  Afternoon   Cloudy

Label Processing

# Encode the string labels as integers; keep track of the encoding order.
# This could be done manually with a dict; here pd.factorize is used.
train_json['period'], period_dict = pd.factorize(train_json['period'])
train_json['weather'], weather_dict = pd.factorize(train_json['weather'])
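
pd.factorize returns both the integer codes and an Index of the original labels; position i in that Index is the label encoded as i. Printing the mapping makes the encoding order explicit (a small sketch; the printed values depend on the order in which labels first appear in the data):

# Inspect the label-to-code mappings; the index position corresponds to the integer code.
print(dict(enumerate(period_dict)))
print(dict(enumerate(weather_dict)))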

Label Statistics

train_json['period'].value_counts()
0    1613
1     829
3     124
2      34
Name: period, dtype: int64
train_json['weather'].value_counts()
0    1119
1     886
2     595
Name: weather, dtype: int64
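
Both label columns are clearly imbalanced (e.g. period class 2 has only 34 samples). A quick bar chart of the counts, using the matplotlib import from above, makes this easier to see (a minimal sketch):

# Plot the class distribution of both label columns.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
train_json['period'].value_counts().sort_index().plot(kind='bar', ax=axes[0], title='period')
train_json['weather'].value_counts().sort_index().plot(kind='bar', ax=axes[1], title='weather')
plt.tight_layout()
plt.show()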

Custom Dataset

# Custom dataset
class WeatherDataset(Dataset):
    def __init__(self, df):
        super(WeatherDataset, self).__init__()
        self.df = df
    
        # Define the data augmentation pipeline
        self.transform = T.Compose([
            T.Resize(size=(340,340)),
            T.RandomCrop(size=(256, 256)),
            T.RandomRotation(10),
            T.RandomHorizontalFlip(),
            T.RandomVerticalFlip(),
            T.ToTensor(),
            T.Normalize(mean=0.5, std=0.5)
        ])

    def __getitem__(self, index):
        file_name = self.df['filename'].iloc[index]
        img = Image.open(file_name)
        img = self.transform(img)
        return img,\
                paddle.to_tensor(self.df['period'].iloc[index]),\
                paddle.to_tensor(self.df['weather'].iloc[index])

    def __len__(self):
        return len(self.df)
# Training set
train_dataset = WeatherDataset(train_json.iloc[:-500])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Validation set
val_dataset = WeatherDataset(train_json.iloc[-500:])
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)
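
Note that WeatherDataset applies the same random augmentation to every split, so the validation (and later test) images are also randomly cropped and flipped. A deterministic preprocessing pipeline for evaluation would look roughly like the sketch below; wiring it into the dataset (e.g. via an optional constructor flag) is an optional change and not part of the original code:

# Optional: deterministic transform for validation/testing (not used above).
eval_transform = T.Compose([
    T.Resize(size=(256, 256)),
    T.ToTensor(),
    T.Normalize(mean=0.5, std=0.5)
])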

Building the Model

from paddle.vision.models import resnet18

# Custom model with two classification heads
class WeatherModel(paddle.nn.Layer):
    def __init__(self):
        super(WeatherModel, self).__init__()
        backbone = resnet18(pretrained=True)
        backbone.fc = paddle.nn.Identity()
        self.backbone = backbone

        # Head 1: period (4 classes)
        self.fc1 = paddle.nn.Linear(512, 4)

        # Head 2: weather (3 classes)
        self.fc2 = paddle.nn.Linear(512, 3)

    def forward(self, x):
        out = self.backbone(x)

        # Predict both labels from the shared backbone features
        logits1 = self.fc1(out)
        logits2 = self.fc2(out)
        return logits1, logits2
model = WeatherModel()
model(paddle.to_tensor(np.random.rand(10, 3, 256, 256).astype(np.float32)))
W1203 00:28:16.542893  7854 device_context.cc:447] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W1203 00:28:16.546566  7854 device_context.cc:465] device: 0, cuDNN Version: 7.6.
INFO:paddle.utils.download:unique_endpoints {''}
INFO:paddle.utils.download:File /home/aistudio/.cache/paddle/hapi/weights/resnet18.pdparams md5 checking...
INFO:paddle.utils.download:Found /home/aistudio/.cache/paddle/hapi/weights/resnet18.pdparams





(Tensor(shape=[10, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
        [[ 0.82836318,  1.35641491,  0.46156025, -1.45563996],
         [ 0.08160630,  1.42760110,  0.87870872, -2.34320068],
         [ 0.33396342,  1.35725105, -0.14363584, -2.19671369],
         [ 0.07748616, -0.12090680, -0.29871672, -1.20805240],
         [ 0.11444393,  1.95767570,  0.52079809, -1.79601681],
         [-1.35744286,  0.13195890, -0.26686692, -1.47781038],
         [ 0.99778777, -0.46945763,  1.13994253, -0.39733961],
         [-0.46440995, -0.54322171, -0.02341729, -1.50987113],
         [-0.85322177,  1.23348832, -0.56562370, -2.12171173],
         [-0.74074465,  2.26537490,  1.03538465, -2.16883540]]),
 Tensor(shape=[10, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
        [[ 0.27792573,  0.37752092,  0.09397808],
         [-1.53032672,  1.26350784,  0.16554639],
         [-0.11922672,  2.85570526, -1.09816480],
         [ 0.15814775, -0.65982032, -0.02658810],
         [-0.84676772,  1.20923686, -0.41986156],
         [-0.72880727,  0.98558438, -1.09161079],
         [-0.65401202,  1.84503722, -0.01031107],
         [-0.10907726,  0.86592245,  0.49232167],
         [-0.59027153,  0.95352286, -0.89762455],
         [-0.68332863,  1.41990232, -1.38094914]]))

Training and Validation

# Define the optimizer and loss function
optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=0.0001)
criterion = paddle.nn.CrossEntropyLoss()

for epoch in range(0, 1):
    Train_Loss, Val_Loss = [], []
    Train_ACC1, Train_ACC2 = [], []
    Val_ACC1, Val_ACC2 = [], []
    
    # Training
    model.train()
    for i, (x, y1, y2) in enumerate(train_loader):
        pred1, pred2 = model(x)

        # The total loss is the sum of the period loss and the weather loss
        loss = criterion(pred1, y1) + criterion(pred2, y2)
        Train_Loss.append(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()

        Train_ACC1.append((pred1.argmax(1) == y1.flatten()).numpy().mean())
        Train_ACC2.append((pred2.argmax(1) == y2.flatten()).numpy().mean())

    # Validation
    model.eval()
    for i, (x, y1, y2) in enumerate(val_loader):
        pred1, pred2 = model(x)
        loss = criterion(pred1, y1) + criterion(pred2, y2)
        Val_Loss.append(loss.item())
        Val_ACC1.append((pred1.argmax(1) == y1.flatten()).numpy().mean())
        Val_ACC2.append((pred2.argmax(1) == y2.flatten()).numpy().mean())

    if epoch % 1 == 0:
        print(f'\nEpoch: {epoch}')
        print(f'Loss {np.mean(Train_Loss):3.5f}/{np.mean(Val_Loss):3.5f}')
        print(f'period.ACC {np.mean(Train_ACC1):3.5f}/{np.mean(Val_ACC1):3.5f}')
        print(f'weather.ACC {np.mean(Train_ACC2):3.5f}/{np.mean(Val_ACC2):3.5f}')
Epoch: 0
Loss 1.60270/1.57734
period.ACC 0.64383/0.72972
weather.ACC 0.72986/0.62680
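
The trained model is used directly in the prediction section below. To reuse it in a later session, the weights can be saved and restored with the standard Paddle APIs (a minimal sketch, not part of the original notebook):

# Save the trained weights.
paddle.save(model.state_dict(), 'weather_model.pdparams')

# Restore them later into a fresh model instance.
model = WeatherModel()
model.set_state_dict(paddle.load('weather_model.pdparams'))
model.eval()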

Prediction and Submission

import glob

# Paths to the test images
test_df = pd.DataFrame({'filename': glob.glob('./test_images/*.jpg')})
test_df['period'] = 0
test_df['weather'] = 0
test_df = test_df.sort_values(by='filename')
test_dataset = WeatherDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
model.eval()
period_pred = []
weather_pred = []

# Run inference on the test set
for i, (x, y1, y2) in enumerate(test_loader):
    pred1, pred2 = model(x)
    period_pred += period_dict[pred1.argmax(1).numpy()].tolist()
    weather_pred += weather_dict[pred2.argmax(1).numpy()].tolist()

test_df['period'] = period_pred
test_df['weather'] = weather_pred
submit_json = {
    'annotations':[]
}

# Build the submission records
for row in test_df.iterrows():
    submit_json['annotations'].append({
        'filename': 'test_images\\' + row[1].filename.split('/')[-1],
        'period': row[1].period,
        'weather': row[1].weather,
    })

with open('submit.json', 'w') as up:
    json.dump(submit_json, up)
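
As a quick sanity check, the generated file can be read back to confirm it holds one record per test image (the task description states the test set has 400 images):

# Sanity check: one annotation per test image is expected.
with open('submit.json') as f:
    result = json.load(f)
print(len(result['annotations']))
print(result['annotations'][0])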

Summary and Outlook

  1. The project uses a pretrained backbone with two classification heads to handle weather classification and time-of-day classification jointly.
  2. Additional data augmentation and cross-validation could be added to further improve model accuracy.
