Building a Couplet Generation System with the Seq2Seq Framework
A couplet generation system with a GUI, built from scratch on the Seq2Seq framework. The generated couplets are neatly matched in parallelism and follow the "start oblique, end level" tonal convention; the results feel quite good.
Project Overview
This project is a couplet generation system built on the Seq2Seq framework (my coursework for a Natural Language Understanding class).
It draws on the official sample code and improves on it.
I also wrote a report and posted it on the forum; direct link: my report
My contributions are two-fold:
1. Cleaned the original dataset, removing 14 problematic entries
The problematic entries are shown below (details are discussed in the report):
2. Built a visual interface with Tkinter, shown in the figure below:
Note: tkinter probably cannot be used inside AI Studio; download the code and run it locally.
import io
import os
import numpy as np
import paddle
import paddlenlp
from collections import Counter  # needed later by get_second
from functools import partial
from paddle.static import InputSpec
1. Data Processing
data_in_path="/home/aistudio/data/data110057/fixed_couplets_in.txt"
data_out_path="/home/aistudio/data/data110057/fixed_couplets_out.txt"
def openfile(src):
with open(src,'r',encoding="utf-8") as source:
lines=source.readlines()
return lines
data_in=openfile(data_in_path)
data_out=openfile(data_out_path)
print(len(data_in))
print(len(data_out))
print(data_in[0])
print(data_out[0])
print(len(data_in[0]))
744915
744915
腾 飞 上 铁 , 锐 意 改 革 谋 发 展 , 勇 当 千 里 马
和 谐 南 供 , 安 全 送 电 保 畅 通 , 争 做 领 头 羊
37
def delete_newline_and_space(lista):
newlist=[]
for i in range(len(lista)):
newlist.append(["<start>"]+lista[i].strip().split()+['<end>'])
return newlist
data_in_nospace=delete_newline_and_space(data_in)
data_out_nospace=delete_newline_and_space(data_out)
print(data_in_nospace[0])
print(data_out_nospace[0])
['<start>', '腾', '飞', '上', '铁', ',', '锐', '意', '改', '革', '谋', '发', '展', ',', '勇', '当', '千', '里', '马', '<end>']
['<start>', '和', '谐', '南', '供', ',', '安', '全', '送', '电', '保', '畅', '通', ',', '争', '做', '领', '头', '羊', '<end>']
Compute the longest couplet length, couplet_maxlen, and use it as the vector length (the +2 for the <start>/<end> tokens added above is already included); shorter sequences will be padded.
couplet_maxlen=max([len(i) for i in data_in_nospace])
couplet_maxlen
34
Take a look at the longest couplet:
['旧', '画', '一', '张', ',', '龙', '不', '吟', ',', '虎', '不', '啸', ',', '花', '不', '闻', '香', ',', '鸟', '不', '叫', ',', '见', '此', '小', '子', ',', '好', '笑', ',', '好', '笑']
maxlen=0
count=''
for i in data_in_nospace:
    if len(i)>maxlen:
        maxlen=len(i)
        count=i
print(count)
1.1 Build the corpus, the character-to-id dictionary, and the id-to-character dictionary
- "Characters" here mainly means Chinese characters, plus punctuation
One question: should the input and output sides each get their own vocabulary, or should they share a single one?
Here I build one shared vocabulary for the experiments. (For my graduation project I built them separately; I am not sure which approach is correct. A sketch of the separate-vocabulary alternative follows.)
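For comparison, here is a minimal sketch of the separate-vocabulary alternative (my own illustration; it reuses the build_corpus/build_dict helpers defined just below, and with two vocabularies the encoder and decoder could no longer share ids):
# Sketch: one vocabulary per side (run after build_corpus/build_dict are defined)
# in_w2i, in_i2w = build_dict(build_corpus(data_in_nospace, []), 0)
# out_w2i, out_i2w = build_dict(build_corpus([], data_out_nospace), 0)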
def build_corpus(data_in,data_out):
    # Merge the input and output tokens into one flat corpus
    corpus=[]
    for i in data_in:
        corpus.extend(i)
    for i in data_out:
        corpus.extend(i)
    return corpus
def build_dict(corpus,frequency):
    # First count the frequency of each distinct token (character) in a dict
    word_freq_dict={}
    for ch in corpus:
        if ch not in word_freq_dict:
            word_freq_dict[ch]=0
        word_freq_dict[ch]+=1
    # Sort the tokens by frequency, descending
    word_freq_dict=sorted(word_freq_dict.items(),key=lambda x:x[1],reverse=True)
    # Build the word-to-id and id-to-word dictionaries
    # word2id_dict={'<pad>':0,"<unk>":1}
    # id2word_dict={0:'<pad>',1:'<unk>'}
    word2id_dict={}
    id2word_dict={}
    # Walk through the tokens from most to least frequent, assigning each a unique id
    for word,freq in word_freq_dict:
        if freq>frequency:
            curr_id=len(word2id_dict)
            word2id_dict[word]=curr_id
            id2word_dict[curr_id]=word
        else:
            # The else branch would map rare tokens to <unk>; for Chinese characters
            # we do not use <unk>, so frequency=0 and this branch never runs
            word2id_dict[word]=1
    return word2id_dict,id2word_dict
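As a quick sanity check of build_dict (a toy example of my own, not from the original run):
# 'b' is most frequent, so it should receive id 0
w2i, i2w = build_dict(list("aabbbc"), 0)
print(w2i)   # expected: {'b': 0, 'a': 1, 'c': 2}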
word_frequency=0
word2id_dict,id2word_dict=build_dict(build_corpus(data_in_nospace,data_out_nospace),word_frequency)
Vocabulary size
word_size=len(word2id_dict)
id_size=len(id2word_dict)
print("汉字个数:",word_size,"\n id个数:",id_size)
汉字个数: 9017
id个数: 9017
with open("word2id.txt",'w',encoding='utf-8') as w2i:
for i in list(id2word_dict.items()):
w2i.write(str(i)+'\n')
print(word2id_dict['<end>'])
print(word2id_dict['<start>'])
1
0
Create the tensors
def get_tensor(w2i,datalist,maxlength):
    # Convert each token list to a list of ids
    in_tensor=[]
    for lista in datalist:
        in_small_tensor=[]
        for li in lista:
            in_small_tensor.append(w2i[li])
        # if len(in_small_tensor)<maxlength:
        #     in_small_tensor+=[w2i['<end>']]*(maxlength-len(in_small_tensor))
        in_tensor.append(in_small_tensor)
    return in_tensor
in_tensor=get_tensor(word2id_dict,data_in_nospace,couplet_maxlen)
out_tensor=get_tensor(word2id_dict,data_out_nospace,couplet_maxlen)
print(len(in_tensor),len(out_tensor))
print(len(in_tensor[0]),len(out_tensor[0]))
print(len(in_tensor[1]),len(out_tensor[1]))
print(in_tensor[0],out_tensor[0])
744915 744915
20 20
9 9
[0, 255, 68, 42, 554, 2, 2462, 63, 823, 923, 775, 252, 243, 2, 1098, 135, 13, 35, 102, 1] [0, 76, 339, 152, 1697, 2, 199, 461, 372, 1242, 861, 975, 273, 2, 334, 563, 723, 109, 421, 1]
Convert to NumPy arrays so the tensors carry a shape attribute
in_tensor=np.array(in_tensor)
out_tensor=np.array(out_tensor)
1.2 Split into training, validation, and test sets with a fixed 8:1:1 ratio
595933 train / 74491 val (up to index 670424) / 74491 test (up to index 744915)
train_in_tensor=in_tensor[:595933]
val_in_tensor=in_tensor[595933:670424]
test_in_tensor=in_tensor[670424:]
train_out_tensor=out_tensor[:595933]
val_out_tensor=out_tensor[595933:670424]
test_out_tensor=out_tensor[670424:]
print(len(train_in_tensor),len(test_in_tensor),len(val_in_tensor))
595933 74491 74491
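For reference, the hardcoded boundaries can also be derived from the ratio (a sketch of my own; integer flooring lands one sample off the 595933 used above):
total = len(in_tensor)                  # 744915
train_end = int(total * 0.8)            # 595932
val_end = train_end + int(total * 0.1)  # 670423
print(train_end, val_end, total)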
1.3 Wrap the data into a dataset ready for training
# 1. Inherit from paddle.io.Dataset
class Mydataset(paddle.io.Dataset):
    # 2. Constructor: store the paired data
    def __init__(self,first,second):
        super(Mydataset,self).__init__()
        self.first=first
        self.second=second
    # 3. Implement __getitem__: given an index, return a single sample (input, label)
    def __getitem__(self,index):
        return self.first[index],self.second[index]
    # 4. Implement __len__: return the total number of samples
    def __len__(self):
        return self.first.shape[0]
train_tensor=Mydataset(train_in_tensor,train_out_tensor)
val_tensor=Mydataset(val_in_tensor,val_out_tensor)
test_tensor=Mydataset(test_in_tensor,test_out_tensor)
for a,b in train_tensor:
print(a,b)
break
[0, 255, 68, 42, 554, 2, 2462, 63, 823, 923, 775, 252, 243, 2, 1098, 135, 13, 35, 102, 1] [0, 76, 339, 152, 1697, 2, 199, 461, 372, 1242, 861, 975, 273, 2, 334, 563, 723, 109, 421, 1]
Data loading
BATCH_SIZE=64
padid=word2id_dict['<end>']
def myPad(inputs,padid):
    # Pad every sequence in the batch to the batch's max length with padid
    arrs = [np.asarray(ele) for ele in inputs]
    original_length = np.array([ele.shape[0] for ele in arrs])
    max_size = max(original_length)
    result=[]
    for i in range(len(arrs)):
        if len(arrs[i])<max_size:
            result.append(list(arrs[i])+[padid]*(max_size-len(arrs[i])))
        else:
            result.append(arrs[i])
    result=np.asarray(result)
    print(type(result))
    print(len(result))
    return result,original_length
aa=[inputs[0] for inputs in train_tensor]
src,src_length=myPad(aa,1)
print(src_length.shape)
print(src.shape)
print(len(src[0]))
src,src_length=paddlenlp.data.Pad(pad_val=padid,ret_length=True)([inputsub[0] for inputsub in train_tensor])
print(src.shape,src_length.shape)
def prepare_input(inputs,padid):
    # Pad sources and targets to the batch max length, keeping the true lengths
    src,src_length=paddlenlp.data.Pad(pad_val=padid,ret_length=True)([inputsub[0] for inputsub in inputs])
    trg,trg_length=paddlenlp.data.Pad(pad_val=padid,ret_length=True)([inputsub[1] for inputsub in inputs])
    # The mask is 1 at real tokens of the shifted target, 0 at padding
    trg_mask =(trg[:,:-1]!=padid).astype(paddle.get_default_dtype())
    # Teacher forcing: decoder input is trg[:,:-1], prediction target is trg[:,1:]
    return src,src_length,trg[:,:-1],trg[:,1:,np.newaxis],trg_mask
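A toy illustration of that teacher-forcing shift (the ids below are made up for the example; 0 stands for <start>, 1 for <end>/padding):
trg_demo = np.array([[0, 76, 339, 1, 1]])
print(trg_demo[:, :-1])   # decoder input : [[0 76 339 1]]
print(trg_demo[:, 1:])    # decoder target: [[76 339 1 1]]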
def create_data_loader(dataset):
data_loader=paddle.io.DataLoader(dataset,batch_sampler=None,batch_size=BATCH_SIZE,collate_fn=partial(prepare_input, padid=padid))
return data_loader
# val_loader=paddle.io.DataLoader(val_tensor,batch_size=BATCH_SIZE,batch_sampler=None,collate_fn=partial(prepare_input, padid=padid))
# test_loader=paddle.io.DataLoader(test_tensor,batch_size=BATCH_SIZE,batch_sampler=None,collate_fn=partial(prepare_input, padid=padid))
train_loader=create_data_loader(train_tensor)
val_loader=create_data_loader(val_tensor)
test_loader=create_data_loader(test_tensor)
src=prepare_input(train_tensor,1)
for i in src:
print(i.shape)
(595933, 34)
(595933,)
(595933, 33)
(595933, 33, 1)
(595933, 33)
j=0
for i in train_loader:
print(len(i))
for ind,each in enumerate(i):
#print(ind,each.shape,each)
print(ind,each.shape)
j+=1
if j==2:
break
5
0 [64, 29]
1 [64]
2 [64, 28]
3 [64, 28, 1]
4 [64, 28]
5
0 [64, 28]
1 [64]
2 [64, 27]
3 [64, 27, 1]
4 [64, 27]
for i in train_loader:
x,x_length,y,_,_= i
break
print(x)
Tensor(shape=[64, 29], dtype=int64, place=CPUPlace, stop_gradient=True,
[[0 , 255 , 68 , ..., 1 , 1 , 1 ],
[0 , 3 , 542 , ..., 1 , 1 , 1 ],
[0 , 8 , 27 , ..., 1 , 1 , 1 ],
...,
[0 , 190 , 3 , ..., 1 , 1 , 1 ],
[0 , 1128, 1751, ..., 1 , 1 , 1 ],
[0 , 102 , 32 , ..., 1 , 1 , 1 ]])
2. Building the Network
Mainly based on the official project: https://aistudio.baidu.com/aistudio/projectdetail/1321118?shared=1
2.1 Encoder
class Encoder(paddle.nn.Layer):
    def __init__(self,vocab_size,embedding_dim,hidden_size,num_layers):
        super(Encoder,self).__init__()
        self.embedding=paddle.nn.Embedding(vocab_size,embedding_dim)
        self.lstm=paddle.nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 dropout=0.2 if num_layers>1 else 0)
    # src_length has shape [batch_size]; it stops the LSTM from updating state
    # at time steps beyond each sequence's true length, i.e. the padding positions
    def forward(self,src,src_length):
        inputs=self.embedding(src) # [batch_size,time_steps,embedding_dim]
        encoder_out,encoder_state=self.lstm(inputs,sequence_length=src_length) # out: [batch_size,time_steps,hidden_size]; state: ([num_layers*1,batch_size,hidden_size],[num_layers*1,batch_size,hidden_size])
        # encoder_out,encoder_state=self.lstm(inputs)
        return encoder_out,encoder_state
encoder=Encoder(word_size,256,128,2)
#paddle.summary(encoder,[(64,18),(64)],dtypes='int64')
out,state=encoder(x,x_length)
print(out.shape)
print(state)
2.2 Attention Layer
class AttentionLayer(paddle.nn.Layer):
    def __init__(self,hidden_size):
        super(AttentionLayer,self).__init__()
        self.attn1=paddle.nn.Linear(hidden_size,hidden_size)
        self.attn2=paddle.nn.Linear(hidden_size+hidden_size,hidden_size)
    def forward(self,decoder_hidden_h,encoder_output,encoder_padding_mask):
        encoder_output=self.attn1(encoder_output) # [batch_size,time_steps,hidden_size]
        # decoder_hidden_h has shape [batch_size,hidden_size]; it is h_t in the LSTM equations.
        # after unsqueeze: [batch_size,1,hidden_size]
        # transpose_y=True transposes the last two dims: [batch_size,hidden_size,time_steps]
        # after matmul: [batch_size,1,time_steps]
        a=paddle.unsqueeze(decoder_hidden_h,[1])
        # print(a.shape)
        # print(encoder_output.shape)
        attn_scores=paddle.matmul(a,encoder_output,transpose_y=True)
        # Masking inside attention: add a very large negative number (-1e9) at padding positions
        if encoder_padding_mask is not None:
            # encoder_padding_mask has shape [batch_size,1,time_steps]
            attn_scores=paddle.add(attn_scores,encoder_padding_mask)
        # softmax over the last dim (axis=-1 by default); shape unchanged
        attn_scores=paddle.nn.functional.softmax(attn_scores)
        # [batch_size,1,time_steps]*[batch_size,time_steps,hidden_size] --> [batch_size,1,hidden_size]
        # after squeeze: [batch_size,hidden_size]
        attn_out=paddle.squeeze(paddle.matmul(attn_scores,encoder_output),[1])
        # after concat: [batch_size,hidden_size+hidden_size]
        attn_out=paddle.concat([attn_out,decoder_hidden_h],1)
        # final result: [batch_size,hidden_size]
        attn_out=self.attn2(attn_out)
        return attn_out
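A quick shape check of the attention layer with dummy tensors (my own sanity check, not part of the original run):
attn_demo = AttentionLayer(hidden_size=128)
h_demo = paddle.randn([4, 128])          # decoder hidden state h_t
enc_demo = paddle.randn([4, 20, 128])    # encoder outputs over 20 time steps
print(attn_demo(h_demo, enc_demo, None).shape)   # expected: [4, 128]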
2.3 Decoder Cell
class DecoderCell(paddle.nn.RNNCellBase):
    def __init__(self,num_layers,embedding_dim,hidden_size):
        super(DecoderCell,self).__init__()
        self.dropout=paddle.nn.Dropout(0.2)
        self.lstmcells=paddle.nn.LayerList([paddle.nn.LSTMCell(
            input_size=embedding_dim+hidden_size if i==0 else hidden_size,
            hidden_size=hidden_size
        ) for i in range(num_layers)])
        self.attention=AttentionLayer(hidden_size)
    def forward(self,decoder_input,decoder_initial_states,encoder_out,encoder_padding_mask=None):
        # forward runs sequence_len times; each step's decoder_input is [batch_size,embedding_dim]
        # Unpack the states: [encoder_final_states, decoder_init_states]
        # encoder_final_states: [num_layers,batch_size,hidden_size] ???
        # decoder_init_states: [] ???
        encoder_final_states,decoder_init_states=decoder_initial_states
        #num_layers=len(encoder_final_states[0])
        #decoder_init_states=lstm_init_state
        # ???
        new_lstm_states=[]
        # decoder_input: [batch_size,embedding_dim]
        # print("decodercell ",decoder_input.shape)
        inputs=paddle.concat([decoder_input,decoder_init_states],1)
        # print("shape after concat",inputs.shape)
        for i ,lstm_cell in enumerate(self.lstmcells):
            # inputs has shape [batch_size,input_size]
            state_h,new_lstm_state=lstm_cell(inputs,encoder_final_states[i])
            inputs=self.dropout(state_h)
            new_lstm_states.append(new_lstm_state)
        state_h=self.attention(inputs,encoder_out,encoder_padding_mask)
        # print(state_h.shape)
        return state_h,[new_lstm_states,state_h]
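The snippet below is a quick sanity check of how paddle.concat joins tensors along axis 1, which the concatenation in the cell above relies on: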
in1 = np.array([[[1, 2, 3],
[4, 5, 6]],[[1, 2, 3],
[4, 5, 6]]])
in2 = np.array([[[11, 12, 13],
[14, 15, 16]],[[11, 12, 13],
[14, 15, 16]]])
in3 = np.array([[21, 22],
[23, 24]])
in4 = np.array([[21, 22],
[23, 24]])
x1 = paddle.to_tensor(in1)
x2 = paddle.to_tensor(in2)
x3 = paddle.to_tensor(in3)
x4 = paddle.to_tensor(in4)
out1 = paddle.concat([x1, x2],1)
out2 = paddle.concat([x3, x4],1)
print(out1)
print(out2)
2.4 Decoder
The decoder is composed of an embedding layer + the decoder cell + a linear output layer.
class Decoder(paddle.nn.Layer):
    def __init__(self,vocab_size,embedding_dim,hidden_size,num_layers):
        super(Decoder,self).__init__()
        self.embedding=paddle.nn.Embedding(vocab_size,embedding_dim)
        self.lstm_attention=paddle.nn.RNN(DecoderCell(num_layers,embedding_dim,hidden_size))
        self.final=paddle.nn.Linear(hidden_size,vocab_size)
    def forward(self,trg, decoder_initial_states,encoder_output,encoder_padding_mask):
        # trg has shape [batch_size,sequence_length]
        # after embedding: [batch_size,sequence_length,embedding_dim]
        inputs=self.embedding(trg)
        # print("input shape after embedding",inputs.shape)
        # decoder_out: [batch_size,sequence_len,hidden_size]
        decoder_out,_ = self.lstm_attention(inputs,
                                            initial_states=decoder_initial_states,
                                            encoder_out=encoder_output,
                                            encoder_padding_mask=encoder_padding_mask)
        # predict: [batch_size,sequence_len,word_size]
        predict=self.final(decoder_out)
        # print("final shape",decoder_out.shape)
        return predict
2.5 Assembling the Seq2Seq Model
class Seq2Seq(paddle.nn.Layer):
    def __init__(self, vocab_size,embedding_dim,hidden_size,num_layers,eos_id):
        super(Seq2Seq,self).__init__()
        self.hidden_size=hidden_size
        self.eos_id=eos_id
        self.num_layers=num_layers
        self.INF= 1e9
        self.encoder=Encoder(vocab_size,embedding_dim,hidden_size,num_layers)
        self.decoder=Decoder(vocab_size,embedding_dim,hidden_size,num_layers)
    def forward(self,src,src_length,trg):
        # encoder_output has shape [batch_size,sequence_len,hidden_size]
        # encoder_final_state is a tuple ([num_layers*1,batch_size,hidden_size],[num_layers*1,batch_size,hidden_size])
        encoder_output,encoder_final_state=self.encoder(src,src_length)
        encoder_final_states=[(encoder_final_state[0][i],encoder_final_state[1][i]) for i in range(self.num_layers)]
        #print(encoder_final_states[0])
        # [batch_size,hidden_size], initialized to zeros
        #lstm_init_state= self.decoder.lstm_attention.cell.get_initial_states(batch_ref=encoder_output,shape=[self.hidden_size])
        decoder_initial_states=[encoder_final_states,
                                self.decoder.lstm_attention.cell.get_initial_states(batch_ref=encoder_output,shape=[self.hidden_size])]
        # src_mask is 1 at real tokens and 0 at padding, so encoder_mask
        # is 0 at real tokens and -1e9 at padding positions
        src_mask=(src!=self.eos_id).astype(paddle.get_default_dtype())
        encoder_mask=(src_mask-1)*self.INF
        encoder_padding_mask=paddle.unsqueeze(encoder_mask,[1])
        predict=self.decoder(trg,decoder_initial_states,encoder_output,encoder_padding_mask)
        return predict
2.6 Custom Cross-Entropy Loss and Hyperparameters
class CrossEntropy(paddle.nn.Layer):
    def __init__(self):
        super(CrossEntropy,self).__init__()
    def forward(self,pre,real,trg_mask):
        # Returns the same dtype as pre; all dims match pre except the class
        # axis (default -1), which collapses to size 1
        # logits=pre: [batch_size,sequence_len,word_size] -> cost: [batch_size,sequence_len,1]
        # soft_label defaults to False; label=real: [batch_size,sequence_len,1]
        cost=paddle.nn.functional.softmax_with_cross_entropy(logits=pre,label=real)
        # Drop the size-1 dim at axis=2
        # result shape: [batch_size,sequence_len]
        cost=paddle.squeeze(cost,axis=[2])
        # trg_mask has shape [batch_size,sequence_len]
        # * is elementwise multiplication; result: [batch_size,sequence_len]
        masked_cost=cost*trg_mask
        # paddle.mean over axis 0 gives [sequence_len];
        # paddle.sum then reduces that to a scalar
        return paddle.sum(paddle.mean(masked_cost,axis=[0]))
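A quick shape check of the loss with random dummy data (my own sketch, not from the original run):
loss_fn = CrossEntropy()
logits_demo = paddle.randn([2, 5, 9017])                    # [batch, seq_len, vocab]
labels_demo = paddle.randint(0, 9017, [2, 5, 1])            # int64 class ids
mask_demo = paddle.ones([2, 5], dtype=paddle.get_default_dtype())
print(loss_fn(logits_demo, labels_demo, mask_demo))         # scalar loss tensor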
epochs=20
eos_id=word2id_dict['<end>']
num_layers=2
dropout_rate=0.2
hidden_size=128
embedding_dim=256
max_grad_norm=5
lr=0.001
log_freq=200
model_path='./output/couplets_models'
s2s=Seq2Seq(word_size,embedding_dim,hidden_size,num_layers,eos_id)
# pre=s2s(x,x_length,y)
# pre
# paddle.summary(s2s,[(64,18),(64,17)],dtypes='int64')
model=paddle.Model(s2s)
# model.parameters() returns a list of all model parameters
optimizer=paddle.optimizer.Adam(learning_rate=lr,parameters=model.parameters())
# Perplexity metric
ppl_metric=paddlenlp.metrics.Perplexity()
model.prepare(optimizer,CrossEntropy(),ppl_metric)
2.7 Train and Save
eval_freq: evaluate once every this many epochs
save_freq: save the model once every this many epochs
model.fit(train_data=train_loader,
eval_data=val_loader,
epochs=epochs,
eval_freq=1,
save_freq=2,
save_dir=model_path,
log_freq=log_freq,
verbose=2,
          callbacks=[paddle.callbacks.VisualDL('./log')])
Save the model for inference
model.save("./infer_model/infer_model",False)
# Save the network model, for visualization
# path = "./train_model/train_model"
# paddle.jit.save(s2s, path,input_spec=[InputSpec(shape=[BATCH_SIZE,x.shape[1]],dtype='int64'),
# InputSpec(shape=[BATCH_SIZE], dtype='int64'),
# InputSpec(shape=[BATCH_SIZE,x.shape[1]], dtype='int64')])
3. Model Inference
Note: tkinter probably cannot be used inside AI Studio; download the code and run it locally.
3.1 Define the Inference Model
class Seq2SeqInfer(Seq2Seq):
    def __init__(self,word_size,embedding_dim,hidden_size,num_layers,bos_id,eos_id,beam_size,max_out_len=couplet_maxlen):
        self.bos_id=bos_id
        self.beam_size=beam_size
        self.max_out_len=max_out_len
        self.num_layers=num_layers
        super(Seq2SeqInfer,self).__init__(word_size,embedding_dim,hidden_size,num_layers,eos_id)
        self.beam_search_decoder=paddle.nn.BeamSearchDecoder(
            self.decoder.lstm_attention.cell,
            start_token=bos_id,
            end_token=eos_id,
            beam_size=beam_size,
            embedding_fn=self.decoder.embedding,
            output_fn=self.decoder.final)
    def forward(self,src,src_length):
        encoder_output,encoder_states=self.encoder(src,src_length)
        encoder_final_state=[(encoder_states[0][i],encoder_states[1][i]) for i in range(self.num_layers)]
        # Initialize the decoder's hidden states
        decoder_initial_states=[encoder_final_state,
                                self.decoder.lstm_attention.cell.get_initial_states(batch_ref=encoder_output,shape=[self.hidden_size])]
        src_mask=(src!=self.eos_id).astype(paddle.get_default_dtype())
        encoder_padding_mask=(src_mask-1.0)*self.INF
        encoder_padding_mask=paddle.unsqueeze(encoder_padding_mask,[1])
        # Tile the batch dimension for beam search
        encoder_out=paddle.nn.BeamSearchDecoder.tile_beam_merge_with_batch(encoder_output,self.beam_size)
        encoder_padding_mask=paddle.nn.BeamSearchDecoder.tile_beam_merge_with_batch(encoder_padding_mask,self.beam_size)
        seq_output,_=paddle.nn.dynamic_decode(decoder=self.beam_search_decoder,
                                              inits=decoder_initial_states,
                                              max_step_num=self.max_out_len,
                                              encoder_out=encoder_out,
                                              encoder_padding_mask=encoder_padding_mask)
        return seq_output
def pre_process(seq,bos_idx,eos_idx):
    #print(bos_idx,eos_idx)
    # position of the end token
    eos_pos=len(seq)-1
    for i ,idx in enumerate(seq):
        #print(i,idx[0])
        if idx==eos_idx: # stop once the end token is reached
            eos_pos=i
            break
    seq=[idx[0] for idx in seq[:eos_pos] if (idx !=bos_idx) ]
    return seq
3.2 Inference Hyperparameters
Mainly the beam width for beam search
beam_size=10
bos_id=word2id_dict['<start>']
eos_id=word2id_dict['<end>']
max_out_len=couplet_maxlen
print(bos_id)
0
3.3 Initialize the inference model, loading its parameters from file
infer_model=paddle.Model(Seq2SeqInfer(word_size,embedding_dim,hidden_size,num_layers,bos_id,eos_id,beam_size,max_out_len))
infer_model.prepare()
infer_model.load('./trained_model/18')
3.4 Visualization with tkinter
#insrc='离离原上草'
# 窗前明月光
# 杀虫喷雾器 ;立马报春风
# 色雅味佳,天堂美膳 ;风清气爽,世界佳肴
def Chinesetoid(insrc,word2id_dict,couplet_maxlen=couplet_maxlen,start=bos_id,padid=eos_id):
    # Convert an input string to ids, add <start>/<end>, and pad to couplet_maxlen
    result=[start,]
    for ch in insrc:
        result.append(word2id_dict[ch])
    result.append(padid)
    result_len=len(result)
    if len(result)<couplet_maxlen:
        result+=[padid]*(couplet_maxlen-len(result))
    return paddle.unsqueeze(paddle.to_tensor(result),axis=0) ,paddle.to_tensor(result_len)
# tensor=Chinesetoid(insrc,word2id_dict)
# print(tensor)
def get_second(inputs,id2word_dict):
    # Run beam-search inference; take the first batch element's finished sequences
    finished_seq=infer_model.predict_batch(inputs=list(inputs))[0][0]
    #finished_seq=finished_seq[0][0]
    #print(type(finished_seq))
    input_re=inputs[0][0][1:]
    input_re=paddle.tolist(input_re)
    #print(input_re)
    in_input=[]
    for subre in input_re:
        if subre==eos_id:
            break
        in_input.append(subre)
    #print(in_input)
    result=[]
    for subseq in finished_seq:
        #print(subseq)
        # At each step, take the id most common across the beams
        resultid=Counter(list(subseq)).most_common(1)[0][0]
        if resultid==eos_id:
            break
        result.append(resultid)
    #print(result)
    word_list_f=[id2word_dict[id] for id in in_input]
    word_list_s=[id2word_dict[id] for id in result]
    sequence='First line: '+"".join(word_list_f)+'\t \nSecond line: '+"".join(word_list_s)+"\n"
    #print(sequence)
    return sequence
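Since tkinter is unavailable on AI Studio, the same inference path can be exercised from the console (a minimal sketch using the functions above; the sample input is one of the commented examples):
upper = '离离原上草'
tensor = Chinesetoid(upper, word2id_dict)
print(get_second(tensor, id2word_dict))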
import tkinter as tk # import Tkinter before using it
# Step 1: instantiate the object and create the window
window = tk.Tk()
# Step 2: give the window a title
window.title('Simple Couplet Generator')
# Step 3: set the window size (width x height)
window.geometry('600x400') # the separator here is a lowercase x
# Step 4: create the input text box and place it
e = tk.Text(window,width=57,height=5,font=('楷体', 15), show = None) # shown as plain text
e.place(x=10,y=10)
# ee = tk.Text(window,height=5, show = None) # shown as plain text
# ee.place(x=10,y=200)
t = tk.Text(window,width=57, height=5,font=('楷体', 15))
t.place(x=10,y=220)
def insert_point():
    var = e.get('0.0','end')
    var=var.strip()
    #print('hhhh',var,type(var))
    tensor=Chinesetoid(var,word2id_dict)
    #ee.insert('end',str(tensor))
    second=get_second(tensor,id2word_dict)
    t.delete('1.0','end')
    t.insert('end',second)
def clear_input():
    e.delete('1.0','end')
b1 = tk.Button(window, text='Click me', width=10,
               height=2, command=insert_point)
b1.place(x=200,y=150)
b2 = tk.Button(window, text='Clear input~', width=10,
               height=2, command=clear_input)
b2.place(x=300,y=150)
l = tk.Label(window, text='About 9,000 characters are recognized; rare or obscure characters may cause errors.', font=('Arial', 10), height=2)
l.pack(side='bottom')
window.mainloop()
Project Summary
- Cleaned the dataset, removing 14 problematic entries
- Built the network on the Seq2Seq model
- Visualized with Tkinter
Feedback and corrections are welcome!