I am having trouble getting the Persona-Dialogue-Generation model (from GitHub) to run. It seems that the model call returns only one output, but my code unpacks two. How should I modify the code?
The problematic code is:
self.transformer_module = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt',num_special_tokens=special_token_len)
and
lm_logits, hidden_states = self.transformer_module(input_seq, None, dis_seq)
which raises:
ValueError: too many values to unpack (expected 2)
Full code:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_pretrained_bert import OpenAIGPTLMHeadModel
class Gpt2SeqModel(nn.Module):
    """Sequence model built on a pretrained OpenAI GPT language-model head.

    Source and target are concatenated into one sentence, separated by the
    start token; the source acts as context and the target is the
    continuation to be predicted.
    """

    def __init__(self, opt, vocab_size, pad_idx, start_idx, end_idx, special_token_len, dict, longest_label=1,
                 length_penalty=1.0, diversity_groups=1, diversity_coef=0.2, annealing_topk=None, annealing=0,
                 sample=False, temperature=0.7):
        """Load the pretrained transformer and store decoding options.

        Args:
            opt: Option mapping; read with ``opt.get`` for ``beam_size``,
                ``rank_candidates``, ``encoder_turn_use``, ``encoder_dis_use``,
                ``decode_max_seq_len`` and ``top_k``.
            vocab_size: Number of extra (special) vocabulary entries; it is
                added to the constant 40478 (presumably the pretrained
                'openai-gpt' vocabulary size -- TODO confirm).
            pad_idx: Padding token id.
            start_idx: Start token id, used to separate source and target.
            end_idx: End token id.
            special_token_len: Number of special tokens; 30 more are added
                for the distance / prediction token types.
            dict: Dictionary object, stored as ``self.dict``.
            longest_label: Initial cap on generated length (also bounded by
                ``decode_max_seq_len``).
            length_penalty: Stored as ``self.length_penalty_coef``.
            diversity_groups: Stored as ``self.diversity_groups``.
            diversity_coef: Stored as ``self.diversity_coef``.
            annealing_topk: Stored as ``self.annealing_topk``.
            annealing: Stored as ``self.annealing``.
            sample: Accepted but not used in the visible code.
            temperature: Stored as ``self.temperature``.
        """
        super().__init__()
        # original vocab size plus special vocab
        self.vocab_size = vocab_size + 40478
        self.token_type_dict = {}
        # max is 30
        for i in range(29):
            self.token_type_dict['dis' + str(i)] = self.vocab_size + i
        # pred for prediction turn embedding
        self.token_type_dict['pred'] = self.vocab_size + 29
        # the remaining 30 is the distance size
        special_token_len += 30
        self.vocab_size += 29
        # regard input and output as one sentence, given the input as context, generate the next sentence.
        self.transformer_module = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt',
                                                                       num_special_tokens=special_token_len)
        self.pad_idx = pad_idx
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.register_buffer('start_tensor', torch.LongTensor([start_idx]))
        self.register_buffer('pred_turn_tensor', torch.LongTensor([self.token_type_dict['pred']]))
        # default beam equal to 1
        self.beam_size = opt.get('beam_size', 1)
        self.rank = opt.get('rank_candidates', False)
        self.use_turn = opt.get('encoder_turn_use', False)
        self.use_dis = opt.get('encoder_dis_use', False)
        # longest label
        self.longest_label = min(longest_label, opt.get('decode_max_seq_len', 100))
        self.length_penalty_coef = length_penalty
        self.diversity_groups = diversity_groups
        self.diversity_coef = diversity_coef
        self.annealing_topk = annealing_topk
        self.annealing = annealing
        self.temperature = temperature
        self.topk = opt.get('top_k', 0)
        self.dict = dict
        self.no_repeat_ngram_size = 2
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(768, 2, bias=False)
        nn.init.normal_(self.linear.weight, std=0.02)

    def _build_token_type_ids(self, seq, src_seq_len, src_seq_dis):
        """Return token-type ids shaped like ``seq``, or None when distances are unused.

        The first ``src_seq_len`` columns map each distance value through
        ``token_type_dict['dis<value>']``; every later column gets the
        ``'pred'`` id. A distance with no matching key raises ``KeyError``.
        """
        if not self.use_dis or src_seq_dis is None:
            return None
        n_rows, n_cols = seq.size()
        pred_id = self.token_type_dict['pred']
        n_src = min(src_seq_len, n_cols)
        rows = []
        for row_ind in range(n_rows):
            # map str to id
            row = [self.token_type_dict['dis' + str(src_seq_dis[row_ind, ind])] for ind in range(n_src)]
            row.extend([pred_id] * (n_cols - n_src))
            rows.append(row)
        # view() keeps the 2-D shape even for an empty batch
        return torch.tensor(rows, device=seq.device, dtype=torch.long).view(n_rows, n_cols)

    def forward(self, src_seq, src_seq_turn=None, src_seq_dis=None, tgt_seq=None, tgt_seq_turn=None, cands=None,
                valid_cands=None, prev_enc=None, rank_during_training=False, sampling=False, sampling_cands=None):
        """Run the transformer over source (and target, when given).

        With ``tgt_seq`` the source, start token and target are concatenated
        and scored in one pass (training); without it only the source plus
        start token is prepared as decoding context (evaluation).

        Args:
            src_seq: 2-D tensor of source token ids (batch, src_len).
            src_seq_dis: Optional per-token distance values, converted with
                ``np.array`` and indexed as ``[row, col]``.
            tgt_seq: Optional 2-D tensor of target token ids.
            Other parameters are accepted but not used in the visible code.
        """
        # concat src_seq and tgt_seq as one sentence, use start token to separate them.
        if tgt_seq is not None:
            # keep track of longest label we've ever seen
            # we'll never produce longer ones than that during prediction
            self.longest_label = max(self.longest_label, tgt_seq.size(1))
        batch_size, src_seq_len = src_seq.size()
        if src_seq_dis is not None:
            src_seq_dis = np.array(src_seq_dis)
        # evaluation return none scores
        scores = None
        negative_score = None
        start_tensor = self.start_tensor.detach().expand(batch_size, 1)
        # whether training or evaluation
        if tgt_seq is not None:
            input_seq = torch.cat([src_seq, start_tensor, tgt_seq], dim=1)
            # TODO: manually construct the position ids for input & output
            dis_seq = self._build_token_type_ids(input_seq, src_seq_len, src_seq_dis)
            # The LM-head model's forward returns only the logits, so unpacking
            # two values from it raises "too many values to unpack". Call the
            # inner transformer and LM head separately to keep the hidden states.
            hidden_states = self.transformer_module.transformer(input_seq, None, dis_seq)
            lm_logits = self.transformer_module.lm_head(hidden_states)
            # lm labels should mask the source sentence language model
            shift_logits = lm_logits[..., src_seq_len:-1, :].contiguous()
            # predict answers
            scores = shift_logits
            # index of the last non-pad target position in the concatenated input
            pos_seq_len = src_seq_len + tgt_seq.ne(self.pad_idx).sum(dim=1, keepdim=True)
            pos_seq_len_expand = pos_seq_len.unsqueeze(dim=2).repeat(1, 1, hidden_states.size(-1))
            last_state = hidden_states.gather(dim=1, index=pos_seq_len_expand).squeeze(dim=1)
            positive_score = self.linear(self.dropout(last_state))
            predictions = shift_logits.argmax(dim=-1)
        else:
            prior_context = torch.cat([src_seq, start_tensor], dim=1)
            prior_dis = self._build_token_type_ids(prior_context, src_seq_len, src_seq_dis)
        # NOTE(review): the posted excerpt ends here; the decoding step that
        # consumes prior_context / prior_dis and the return statement are not shown.
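It looks like `OpenAIGPTLMHeadModel` only returns `lm_logits`, so just remove `hidden_states` from the model outputs:
lm_logits = self.transformer_module(input_seq, None, dis_seq)
Note that the code in the question still uses `hidden_states` afterwards; if you need it, call the inner modules directly: `hidden_states = self.transformer_module.transformer(input_seq, None, dis_seq)` followed by `lm_logits = self.transformer_module.lm_head(hidden_states)`.
For reference, this is the `forward` method of `OpenAIGPTLMHeadModel`: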
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
    """Compute language-model logits, or the LM loss when labels are given.

    Args:
        input_ids: Token ids passed to ``self.transformer``.
        position_ids: Optional position ids forwarded to the transformer.
        token_type_ids: Optional token-type ids forwarded to the transformer.
        lm_labels: Optional labels; positions equal to -1 are ignored by the loss.

    Returns:
        The output of ``self.lm_head`` when ``lm_labels`` is None; otherwise
        the cross-entropy loss between each position's logits and the label
        of the following position.
    """
    features = self.transformer(input_ids, position_ids, token_type_ids)
    lm_logits = self.lm_head(features)
    if lm_labels is None:
        return lm_logits

    # Position n predicts token n + 1: drop the last logit and the first label.
    predicted = lm_logits[..., :-1, :].contiguous()
    targets = lm_labels[..., 1:].contiguous()
    # Flatten to (tokens, vocab) and (tokens,) for the loss.
    vocab_dim = predicted.size(-1)
    criterion = CrossEntropyLoss(ignore_index=-1)
    return criterion(predicted.view(-1, vocab_dim), targets.view(-1))
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.