
How to use pre-trained models for text classification? Comparing a fine-tuned model with a pre-trained model without fine-tuning

I want to know how much the fine-tuned model improves over the model without fine-tuning, i.e. I want to compare the performance of the pre-trained model (BERT) and the model obtained by fine-tuning it (fine-tuned BERT) on text classification. I know how to fine-tune BERT for text classification, but I am not clear on how to use BERT directly for classification. What should I do? Below is my fine-tuning code; how should I rewrite it to use the pre-trained model directly?

    <!-- language: python -->

from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn 
import torch.utils.data as Data 
import torch.optim as optim 
from sklearn.metrics import accuracy_score,matthews_corrcoef
from sklearn.model_selection import train_test_split 

tokenizer_model = BertTokenizer.from_pretrained('bert-base-uncased')
pretrained_model = BertModel.from_pretrained("bert-base-uncased")

class MyDataSet(Data.Dataset): 
    def __init__ (self, data, label):
        self.data = data
        self.label = label
        self.tokenizer = tokenizer_model

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.label[idx]
        inputs = self.tokenizer(text, return_tensors="pt",padding='max_length',max_length=256,truncation=True)
        input_ids = inputs.input_ids.squeeze(0)
        #token_type_ids = inputs.token_type_ids.squeeze(0)
        attention_mask = inputs.attention_mask.squeeze(0)
        #return input_ids, token_type_ids, attention_mask, label
        return input_ids, attention_mask, label

    def __len__(self):
        return len(self.data)

data,label = [],[]
# Each line is "<label>\t<text>"; LOW/MEDIUM/other are mapped to 0/1/2.
with open(path) as f:
    for line in f.readlines():
        a, b = line.strip().split('\t')
        data.append(b)
        if a == 'LOW':
            label.append('0')
        elif a == 'MEDIUM':
            label.append('1')
        else:
            label.append('2')

label = [int(i) for i in label]
train_x,test_x,train_y,test_y = train_test_split(data, label, test_size = 0.15,random_state = 32, stratify=label)
dataset_train = MyDataSet(train_x,train_y)
dataset_test = MyDataSet(test_x,test_y)
dataloader_train = Data.DataLoader(dataset_train, batch_size=128, shuffle=True,num_workers=32,pin_memory=True)
dataloader_test = Data.DataLoader(dataset_test, batch_size=128, shuffle=True,num_workers=32,pin_memory=True)

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.bert = pretrained_model
        self.linear = nn.Linear(768,3)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask).pooler_output
        print(output.shape)  # (batch_size, 768)
        output = self.linear(output)
        return output

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyModel()
if torch.cuda.device_count() > 1:
    print("Use", torch.cuda.device_count(), 'gpus')
    model = nn.DataParallel(model)
model = model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

for epoch in range(10):
    for input_ids,attention_mask,label in dataloader_train:
        train_input_ids,train_attention_mask,train_label = input_ids.to(device),attention_mask.to(device),label.to(device)       
        model.train()       
        pred = model(train_input_ids,train_attention_mask)
        print('epoch:',epoch)
        #print('pred,label:',pred,label)
        loss = loss_fn(pred, train_label)
        print('Loss:',loss.item())
        pred = torch.argmax(pred,dim=1)
        acc = (pred == train_label).float().mean()
        print('acc:',acc)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        savename_train = str(path) +'_' + str(name) + '_train' + '.txt'
        with open(savename_train,'a') as f:
            f.write(str(epoch)+'\t'+str(loss.item())+'\t'+str(acc.item())+'\n')   

    model.eval()
    with torch.no_grad():
        for input_ids,attention_mask,label in dataloader_test:
            validation_input_ids,validation_attention_mask,validation_label = input_ids.to(device),attention_mask.to(device),label.to(device)  

            pred = model(validation_input_ids,validation_attention_mask)
            loss = loss_fn(pred, validation_label)
            pred = torch.argmax(pred, dim=1)
            acc = (pred == validation_label).float().mean()     
            print('acc:',acc)
            savename_eval = str(path) +'_' + str(name) + '_val' + '.txt'
            with open(savename_eval,'a') as f:
                f.write(str(epoch)+'\t'+str(loss.item())+'\t'+str(acc.item())+'\n') 

 

What you are trying to do does not really make sense. The plain BERT model was pre-trained with a combination of the masked language modelling objective and next sentence prediction. So, out of the box all it can do is predict masked tokens and predict whether a given pair of sentences could appear next to each other in a text. Most importantly, it can provide embeddings.
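
For example, the only thing the raw checkpoint can do directly is its pre-training task, such as filling in a masked token. A minimal sketch, assuming the `transformers` fill-mask pipeline:

<!-- language: python -->

from transformers import pipeline

# The raw bert-base-uncased checkpoint only knows its pre-training tasks,
# e.g. predicting a [MASK] token -- it has no notion of your three classes.
fill_mask = pipeline('fill-mask', model='bert-base-uncased')
print(fill_mask('The movie was absolutely [MASK].'))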

To use it for classification you have to add a classification head on top of the model. Initially, the weights of that layer are randomly initialised. If you do not fine-tune at least that last layer, what do you really expect from random weights?
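
To see this concretely, you could evaluate your model before any training step; with a randomly initialised head the accuracy should hover around chance (roughly 1/3 for three classes). A rough sketch, reusing `MyModel`, `dataloader_test` and `device` from your code:

<!-- language: python -->

import torch

# Sketch only: evaluate the classifier before any fine-tuning.
# Assumes MyModel, dataloader_test and device are defined as in the question.
model_untrained = MyModel().to(device)
model_untrained.eval()

correct, total = 0, 0
with torch.no_grad():
    for input_ids, attention_mask, label in dataloader_test:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)
        pred = torch.argmax(model_untrained(input_ids, attention_mask), dim=1)
        correct += (pred == label).sum().item()
        total += label.size(0)

# With a randomly initialised head this is expected to be close to 1/3.
print('accuracy with untrained head:', correct / total)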

If you really want to compare the fine-tuned model to a baseline, take the embedding vectors from BERT and feed them to a traditional ML model such as an SVM or a tree-based classifier.
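
A minimal sketch of such a baseline, assuming you reuse `tokenizer_model`, `pretrained_model`, `device` and the `train_x`/`train_y`/`test_x`/`test_y` splits from your code, and take the pooler ([CLS]) output as the sentence embedding:

<!-- language: python -->

import numpy as np
import torch
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Sketch only: frozen BERT as a feature extractor + a traditional classifier.
pretrained_model.to(device).eval()

def embed(texts, batch_size=32):
    feats = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            enc = tokenizer_model(texts[i:i + batch_size], return_tensors='pt',
                                  padding=True, truncation=True,
                                  max_length=256).to(device)
            out = pretrained_model(**enc)
            # pooler_output is the transformed [CLS] vector; last_hidden_state[:, 0]
            # or a mean over tokens are common alternatives.
            feats.append(out.pooler_output.cpu().numpy())
    return np.concatenate(feats)

clf = LinearSVC()
clf.fit(embed(train_x), train_y)
print('baseline accuracy:', accuracy_score(test_y, clf.predict(embed(test_x))))

You can then compare this baseline accuracy directly with the accuracy of the fine-tuned classifier on the same test split.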
