# Measure how long the training epoch takes.
t0 = time.time()

# Reset the total loss for this epoch.
total_loss = 0

# Put the model into training mode. Don't be misled--the call to
# `train` just changes the *mode*, it doesn't *perform* the training.
# `dropout` and `batchnorm` layers behave differently during training
# vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
model.train()

# For each batch of training data...
for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches.
    if step % 40 == 0 and not step == 0:
        # Calculate elapsed time in minutes.
        elapsed = format_time(time.time() - t0)
        # Report progress.
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

    # Unpack this training batch from our dataloader.
    #
    # As we unpack the batch, we'll also copy each tensor to the GPU using the
    # `to` method.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Always clear any previously calculated gradients before performing a
    # backward pass. PyTorch doesn't do this automatically because
    # accumulating the gradients is "convenient while training RNNs".
    # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
    model.zero_grad()

    # Perform a forward pass (evaluate the model on this training batch).
    # This will return the loss (rather than the model output) because we
    # have provided the `labels`.
    # The documentation for this `model` function is here:
    # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

    # The call to `model` always returns a tuple, so we need to pull the
    # loss value out of the tuple.
    loss = outputs[0]

    # Accumulate the training loss over all of the batches so that we can
    # calculate the average loss at the end. `loss` is a Tensor containing a
    # single value; the `.item()` function just returns the Python value
    # from the tensor.
    total_loss += loss.item()

    # Perform a backward pass to calculate the gradients.
    loss.backward()

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters and take a step using the computed gradient.
    # The optimizer dictates the "update rule"--how the parameters are
    # modified based on their gradients, the learning rate, etc.
    optimizer.step()

    # Update the learning rate.
    scheduler.step()

# Calculate the average loss over the training data.
avg_train_loss = total_loss / len(train_dataloader)

# Store the loss value for plotting the learning curve.
loss_values.append(avg_train_loss)

print("")
print("  Average training loss: {0:.2f}".format(avg_train_loss))
print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
# ========================================
#               Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.
print("") print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently # during evaluation. model.eval()
# Evaluate data for one epoch for batch in validation_dataloader:
# Add batch to GPU batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch
# Telling the model not to compute or store gradients, saving memory and # speeding up validation with torch.no_grad():
# Forward pass, calculate logit predictions. # This will return the logits rather than the loss because we have # not provided labels. # token_type_ids is the same as the "segment ids", which # differentiates sentence 1 and 2 in 2-sentence tasks. # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
# Get the "logits" output by the model. The "logits" are the output # values prior to applying an activation function like the softmax. logits = outputs[0]
# Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences. tmp_eval_accuracy = flat_accuracy(logits, label_ids)
# Accumulate the total accuracy. eval_accuracy += tmp_eval_accuracy
# Track the number of batches nb_eval_steps += 1
# Report the final accuracy for this validation run. print(" Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_steps)) print(" Validation took: {:}".format(format_time(time.time() - t0)))
print("") print("Training complete!")
======== Epoch 1 / 4 ========
Training...
Batch 40 of 241. Elapsed: 0:00:05.
Batch 80 of 241. Elapsed: 0:00:09.
Batch 120 of 241. Elapsed: 0:00:14.
Batch 160 of 241. Elapsed: 0:00:19.
Batch 200 of 241. Elapsed: 0:00:23.
Batch 240 of 241. Elapsed: 0:00:28.
Average training loss: 0.17
Training epoch took: 0:00:28
Running Validation...
Accuracy: 0.80
Validation took: 0:00:01
======== Epoch 2 / 4 ========
Training...
Batch 40 of 241. Elapsed: 0:00:05.
Batch 80 of 241. Elapsed: 0:00:09.
Batch 120 of 241. Elapsed: 0:00:14.
Batch 160 of 241. Elapsed: 0:00:19.
Batch 200 of 241. Elapsed: 0:00:23.
Batch 240 of 241. Elapsed: 0:00:28.
Average training loss: 0.20
Training epoch took: 0:00:28
Running Validation...
Accuracy: 0.81
Validation took: 0:00:01
======== Epoch 3 / 4 ========
Training...
Batch 40 of 241. Elapsed: 0:00:05.
Batch 80 of 241. Elapsed: 0:00:09.
Batch 120 of 241. Elapsed: 0:00:14.
Batch 160 of 241. Elapsed: 0:00:19.
Batch 200 of 241. Elapsed: 0:00:23.
Batch 240 of 241. Elapsed: 0:00:28.
Average training loss: 0.13
Training epoch took: 0:00:28
Running Validation...
Accuracy: 0.82
Validation took: 0:00:01
======== Epoch 4 / 4 ========
Training...
Batch 40 of 241. Elapsed: 0:00:05.
Batch 80 of 241. Elapsed: 0:00:09.
Batch 120 of 241. Elapsed: 0:00:14.
Batch 160 of 241. Elapsed: 0:00:19.
Batch 200 of 241. Elapsed: 0:00:23.
Batch 240 of 241. Elapsed: 0:00:28.
Average training loss: 0.12
Training epoch took: 0:00:28
Running Validation...
Accuracy: 0.82
Validation took: 0:00:01
Training complete!
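The `loss_values` list filled in during training is meant for plotting the learning curve. A minimal plotting sketch, assuming matplotlib is available (not shown in the original excerpt):

import matplotlib.pyplot as plt

# Plot the average training loss recorded after each epoch.
plt.plot(loss_values, 'b-o')
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()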
# Text preprocessing
# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = tokenizer.encode(
        sent,                     # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)
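The mask-building step that follows relies on every sequence in `input_ids` already being truncated and padded with 0s to one common length; that padding step is not shown in this excerpt. A minimal sketch of it, with `MAX_LEN` as an assumed illustrative value:

# Assumed padding step (not part of the excerpt): pad/truncate to MAX_LEN with 0s,
# so the mask logic below can treat token id 0 as padding.
MAX_LEN = 64  # illustrative value only
input_ids = [seq[:MAX_LEN] + [0] * (MAX_LEN - len(seq[:MAX_LEN]))
             for seq in input_ids]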
# Attention masks, one per sentence.
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
# Predict

# Tracking variables (initialized here so we can collect results per batch).
predictions, true_labels = [], []

for batch in prediction_dataloader:

    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs[0]

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
# Evaluate each test batch using the Matthews correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

matthews_set = []
# For each input batch...
for i in range(len(true_labels)):

    # The predictions for this batch are a 2-column ndarray (one column for "0"
    # and one column for "1"). Pick the label with the highest value and turn this
    # into a list of 0s and 1s.
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()

    # Calculate and store the coef for this batch.
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
    matthews_set.append(matthews)
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]

# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)
MCC: 0.524
The Matthews correlation coefficient (MCC) measures the quality of binary classification. It takes true and false positives and negatives (TP/FP/TN/FN) into account and is generally regarded as a balanced measure that can be used even when the two classes are of very different sizes. The MCC is essentially a correlation coefficient in the range [-1, +1]: +1 represents a perfect prediction, 0 an average random prediction, and -1 an inverse prediction. This statistic is also known as the phi coefficient.
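For reference, a minimal sketch of the formula behind `matthews_corrcoef`, computed directly from confusion-matrix counts; the small label lists below are made up purely for illustration:

import numpy as np
from sklearn.metrics import confusion_matrix, matthews_corrcoef

# Made-up labels, purely for illustration.
y_true = [1, 1, 0, 1, 0, 0, 1, 0]
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]

# MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
mcc_manual = (tp * tn - fp * fn) / np.sqrt(
    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

print(mcc_manual)                          # 0.5
print(matthews_corrcoef(y_true, y_pred))   # same value from scikit-learn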
Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
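The warning above is avoided by loading the tokenizer by model name rather than from a single vocabulary file or URL. A minimal sketch of that usage; the checkpoint name here is an assumption for illustration, since the excerpt does not show which model was actually loaded:

from transformers import BertTokenizer

# Loading by name (assumed checkpoint) rather than from a single file path.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')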
question = "How many parameters does BERT-large have?" answer_text = "BERT-large is really big...it has 24-layers and an embedding size \ of 1024, for a total of 340M parameters!"
input_ids = tokenizer.encode(question, answer_text)

print("The input has a total of {:} tokens.".format(len(input_ids)))
The input has a total of 44 tokens.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

for token, id in zip(tokens, input_ids):
    print("{:<12} {:>6,}".format(token, id))
[CLS] 101
how 2,129
many 2,116
parameters 11,709
does 2,515
bert 14,324
- 1,011
large 2,312
have 2,031
? 1,029
[SEP] 102
bert 14,324
- 1,011
large 2,312
is 2,003
really 2,428
big 2,502
. 1,012
. 1,012
. 1,012
it 2,009
has 2,038
24 2,484
- 1,011
layers 9,014
and 1,998
an 2,019
em 7,861
##bed 8,270
##ding 4,667
size 2,946
of 1,997
102 9,402
##4 2,549
, 1,010
for 2,005
a 1,037
total 2,561
of 1,997
340 16,029
##m 2,213
parameters 11,709
! 999
[SEP] 102
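The dump above shows the question and the answer text joined by [SEP] tokens. As noted earlier, BERT's `token_type_ids` (segment IDs) are what distinguish the two segments in a sentence-pair input. A minimal sketch of building them from the position of the first [SEP]; this step is assumed here, not shown in the excerpt:

# Find the index of the first [SEP] token, which closes the question segment.
sep_index = input_ids.index(tokenizer.sep_token_id)

# Segment 0 covers the question plus its trailing [SEP];
# segment 1 covers the answer text plus the final [SEP].
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a
segment_ids = [0] * num_seg_a + [1] * num_seg_b

# There should be one segment id per input token.
assert len(segment_ids) == len(input_ids)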