Mirror of https://github.com/huggingface/transformers.git, synced 2025-08-01 18:51:14 +06:00
moved bert to qelos-util
This commit is contained in:
parent 4e52188433
commit bd91ae654f
0   hf_bert/__init__.py   Normal file
11  modeling.py
@@ -34,6 +34,10 @@ def gelu(x):
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
 class BertConfig(object):
     """Configuration class to store the configuration of a `BertModel`.
     """
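
For context, swish is just the input scaled by its own sigmoid. A minimal standalone sketch of the activation added above (the tensor values are arbitrary and only illustrative):

import torch

def swish(x):
    # identical to the function added in the hunk above: x * sigmoid(x)
    return x * torch.sigmoid(x)

x = torch.tensor([-2.0, 0.0, 2.0])
print(swish(x))  # elementwise; swish(0) == 0, and swish(x) approaches x for large x
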
@@ -60,7 +64,7 @@ class BertConfig(object):
             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                 layer in the Transformer encoder.
             hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler.
+                encoder and pooler. If string, "gelu", "relu" and "swish" supported.
             hidden_dropout_prob: The dropout probabilitiy for all fully connected
                 layers in the embeddings, encoder, and pooler.
             attention_probs_dropout_prob: The dropout ratio for the attention
@@ -237,7 +241,8 @@ class BERTIntermediate(nn.Module):
     def __init__(self, config):
         super(BERTIntermediate, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
-        self.intermediate_act_fn = gelu
+        act2fn = {"gelu": gelu, "relu": torch.nn.ReLU, "swish": swish}
+        self.intermediate_act_fn = act2fn[config.hidden_act] if isinstance(config.hidden_act, str) else config.hidden_act
 
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
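
Taken together with the docstring change above, hidden_act can now be either a callable or one of the strings "gelu", "relu", "swish". A minimal sketch of the dispatch that BERTIntermediate now performs, reproduced outside the class (note: this sketch maps "relu" to torch.nn.functional.relu so that every entry is a plain function, whereas the diff itself stores the torch.nn.ReLU class):

import math
import torch

def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def swish(x):
    return x * torch.sigmoid(x)

# same lookup idea as in BERTIntermediate.__init__ above
act2fn = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}

hidden_act = "swish"  # e.g. the hidden_act field of a BertConfig
intermediate_act_fn = act2fn[hidden_act] if isinstance(hidden_act, str) else hidden_act
print(intermediate_act_fn(torch.randn(2, 3)).shape)  # torch.Size([2, 3])
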
@@ -355,7 +360,7 @@ class BertModel(nn.Module):
         all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
         sequence_output = all_encoder_layers[-1]
         pooled_output = self.pooler(sequence_output)
-        return [embedding_output] + all_encoder_layers, pooled_output
+        return all_encoder_layers, pooled_output
 
 class BertForSequenceClassification(nn.Module):
     """BERT model for classification.
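
With this change BertModel.forward again returns only the per-layer encoder outputs plus the pooled output, without the embedding output prepended. A minimal usage sketch, assuming `model` is a BertModel already constructed and loaded elsewhere (the toy inputs mirror those in the test removed below):

import torch

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

# model: an already-built BertModel (assumption for this sketch)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
sequence_output = all_encoder_layers[-1]  # final layer hidden states, shape (batch, seq_len, hidden_size)
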
@@ -1,71 +0,0 @@
-import unittest
-import json
-import random
-
-import torch
-import numpy as np
-
-import modeling
-import convert_tf_checkpoint_to_pytorch
-
-import grouch
-
-
-class MyTest(unittest.TestCase):
-    def test_loading_and_running(self):
-        bertpath = "../../grouch/data/bert/bert-base/"
-        configpath = bertpath + "bert_config.json"
-        ckptpath = bertpath + "bert_model.ckpt"
-        m = convert_tf_checkpoint_to_pytorch.convert(configpath, ckptpath)
-        m.eval()
-        # print(m)
-
-        input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-        input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
-        token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-        all_y, pool_y = m(input_ids, token_type_ids, input_mask)
-        print(pool_y.shape)
-        # np.save("_bert_ref_pool_out.npy", pool_y.detach().numpy())
-        # np.save("_bert_ref_all_out.npy", torch.stack(all_y, 0).detach().numpy())
-
-        config = grouch.TransformerBERT.load_config(configpath)
-        gm = grouch.TransformerBERT.init_from_config(config)
-        gm.load_weights_from_tf_checkpoint(ckptpath)
-        gm.eval()
-
-        g_all_y, g_pool_y = gm(input_ids, token_type_ids, input_mask)
-        print(g_pool_y.shape)
-
-        # check embeddings
-        # print(m.embeddings)
-        # print(gm.emb)
-        # hugging_emb = m.embeddings(input_ids, token_type_ids)
-        # grouch_emb = gm.emb(input_ids, token_type_ids)
-
-        print((all_y[0] - g_all_y[0]).norm())
-        # print(all_y[0][:, :, :10] - g_all_y[0][:, :, :10])
-        self.assertTrue(np.allclose(all_y[0].detach().numpy(), g_all_y[0].detach().numpy(), atol=1e-7))
-        print("embeddings good")
-
-        print(m.encoder.layer[0])
-        print(gm.encoder.layers[0])
-        print("norm of diff at layer 1", (all_y[1] - g_all_y[1]).norm())
-        # print(all_y[1][:, :, :10] - g_all_y[1][:, :, :10])
-        self.assertTrue(np.allclose(all_y[1].detach().numpy(), g_all_y[1].detach().numpy(), atol=1e-6))
-
-        # hugging_layer = m.encoder.layer[0]
-        # grouch_layer = gm.encoder.layers[0]
-        # print("comparing weights")
-        # print((hugging_layer.attention.self.query.weight - grouch_layer.slf_attn.q_proj.weight).norm())
-        # print((hugging_layer.attention.self.query.bias - grouch_layer.slf_attn.q_proj.bias).norm())
-        # print((hugging_layer.attention.self.key.weight - grouch_layer.slf_attn.k_proj.weight).norm())
-        # print((hugging_layer.attention.self.key.bias - grouch_layer.slf_attn.k_proj.bias).norm())
-        # print((hugging_layer.attention.self.value.weight - grouch_layer.slf_attn.v_proj.weight).norm())
-        # print((hugging_layer.attention.self.value.bias - grouch_layer.slf_attn.v_proj.bias).norm())
-        # print((hugging_layer.attention.output.dense.weight - grouch_layer.slf_attn.vw_proj.weight).norm())
-        # print((hugging_layer.attention.output.dense.bias - grouch_layer.slf_attn.vw_proj.bias).norm())
-
-        print("norm of diff at last layer", (all_y[-1] - g_all_y[-1]).norm())
-        # print(all_y[-1][:, :, :10] - g_all_y[-1][:, :, :10])
-        self.assertTrue(np.allclose(all_y[-1].detach().numpy(), g_all_y[-1].detach().numpy(), atol=1e-4))