{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Comparing TensorFlow (original) and PyTorch models\n", "\n", "You can use this small notebook to check the conversion of the model's weights from the TensorFlow model to the PyTorch model. In the following, we compare the outputs of the last layer on a simple example (in `input.txt`), but both models return all the hidden layers, so you can check every stage of the model.\n", "\n", "To run this notebook, follow these instructions:\n", "- make sure that your Python environment has both TensorFlow and PyTorch installed,\n", "- download the original TensorFlow implementation,\n", "- download a pre-trained TensorFlow model as indicated in the TensorFlow implementation readme,\n", "- run the script `convert_tf_checkpoint_to_pytorch.py` as indicated in the `README` to convert the pre-trained TensorFlow model to PyTorch.\n", "\n", "If needed, change the relative paths indicated in this notebook (at the beginning of Sections 1 and 2) to point to the relevant models and code."
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1/ TensorFlow code" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:49.025081Z", "start_time": "2018-11-05T09:34:49.012403Z" } }, "outputs": [], "source": [
  "import os\n",
  "\n",
  "# Location of the original TensorFlow implementation and of the pre-trained model.\n",
  "original_tf_inplem_dir = \"./tensorflow_code/\"\n",
  "model_dir = \"../google_models/uncased_L-12_H-768_A-12/\"\n",
  "\n",
  "# os.path.join keeps these paths valid whether or not `model_dir` ends with a slash.\n",
  "vocab_file = os.path.join(model_dir, \"vocab.txt\")\n",
  "bert_config_file = os.path.join(model_dir, \"bert_config.json\")\n",
  "init_checkpoint = os.path.join(model_dir, \"bert_model.ckpt\")\n",
  "\n",
  "input_file = \"./samples/input.txt\"\n",
  "max_seq_length = 128"
] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:50.216833Z", "start_time": "2018-11-05T09:34:49.027270Z" } }, "outputs": [], "source": [
  "import importlib.util\n",
  "import sys\n",
  "\n",
  "# Load the TensorFlow `extract_features.py` under its own module name so that it\n",
  "# does not clash with the PyTorch `extract_features` module imported in Section 2.\n",
  "tf_module_name = 'extract_features_tensorflow'\n",
  "spec = importlib.util.spec_from_file_location(\n",
  "    tf_module_name, os.path.join(original_tf_inplem_dir, 'extract_features.py'))\n",
  "module = importlib.util.module_from_spec(spec)\n",
  "# Register the module before executing it (importlib recipe for importing a source file).\n",
  "sys.modules[tf_module_name] = module\n",
  "spec.loader.exec_module(module)\n",
  "\n",
  "# The following cells rely on the names exported by the TensorFlow script\n",
  "# (`tf`, `modeling`, `tokenization`, `read_examples`, ...).\n",
  "from extract_features_tensorflow import *"
] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:50.338711Z", "start_time": "2018-11-05T09:34:50.218734Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:*** Example ***\n", "INFO:tensorflow:unique_id: 0\n", "INFO:tensorflow:tokens: [CLS] who was jim henson ? 
[SEP] jim henson was a puppet ##eer [SEP]\n", "INFO:tensorflow:input_ids: 101 2040 2001 3958 27227 1029 102 3958 27227 2001 1037 13997 11510 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", "INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n" ] } ], "source": [
  "# Request the output of each of the 12 encoder layers.\n",
  "layer_indexes = list(range(12))\n",
  "\n",
  "# Configuration and tokenizer provided by the TensorFlow implementation.\n",
  "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
  "tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)\n",
  "\n",
  "# Read the raw examples and convert them to model inputs of length `max_seq_length`.\n",
  "examples = read_examples(input_file)\n",
  "features = convert_examples_to_features(\n",
  "    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)\n",
  "\n",
  "# Index the features by id so that predictions can be matched back to their inputs.\n",
  "unique_id_to_feature = {feature.unique_id: feature for feature in features}"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:53.784740Z", "start_time": "2018-11-05T09:34:50.342200Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x11d0419d8>) includes params argument, but params are not passed to Estimator.\n", "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpt4nhctcg\n", "INFO:tensorflow:Using config: 
{'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpt4nhctcg', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n", "graph_options {\n", " rewrite_options {\n", " meta_optimizer_iterations: ONE\n", " }\n", "}\n", ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n", "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. 
Please fix as soon as possible (leaving num_shards as None.\n", "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" ] } ], "source": [
  "# Run configuration of the estimator; `use_tpu=False` is passed everywhere below.\n",
  "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n",
  "tpu_config = tf.contrib.tpu.TPUConfig(\n",
  "    num_shards=1, per_host_input_for_training=is_per_host)\n",
  "run_config = tf.contrib.tpu.RunConfig(master=None, tpu_config=tpu_config)\n",
  "\n",
  "# Model function of the TensorFlow implementation for the configured\n",
  "# checkpoint and the requested layers.\n",
  "model_fn = model_fn_builder(\n",
  "    bert_config=bert_config,\n",
  "    init_checkpoint=init_checkpoint,\n",
  "    layer_indexes=layer_indexes,\n",
  "    use_tpu=False,\n",
  "    use_one_hot_embeddings=False)\n",
  "\n",
  "# If TPU is not available, this will fall back to normal Estimator on CPU\n",
  "# or GPU.\n",
  "estimator = tf.contrib.tpu.TPUEstimator(\n",
  "    use_tpu=False,\n",
  "    model_fn=model_fn,\n",
  "    config=run_config,\n",
  "    predict_batch_size=1)\n",
  "\n",
  "input_fn = input_fn_builder(features=features, seq_length=max_seq_length)"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:58.695496Z", "start_time": "2018-11-05T09:34:53.787465Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpt4nhctcg, running initialization to predict.\n", "INFO:tensorflow:Calling model_fn.\n", "INFO:tensorflow:Running infer on CPU\n", "INFO:tensorflow:Done calling model_fn.\n", "INFO:tensorflow:Graph was finalized.\n", "INFO:tensorflow:Running local_init_op.\n", "INFO:tensorflow:Done running local_init_op.\n", "extracting layer 0\n", "extracting layer 1\n", "extracting layer 2\n", "extracting layer 3\n", "extracting layer 4\n", "extracting layer 5\n", "extracting layer 6\n", "extracting layer 7\n", "extracting layer 8\n", "extracting layer 9\n", "extracting layer 10\n", "extracting layer 11\n", "INFO:tensorflow:prediction_loop marked as finished\n", 
"INFO:tensorflow:prediction_loop marked as finished\n" ] } ], "source": [
  "# Collect, for every example, the output of each requested layer of the TensorFlow model.\n",
  "tensorflow_all_out = []\n",
  "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
  "    unique_id = int(result[\"unique_id\"])\n",
  "    # Fails fast with a KeyError if the estimator returns an id that was never submitted.\n",
  "    feature = unique_id_to_feature[unique_id]\n",
  "\n",
  "    # One entry per requested layer, holding that layer's output for this example.\n",
  "    all_layers = []\n",
  "    for (j, layer_index) in enumerate(layer_indexes):\n",
  "        print(\"extracting layer {}\".format(j))\n",
  "        layer_output = result[\"layer_output_%d\" % j]\n",
  "        layers = collections.OrderedDict()\n",
  "        layers[\"index\"] = layer_index\n",
  "        layers[\"values\"] = layer_output\n",
  "        all_layers.append(layers)\n",
  "\n",
  "    # A single \"features\" entry per example: the layers are not split per token.\n",
  "    tensorflow_out_features = collections.OrderedDict()\n",
  "    tensorflow_out_features[\"layers\"] = all_layers\n",
  "    tensorflow_all_out_features = [tensorflow_out_features]\n",
  "\n",
  "    output_json = collections.OrderedDict()\n",
  "    output_json[\"linex_index\"] = unique_id\n",
  "    output_json[\"features\"] = tensorflow_all_out_features\n",
  "    tensorflow_all_out.append(output_json)"
] },
{ "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:58.741194Z", "start_time": "2018-11-05T09:34:58.697190Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "2\n", "odict_keys(['linex_index', 'features'])\n", "number of tokens 1\n", "number of layers 12\n" ] }, { "data": { "text/plain": [ "(128, 768)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "print(len(tensorflow_all_out))\n",
  "print(len(tensorflow_all_out[0]))\n",
  "print(tensorflow_all_out[0].keys())\n",
  "print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
  "print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
  "tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": 
"2018-11-05T09:34:58.779046Z", "start_time": "2018-11-05T09:34:58.743861Z" } }, "outputs": [], "source": [
  "# Per-layer outputs of the first (and only) example, in the order of `layer_indexes`.\n",
  "tensorflow_outputs = [tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes]"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 2/ PyTorch code" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:34:58.934535Z", "start_time": "2018-11-05T09:34:58.781393Z" } }, "outputs": [], "source": [
  "# PyTorch counterpart of the TensorFlow script loaded in Section 1.\n",
  "import extract_features\n",
  "from extract_features import *"
] },
{ "cell_type": "code", "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:41:23.922979Z", "start_time": "2018-11-05T09:41:23.890277Z" } }, "outputs": [], "source": [
  "# Converted PyTorch weights; derived from `model_dir` (Section 1) so that the\n",
  "# model location is configured in one place only. Change it here if the\n",
  "# converted checkpoint is stored somewhere else.\n",
  "init_checkpoint_pt = model_dir + \"pytorch_model.bin\""
] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:00.435355Z", "start_time": "2018-11-05T09:34:59.269985Z" }, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "BertModel(\n", " (embeddings): BERTEmbeddings(\n", " (word_embeddings): Embedding(30522, 768)\n", " (position_embeddings): Embedding(512, 768)\n", " (token_type_embeddings): Embedding(2, 768)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (encoder): BERTEncoder(\n", " (layer): ModuleList(\n", " (0): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, 
out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (1): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (2): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (3): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, 
bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (4): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (5): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " 
(intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (6): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (7): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (8): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): 
BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (9): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (10): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " 
(LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (11): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " )\n", " )\n", " (pooler): BERTPooler(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", ")" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Build the PyTorch model from the same configuration and load the converted weights on CPU.\n",
  "device = torch.device(\"cpu\")\n",
  "model = extract_features.BertModel(bert_config)\n",
  "state_dict = torch.load(init_checkpoint_pt, map_location='cpu')\n",
  "model.load_state_dict(state_dict)\n",
  "# Last expression of the cell: displays the module tree of the loaded model.\n",
  "model.to(device)"
] },
{ "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:00.476576Z", "start_time": "2018-11-05T09:35:00.436902Z" }, "code_folding": [] }, "outputs": [ { "data": { "text/plain": [ "BertModel(\n", " (embeddings): BERTEmbeddings(\n", " (word_embeddings): Embedding(30522, 768)\n", " (position_embeddings): Embedding(512, 
768)\n", " (token_type_embeddings): Embedding(2, 768)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (encoder): BERTEncoder(\n", " (layer): ModuleList(\n", " (0): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (1): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (2): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): 
Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (3): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (4): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): 
BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (5): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (6): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (7): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " 
(query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (8): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (9): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " 
(dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (10): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (11): BERTLayer(\n", " (attention): BERTAttention(\n", " (self): BERTSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1)\n", " )\n", " (output): BERTSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " (intermediate): BERTIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " )\n", " (output): BERTOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): BERTLayerNorm()\n", " (dropout): Dropout(p=0.1)\n", " )\n", " )\n", " )\n", " )\n", " (pooler): 
BERTPooler(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", ")" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "def _stack_long(field):\n",
  "    \"\"\"Stack attribute `field` of every entry of `features` into one int64 tensor.\"\"\"\n",
  "    return torch.tensor([getattr(f, field) for f in features], dtype=torch.long)\n",
  "\n",
  "\n",
  "all_input_ids = _stack_long(\"input_ids\")\n",
  "all_input_mask = _stack_long(\"input_mask\")\n",
  "all_input_type_ids = _stack_long(\"input_type_ids\")\n",
  "# Position of each example in `features`, used to map outputs back to their inputs.\n",
  "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
  "\n",
  "# Feed the examples one at a time, in their original order.\n",
  "eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
  "eval_sampler = SequentialSampler(eval_data)\n",
  "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
  "\n",
  "# Switch to evaluation mode; as the last expression this also displays the model.\n",
  "model.eval()"
] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:00.938199Z", "start_time": "2018-11-05T09:35:00.478338Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 27227, 2001,\n", " 1037, 13997, 11510, 102, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0]])\n", "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", " 0, 0, 0, 0, 0, 0, 0, 
0]])\n", "tensor([0])\n", "layer 0 0\n", "layer 1 1\n", "layer 2 2\n", "layer 3 3\n", "layer 4 4\n", "layer 5 5\n", "layer 6 6\n", "layer 7 7\n", "layer 8 8\n", "layer 9 9\n", "layer 10 10\n", "layer 11 11\n" ] } ], "source": [ "# Indices of the encoder layers whose outputs are collected for the comparison.\n", "layer_indexes = list(range(12))\n", "\n", "pytorch_all_out = []\n", "for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n", "    print(input_ids)\n", "    print(input_mask)\n", "    print(example_indices)\n", "    # Move every model input to `device`, including the token type ids.\n", "    input_ids = input_ids.to(device)\n", "    input_mask = input_mask.to(device)\n", "    input_type_ids = input_type_ids.to(device)\n", "\n", "    # Forward pass only: no autograd graph is needed to compare activations.\n", "    with torch.no_grad():\n", "        all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n", "\n", "    for b, example_index in enumerate(example_indices):\n", "        feature = features[example_index.item()]\n", "        unique_id = int(feature.unique_id)\n", "\n", "        all_layers = []\n", "        for (j, layer_index) in enumerate(layer_indexes):\n", "            print(\"layer\", j, layer_index)\n", "            # Output of this layer for example `b` of the batch, as a NumPy array.\n", "            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()[b]\n", "            layers = collections.OrderedDict()\n", "            layers[\"index\"] = layer_index\n", "            layers[\"values\"] = layer_output\n", "            all_layers.append(layers)\n", "\n", "        # Build the per-example record once, after all of its layers are collected.\n", "        out_features = collections.OrderedDict()\n", "        out_features[\"layers\"] = all_layers\n", "        output_json = collections.OrderedDict()\n", "        output_json[\"linex_index\"] = unique_id\n", "        output_json[\"features\"] = [out_features]\n", "        pytorch_all_out.append(output_json)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:00.986964Z", "start_time": "2018-11-05T09:35:00.941625Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "2\n", 
"odict_keys(['linex_index', 'features'])\n", "number of examples 1\n", "number of layers 12\n", "sequence length 128\n" ] }, { "data": { "text/plain": [ "(128, 768)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sanity-check the structure of the collected PyTorch outputs.\n", "first_example = pytorch_all_out[0]\n", "first_layer_values = first_example['features'][0]['layers'][0]['values']\n", "\n", "print(len(pytorch_all_out))\n", "print(len(first_example))\n", "print(first_example.keys())\n", "print(\"number of examples\", len(pytorch_all_out))\n", "print(\"number of layers\", len(first_example['features'][0]['layers']))\n", "print(\"sequence length\", len(first_layer_values))\n", "# (sequence length, hidden size) of one layer's output\n", "first_layer_values.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:01.026420Z", "start_time": "2018-11-05T09:35:00.988377Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(128, 768)\n", "(128, 768)\n" ] } ], "source": [ "# One array per collected layer, in the order the layers were stored.\n", "pytorch_outputs = [layer['values'] for layer in pytorch_all_out[0]['features'][0]['layers']]\n", "print(pytorch_outputs[0].shape)\n", "print(pytorch_outputs[1].shape)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:01.065912Z", "start_time": "2018-11-05T09:35:01.028986Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(128, 768)\n", "(128, 768)\n" ] } ], "source": [ "print(tensorflow_outputs[0].shape)\n", "print(tensorflow_outputs[1].shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3/ Comparing the outputs of every layer of both models (root-mean-square difference)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:35:01.105895Z", "start_time": "2018-11-05T09:35:01.067712Z" } }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2018-11-05T09:38:17.626158Z", "start_time": 
"2018-11-05T09:38:17.589346Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape tensorflow layer, shape pytorch layer, root-mean-square difference\n", "((128, 768), (128, 768), 1.5258875e-07)\n", "((128, 768), (128, 768), 2.342731e-07)\n", "((128, 768), (128, 768), 2.801949e-07)\n", "((128, 768), (128, 768), 3.5904986e-07)\n", "((128, 768), (128, 768), 4.2842768e-07)\n", "((128, 768), (128, 768), 5.127951e-07)\n", "((128, 768), (128, 768), 6.14668e-07)\n", "((128, 768), (128, 768), 7.063922e-07)\n", "((128, 768), (128, 768), 7.906173e-07)\n", "((128, 768), (128, 768), 8.475192e-07)\n", "((128, 768), (128, 768), 8.975489e-07)\n", "((128, 768), (128, 768), 4.1671223e-07)\n" ] } ], "source": [ "# Root-mean-square of the element-wise difference between both models, layer by layer.\n", "print('shape tensorflow layer, shape pytorch layer, root-mean-square difference')\n", "# zip() pairs the layers positionally, so no layer count is hard-coded.\n", "for tf_layer, pt_layer in zip(tensorflow_outputs, pytorch_outputs):\n", "    tf_layer, pt_layer = np.asarray(tf_layer), np.asarray(pt_layer)\n", "    rms_difference = np.sqrt(np.mean((tf_layer - pt_layer) ** 2.0))\n", "    print((tf_layer.shape, pt_layer.shape, rms_difference))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" }, "toc": { "colors": { "hover_highlight": "#DAA520", "running_highlight": "#FF0000", "selected_highlight": "#FFD700" }, "moveMenuLeft": true, "nav_menu": { "height": "48px", "width": "252px" }, "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }