diff --git a/modeling_pytorch.py b/modeling_pytorch.py
index fc530ab1717..9ca262928f7 100644
--- a/modeling_pytorch.py
+++ b/modeling_pytorch.py
@@ -27,6 +27,7 @@ import six
 import tensorflow as tf
 import torch
 import torch.nn as nn
+from torch.nn import CrossEntropyLoss
 
 def gelu(x):
     raise NotImplementedError
@@ -394,3 +395,30 @@ class BertModel(nn.Module):
         sequence_output = all_encoder_layers[-1]
         pooled_output = self.pooler(sequence_output)
         return all_encoder_layers, pooled_output
+
+class BertForSequenceClassification(nn.Module):
+    def __init__(self, config, num_labels):
+        super(BertForSequenceClassification, self).__init__()
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, num_labels)
+
+        def init_weights(m):
+            if isinstance(m, (nn.Linear, nn.Embedding)):
+                print("Initializing {}".format(m))
+                # Slight difference here with the TF version which uses truncated_normal
+                # cf https://github.com/pytorch/pytorch/pull/5617
+                m.weight.data.normal_(mean=0.0, std=config.initializer_range)
+        self.apply(init_weights)
+
+    def forward(self, input_ids, token_type_ids, attention_mask, labels=None):
+        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits, labels)
+            return loss, logits
+        else:
+            return logits
diff --git a/run_classifier_pytorch.py b/run_classifier_pytorch.py
index 66d4a8444b2..ff90c19314c 100644
--- a/run_classifier_pytorch.py
+++ b/run_classifier_pytorch.py
@@ -20,20 +20,23 @@ from __future__ import print_function
 
 import csv
 import os
-from modeling_pytorch import BertConfig, BertModel
-from optimization_pytorch import BERTAdam
-# import optimization
-import tokenization_pytorch
-import torch
-
 import logging
+import argparse
+
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+
+import tokenization_pytorch
+from modeling_pytorch import BertConfig, BertForSequenceClassification
+from optimization_pytorch import BERTAdam
+
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt = '%m/%d/%Y %H:%M:%S',
                     level = logging.INFO)
 logger = logging.getLogger(__name__)
 
-import argparse
-
 parser = argparse.ArgumentParser()
 
 ## Required parameters
@@ -116,7 +119,7 @@ parser.add_argument("--iterations_per_loop",
                     default = 1000,
                     type = int,
                     help = "How many steps to make in each estimator call.")
-    
+
 parser.add_argument("--no_cuda",
                     default = False,
                     type = bool,
@@ -127,39 +130,6 @@ parser.add_argument("--local_rank",
                     default=-1,
                     help = "local_rank for distributed training on gpus")
-### BEGIN - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-parser.add_argument("--use_tpu",
-                    default = False,
-                    type = bool,
-                    help = "Whether to use TPU or GPU/CPU.")
-parser.add_argument("--tpu_name",
-                    default = None,
-                    type = str,
-                    help = "The Cloud TPU to use for training. This should be either the name "
-                           "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
-                           "url.")
-parser.add_argument("--tpu_zone",
-                    default = None,
-                    type = str,
-                    help = "[Optional] GCE zone where the Cloud TPU is located in. If not "
-                           "specified, we will attempt to automatically detect the GCE project from "
-                           "metadata.")
-parser.add_argument("--gcp_project",
-                    default = None,
-                    type = str,
-                    help = "[Optional] Project name for the Cloud TPU-enabled project. If not "
-                           "specified, we will attempt to automatically detect the GCE project from "
-                           "metadata.")
-parser.add_argument("--master",
-                    default = None,
-                    type = str,
-                    help = "[Optional] TensorFlow master URL.")
-parser.add_argument("--num_tpu_cores",
-                    default = 8,
-                    type = int,
-                    help = "Only used if `use_tpu` is True. Total number of TPU cores to use.")
-### END - TO DELETE EVENTUALLY --> NO SENSE IN PYTORCH ###
-
 args = parser.parse_args()
 
 
 class InputExample(object):
@@ -429,44 +399,41 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length):
             tokens_b.pop()
 
 
-def input_fn_builder(features, seq_length, is_training, drop_remainder):
+def input_fn_builder(features, seq_length, train_batch_size):
+    # TODO: delete
     """Creates an `input_fn` closure to be passed to TPUEstimator."""
     ### ATTENTION - To rewrite ###
-    all_input_ids = []
-    all_input_mask = []
-    all_segment_ids = []
-    all_label_ids = []
+    all_input_ids = [f.input_ids for f in features]
+    all_input_mask = [f.input_mask for f in features]
+    all_segment_ids = [f.segment_ids for f in features]
+    all_label_ids = [f.label_id for f in features]
 
-    for feature in features:
-        all_input_ids.append(feature.input_ids)
-        all_input_mask.append(feature.input_mask)
-        all_segment_ids.append(feature.segment_ids)
-        all_label_ids.append(feature.label_id)
+    # for feature in features:
+    #     all_input_ids.append(feature.input_ids)
+    #     all_input_mask.append(feature.input_mask)
+    #     all_segment_ids.append(feature.segment_ids)
+    #     all_label_ids.append(feature.label_id)
 
-    def input_fn(params):
-        """The actual input function."""
-        batch_size = params["batch_size"]
+    input_ids_tensor = torch.tensor(all_input_ids, dtype=torch.long)
+    input_mask_tensor = torch.tensor(all_input_mask, dtype=torch.long)
+    segment_tensor = torch.tensor(all_segment_ids, dtype=torch.long)
+    label_tensor = torch.tensor(all_label_ids, dtype=torch.long)
 
-        num_examples = len(features)
-
-        device = torch.device("cuda") if args.use_gpu else torch.device("cpu")
-        d = torch.utils.data.TensorDataset({  ## BUG THIS IS NOT WORKING.... ###
-            "input_ids": torch.IntTensor(all_input_ids, device=device),  # Requires_grad=False by default
-            "input_mask": torch.IntTensor(all_input_mask, device=device),
-            "segment_ids": torch.IntTensor(all_segment_ids, device=device),
-            "label_ids": torch.IntTensor(all_label_ids, device=device)
-        })
+    train_data = TensorDataset(input_ids_tensor, input_mask_tensor,
+                               segment_tensor, label_tensor)
+    if args.local_rank == -1:
+        train_sampler = RandomSampler(train_data)
+    else:
+        train_sampler = DistributedSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)
 
-        shuffle = True if is_training else False
-        d = torch.utils.data.DataLoader(dataset=d, batch_size=batch_size,
-                                        shuffle=shuffle, drop_last=drop_remainder)
-        # Cf https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
-        return d
+    return train_dataloader
 
-    return input_fn
 
+def accuracy(out, labels):
+    outputs = np.argmax(out, axis=1)
+    return np.sum(outputs == labels) / float(labels.size)
 
-
-def main(_):
+def main():
     processors = {
         "cola": ColaProcessor,
         "mnli": MnliProcessor,
@@ -492,7 +459,7 @@
             "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))
-    
+
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(f"Output directory ({args.output_dir}) already exists and is "
                         f"not empty.")
@@ -517,13 +484,13 @@
        num_train_steps = int(
            len(train_examples) / args.train_batch_size * args.num_train_epochs)
 
-    model = BertModel(bert_config)
+    model = BertForSequenceClassification(bert_config, len(label_list))
     if args.init_checkpoint is not None:
-        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
+        model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
     model.to(device)
 
     optimizer = BERTAdam([{'params': [p for n, p in model.named_parameters() if n != 'bias'], 'l2': 0.01},
-                          {'params': [p for n, p in model.named_parameters() if n != 'bias']}
+                          {'params': [p for n, p in model.named_parameters() if n == 'bias'], 'l2': 0.}
                          ],
                         lr=args.learning_rate, schedule='warmup_linear',
                         warmup=args.warmup_proportion,
@@ -536,18 +503,31 @@
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
-        train_input = input_fn_builder(
-            features=train_features,
-            seq_length=args.max_seq_length,
-            is_training=True,
-            drop_remainder=True)
-        # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
-        for batch_ix, batch in train_input:
-            output = model_fn(batch)
-            loss = output["loss"]
+
+        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
+        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
+
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        if args.local_rank == -1:
+            train_sampler = RandomSampler(train_data)
+        else:
+            train_sampler = DistributedSampler(train_data)
+        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+        model.train()
+        global_step = 0
+        for input_ids, input_mask, segment_ids, label_ids in train_dataloader:
+            # tensor.to() is not in-place, so reassign when moving the batch to the device
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            label_ids = label_ids.to(device)
+
+            # with labels, the model returns (loss, logits)
+            loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
             loss.backward()
-
-
+            optimizer.step()
+            optimizer.zero_grad()
+            global_step += 1
 
     if args.do_eval:
         eval_examples = processor.get_dev_examples(args.data_dir)
@@ -558,23 +538,40 @@
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
 
-        # This tells the estimator to run through the entire set.
-        eval_steps = None
-        # However, if running eval on the TPU, you will need to specify the
-        # number of steps.
-        if args.use_tpu:
-            # Eval will be slightly WRONG on the TPU because it will truncate
-            # the last batch.
-            eval_steps = int(len(eval_examples) / args.eval_batch_size)
+        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
 
-        eval_drop_remainder = True if args.use_tpu else False
-        eval_input_fn = input_fn_builder(
-            features=eval_features,
-            seq_length=args.max_seq_length,
-            is_training=False,
-            drop_remainder=eval_drop_remainder)
+        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+        if args.local_rank == -1:
+            eval_sampler = SequentialSampler(eval_data)
+        else:
+            eval_sampler = DistributedSampler(eval_data)
+        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
-        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+        model.eval()
+        eval_loss = 0
+        eval_accuracy = 0
+        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
+            input_ids = input_ids.to(device)
+            input_mask = input_mask.to(device)
+            segment_ids = segment_ids.to(device)
+            label_ids = label_ids.to(device)
+
+            with torch.no_grad():
+                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
+            # accuracy() expects numpy arrays, so move logits and labels back to the CPU
+            tmp_eval_accuracy = accuracy(logits.detach().cpu().numpy(), label_ids.cpu().numpy())
+
+            eval_loss += tmp_eval_loss.item()
+            eval_accuracy += tmp_eval_accuracy
+
+        eval_loss = eval_loss / len(eval_dataloader)
+        eval_accuracy = eval_accuracy / len(eval_dataloader)
+
+        result = {'eval_loss': eval_loss,
+                  'eval_accuracy': eval_accuracy,
+                  'global_step': global_step,
+                  'loss': loss.item()}
 
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
@@ -582,6 +579,6 @@
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
-    
+
 if __name__ == "__main__":
     main()
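Note: the TensorDataset / sampler / DataLoader pattern that replaces the TF `input_fn` in this patch can be exercised on its own, which is a quick way to sanity-check the batching before the model is wired in. The sketch below uses made-up shapes (8 examples, sequence length 16, 2 labels); none of these values come from the converted features.

    import torch
    from torch.utils.data import TensorDataset, DataLoader, RandomSampler

    # Stand-ins for the converted features: 8 examples, seq_length 16, binary labels.
    all_input_ids = torch.randint(0, 100, (8, 16), dtype=torch.long)
    all_input_mask = torch.ones(8, 16, dtype=torch.long)
    all_segment_ids = torch.zeros(8, 16, dtype=torch.long)
    all_label_ids = torch.randint(0, 2, (8,), dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=4)

    for input_ids, input_mask, segment_ids, label_ids in train_dataloader:
        # Each batch is a tuple of tensors in the same order as the TensorDataset columns.
        print(input_ids.shape, label_ids.shape)  # torch.Size([4, 16]) torch.Size([4])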