diff --git a/examples/contrib/run_swag.py b/examples/contrib/run_swag.py index 7d1a9e8e8cf..bc6ff149796 100644 --- a/examples/contrib/run_swag.py +++ b/examples/contrib/run_swag.py @@ -487,7 +487,7 @@ def evaluate(args, model, tokenizer, prefix=""): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv" ) @@ -520,7 +520,7 @@ def main(): help="The output directory where the model checkpoints and predictions will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py index ca73419689b..c046730c124 100644 --- a/examples/distillation/run_squad_w_distillation.py +++ b/examples/distillation/run_squad_w_distillation.py @@ -430,7 +430,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SQuAD json for training. E.g., train-v1.1.json" ) @@ -486,7 +486,7 @@ def main(): "--temperature", default=2.0, type=float, help="Distillation temperature. Only for distillation." 
) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py index 429350a7725..f91b6d321fe 100644 --- a/examples/distillation/scripts/extract.py +++ b/examples/distillation/scripts/extract.py @@ -43,7 +43,7 @@ if __name__ == "__main__": state_dict = model.state_dict() compressed_sd = {} - ### Embeddings ### + # Embeddings # if args.model_type == "gpt2": for param_name in ["wte.weight", "wpe.weight"]: compressed_sd[f"{prefix}.{param_name}"] = state_dict[f"{prefix}.{param_name}"] @@ -55,7 +55,7 @@ if __name__ == "__main__": param_name = f"{prefix}.embeddings.LayerNorm.{w}" compressed_sd[param_name] = state_dict[param_name] - ### Transformer Blocks ### + # Transformer Blocks # std_idx = 0 for teacher_idx in [0, 2, 4, 7, 9, 11]: if args.model_type == "gpt2": @@ -82,7 +82,7 @@ if __name__ == "__main__": ] std_idx += 1 - ### Language Modeling Head ###s + # Language Modeling Head # if args.model_type == "roberta": for layer in ["lm_head.decoder.weight", "lm_head.bias"]: compressed_sd[f"{layer}"] = state_dict[f"{layer}"] diff --git a/examples/distillation/train.py b/examples/distillation/train.py index a37a7c4274c..670d03ea16e 100644 --- a/examples/distillation/train.py +++ b/examples/distillation/train.py @@ -219,7 +219,7 @@ def main(): args = parser.parse_args() sanity_checks(args) - ## ARGS ## + # ARGS # init_gpu_params(args) set_seed(args) if args.is_master: @@ -236,7 +236,7 @@ def main(): os.makedirs(args.dump_path) logger.info(f"Experiment will be dumped and logged in {args.dump_path}") - ### SAVE PARAMS ### + # SAVE PARAMS # logger.info(f"Param: {args}") with open(os.path.join(args.dump_path, "parameters.json"), "w") as f: json.dump(vars(args), f, indent=4) @@ -245,7 +245,7 @@ def main(): student_config_class, student_model_class, _ = 
MODEL_CLASSES[args.student_type] teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type] - ### TOKENIZER ### + # TOKENIZER # tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name) special_tok_ids = {} for tok_name, tok_symbol in tokenizer.special_tokens_map.items(): @@ -255,7 +255,7 @@ def main(): args.special_tok_ids = special_tok_ids args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name] - ## DATA LOADER ## + # DATA LOADER # logger.info(f"Loading data from {args.data_file}") with open(args.data_file, "rb") as fp: data = pickle.load(fp) @@ -275,7 +275,7 @@ def main(): train_lm_seq_dataset = LmSeqsDataset(params=args, data=data) logger.info(f"Data loader created.") - ## STUDENT ## + # STUDENT # logger.info(f"Loading student config from {args.student_config}") stu_architecture_config = student_config_class.from_pretrained(args.student_config) stu_architecture_config.output_hidden_states = True @@ -290,26 +290,26 @@ def main(): student.to(f"cuda:{args.local_rank}") logger.info(f"Student loaded.") - ## TEACHER ## + # TEACHER # teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True) if args.n_gpu > 0: teacher.to(f"cuda:{args.local_rank}") logger.info(f"Teacher loaded from {args.teacher_name}.") - ## FREEZING ## + # FREEZING # if args.freeze_pos_embs: freeze_pos_embeddings(student, args) if args.freeze_token_type_embds: freeze_token_type_embeddings(student, args) - ## SANITY CHECKS ## + # SANITY CHECKS # assert student.config.vocab_size == teacher.config.vocab_size assert student.config.hidden_size == teacher.config.hidden_size assert student.config.max_position_embeddings == teacher.config.max_position_embeddings if args.mlm: assert token_probs.size(0) == stu_architecture_config.vocab_size - ## DISTILLER ## + # DISTILLER # torch.cuda.empty_cache() distiller = Distiller( params=args, dataset=train_lm_seq_dataset, token_probs=token_probs, 
student=student, teacher=teacher diff --git a/examples/mm-imdb/run_mmimdb.py b/examples/mm-imdb/run_mmimdb.py index 24ad82190cc..abea83bff95 100644 --- a/examples/mm-imdb/run_mmimdb.py +++ b/examples/mm-imdb/run_mmimdb.py @@ -344,7 +344,7 @@ def load_examples(args, tokenizer, evaluate=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -374,7 +374,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/run_bertology.py b/examples/run_bertology.py index 27709fa7ee1..c3fe4b47135 100644 --- a/examples/run_bertology.py +++ b/examples/run_bertology.py @@ -242,7 +242,7 @@ def prune_heads(args, model, eval_dataloader, head_mask): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -272,7 +272,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", diff --git a/examples/run_glue.py b/examples/run_glue.py index f9819005946..fe5cc7e604b 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -410,7 +410,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -447,7 +447,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not 
the same as model_name" ) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index ae3d68dadaf..1fae12299d3 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -422,7 +422,7 @@ def evaluate(args, model, tokenizer, prefix=""): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file)." ) @@ -434,7 +434,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--eval_data_file", default=None, diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index 82f5a7ee4c6..cb0ddb09a51 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -385,7 +385,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -422,7 +422,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/examples/run_ner.py b/examples/run_ner.py index 8d991555a90..7120c373648 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -385,7 +385,7 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -415,7 +415,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other 
parameters + # Other parameters parser.add_argument( "--labels", default="", diff --git a/examples/run_xnli.py b/examples/run_xnli.py index f772bb5cbb9..f550ca7c58e 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -377,7 +377,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--data_dir", default=None, @@ -417,7 +417,7 @@ def main(): help="The output directory where the model predictions and checkpoints will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py index eec29b59b75..aa5c5ae4c9d 100644 --- a/templates/adding_a_new_example_script/run_xxx.py +++ b/templates/adding_a_new_example_script/run_xxx.py @@ -401,7 +401,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal def main(): parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SQuAD json for training. 
E.g., train-v1.1.json" ) @@ -434,7 +434,7 @@ def main(): help="The output directory where the model checkpoints and predictions will be written.", ) - ## Other parameters + # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index 2e6c473475e..06aa4bf3780 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py index bba3269a905..957379b5b85 100644 --- a/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
) diff --git a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py index 87608f482f5..50695dedbec 100755 --- a/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py index 3aa8957257d..4f5bb0aa6c5 100755 --- a/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -51,7 +51,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--gpt2_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 
) diff --git a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py index 25c2a0a006b..d1d245dbeff 100755 --- a/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -51,7 +51,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--openai_checkpoint_folder_path", default=None, diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index ba1dec53b6e..6494814309e 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -410,7 +410,7 @@ def convert_all_pt_checkpoints_to_tf( if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_dump_path", default=None, type=str, required=True, help="Path to the output Tensorflow dump file." 
) diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 5cf766b81d7..9044bc59269 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -94,7 +94,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ layer: BertLayer = model.roberta.encoder.layer[i] roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] - ### self attention + # self attention self_attn: BertSelfAttention = layer.attention.self assert ( roberta_layer.self_attn.k_proj.weight.data.shape @@ -110,7 +110,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias - ### self-attention output + # self-attention output self_output: BertSelfOutput = layer.attention.output assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape self_output.dense.weight = roberta_layer.self_attn.out_proj.weight @@ -118,20 +118,20 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias - ### intermediate + # intermediate intermediate: BertIntermediate = layer.intermediate assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape intermediate.dense.weight = roberta_layer.fc1.weight intermediate.dense.bias = roberta_layer.fc1.bias - ### output + # output bert_output: BertOutput = layer.output assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape bert_output.dense.weight = roberta_layer.fc2.weight bert_output.dense.bias = roberta_layer.fc2.bias bert_output.LayerNorm.weight = 
roberta_layer.final_layer_norm.weight bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias - #### end of layer + # end of layer if classification_head: model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight @@ -170,7 +170,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." ) diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 853c9b71751..94ba61f6e46 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_du if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index ef98b76ab13..30768fa96cf 100755 --- a/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -70,7 +70,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--xlm_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump." 
) diff --git a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py index 37e93b7a1fa..5c652244911 100755 --- a/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -82,7 +82,7 @@ def convert_xlnet_checkpoint_to_pytorch( if __name__ == "__main__": parser = argparse.ArgumentParser() - ## Required parameters + # Required parameters parser.add_argument( "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." ) diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index aa732b31e79..18a96a88766 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -47,7 +47,7 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { } -### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def gelu(x): return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) @@ -327,7 +327,7 @@ class Transformer(nn.Module): return outputs # last-layer hidden state, (all hidden states), (all attentions) -### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class DistilBertPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 297d7edb129..8692e3eba49 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -42,7 +42,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { } -### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +# UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # def gelu(x): """ Gaussian Error Linear Unit. 
Original Implementation of the gelu activation function in Google Bert repo when initially created. @@ -463,7 +463,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) -### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL # class TFDistilBertPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index f05b8aa4a94..92ff8bf21b5 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -67,7 +67,8 @@ def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove="") ##################### -### PyTorch => TF 2.0 +# PyTorch => TF 2.0 # +##################### def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False): @@ -197,7 +198,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a ##################### -### TF 2.0 => PyTorch +# TF 2.0 => PyTorch # +##################### def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False): diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index b5618df38ae..068e9ac121f 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -79,23 +79,23 @@ class TFPositionwiseFF(tf.keras.layers.Layer): def call(self, inp, training=False): if self.pre_lnorm: - ##### layer normalization + positionwise feed-forward + # layer normalization + positionwise feed-forward core_out = self.layer_norm(inp) core_out = self.layer_1(core_out) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) 
core_out = self.drop_2(core_out, training=training) - ##### residual connection + # residual connection output = core_out + inp else: - ##### positionwise feed-forward + # positionwise feed-forward core_out = self.layer_1(inp) core_out = self.drop_1(core_out, training=training) core_out = self.layer_2(core_out) core_out = self.drop_2(core_out, training=training) - ##### residual connection + layer normalization + # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output @@ -206,7 +206,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head - #### compute attention score + # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head @@ -218,7 +218,7 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): attn_score = AC + BD attn_score = attn_score * self.scale - #### compute attention probability + # compute attention probability if attn_mask is not None: attn_mask_t = attn_mask[:, :, None, None] attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t @@ -231,22 +231,22 @@ class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer): if head_mask is not None: attn_prob = attn_prob * head_mask - #### compute attention vector + # compute attention vector attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v) # [qlen x bsz x n_head x d_head] attn_vec_sizes = shape_list(attn_vec) attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head)) - ##### linear projection + # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out, training=training) if self.pre_lnorm: - ##### residual connection + # residual connection outputs = [w + attn_out] else: - ##### residual connection + layer normalization + # residual connection + 
layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 9e48856a642..4bc8df2dafc 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -190,7 +190,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): (h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems, target_mapping, head_mask) = inputs if g is not None: - ###### Two-stream attention with relative positional encoding. + # Two-stream attention with relative positional encoding. # content based attention score if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) @@ -206,7 +206,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # position-based key head k_head_r = tf.einsum("ibh,hnd->ibnd", r, self.r) - ##### h-stream + # h-stream # content-stream query head q_head_h = tf.einsum("ibh,hnd->ibnd", h, self.q) @@ -221,7 +221,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # post processing output_h = self.post_attention([h, attn_vec_h], training=training) - ##### g-stream + # g-stream # query-stream query head q_head_g = tf.einsum("ibh,hnd->ibnd", g, self.q) @@ -251,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_prob = attn_prob_h, attn_prob_g else: - ###### Multi-head attention with relative positional encoding + # Multi-head attention with relative positional encoding if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: @@ -552,7 +552,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32 - ##### Attention mask + # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) @@ -597,7 +597,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): else: non_tgt_mask = None - ##### Word embeddings and prepare h & g hidden states + # Word 
embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: @@ -612,7 +612,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): else: output_g = None - ##### Segment embedding + # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32) @@ -624,7 +624,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): else: seg_mat = None - ##### Positional encoding + # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float) pos_emb = self.dropout(pos_emb, training=training) diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index a9040b53db2..4ac524ee81f 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -213,16 +213,16 @@ class PositionwiseFF(nn.Module): def forward(self, inp): if self.pre_lnorm: - ##### layer normalization + positionwise feed-forward + # layer normalization + positionwise feed-forward core_out = self.CoreNet(self.layer_norm(inp)) - ##### residual connection + # residual connection output = core_out + inp else: - ##### positionwise feed-forward + # positionwise feed-forward core_out = self.CoreNet(inp) - ##### residual connection + layer normalization + # residual connection + layer normalization output = self.layer_norm(inp + core_out) return output @@ -316,7 +316,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head - #### compute attention score + # compute attention score rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head @@ -328,7 +328,7 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): attn_score = AC + BD attn_score.mul_(self.scale) - #### compute attention probability + # compute attention probability if 
attn_mask is not None and torch.sum(attn_mask).item(): attn_mask = attn_mask == 1 # Switch to bool if attn_mask.dim() == 2: @@ -352,21 +352,21 @@ class RelPartialLearnableMultiHeadAttn(nn.Module): if head_mask is not None: attn_prob = attn_prob * head_mask - #### compute attention vector + # compute attention vector attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v)) # [qlen x bsz x n_head x d_head] attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) - ##### linear projection + # linear projection attn_out = self.o_net(attn_vec) attn_out = self.drop(attn_out) if self.pre_lnorm: - ##### residual connection + # residual connection outputs = [w + attn_out] else: - ##### residual connection + layer normalization + # residual connection + layer normalization outputs = [self.layer_norm(w + attn_out)] if self.output_attentions: diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index d749f1d1226..f87e09a3bcc 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -330,7 +330,7 @@ class XLNetRelativeAttention(nn.Module): def forward(self, h, g, attn_mask_h, attn_mask_g, r, seg_mat, mems=None, target_mapping=None, head_mask=None): if g is not None: - ###### Two-stream attention with relative positional encoding. + # Two-stream attention with relative positional encoding. 
# content based attention score if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) @@ -346,7 +346,7 @@ class XLNetRelativeAttention(nn.Module): # position-based key head k_head_r = torch.einsum("ibh,hnd->ibnd", r, self.r) - ##### h-stream + # h-stream # content-stream query head q_head_h = torch.einsum("ibh,hnd->ibnd", h, self.q) @@ -361,7 +361,7 @@ class XLNetRelativeAttention(nn.Module): # post processing output_h = self.post_attention(h, attn_vec_h) - ##### g-stream + # g-stream # query-stream query head q_head_g = torch.einsum("ibh,hnd->ibnd", g, self.q) @@ -391,7 +391,7 @@ class XLNetRelativeAttention(nn.Module): attn_prob = attn_prob_h, attn_prob_g else: - ###### Multi-head attention with relative positional encoding + # Multi-head attention with relative positional encoding if mems is not None and mems.dim() > 1: cat = torch.cat([mems, h], dim=0) else: @@ -804,7 +804,7 @@ class XLNetModel(XLNetPreTrainedModel): dtype_float = next(self.parameters()).dtype device = next(self.parameters()).device - ##### Attention mask + # Attention mask # causal attention mask if self.attn_type == "uni": attn_mask = self.create_mask(qlen, mlen) @@ -849,7 +849,7 @@ class XLNetModel(XLNetPreTrainedModel): else: non_tgt_mask = None - ##### Word embeddings and prepare h & g hidden states + # Word embeddings and prepare h & g hidden states if inputs_embeds is not None: word_emb_k = inputs_embeds else: @@ -864,7 +864,7 @@ class XLNetModel(XLNetPreTrainedModel): else: output_g = None - ##### Segment embedding + # Segment embedding if token_type_ids is not None: # Convert `token_type_ids` to one-hot `seg_mat` if mlen > 0: @@ -879,7 +879,7 @@ class XLNetModel(XLNetPreTrainedModel): else: seg_mat = None - ##### Positional encoding + # Positional encoding pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz) pos_emb = self.dropout(pos_emb) diff --git a/transformers/optimization_tf.py b/transformers/optimization_tf.py index 18c261f6c66..83eff902fbd 100644 
--- a/transformers/optimization_tf.py +++ b/transformers/optimization_tf.py @@ -178,7 +178,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam): return True -## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py +# Inspired by https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py class GradientAccumulator(object): """Distribution strategies-aware gradient accumulation utility."""