diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000..7a6ba382df2 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.py eol=lf +*.rst eol=lf +*.md eol=lf \ No newline at end of file diff --git a/examples/research_projects/bertology/run_prune_gpt.py b/examples/research_projects/bertology/run_prune_gpt.py index e89a6a4fdcb..7e88f3081e9 100644 --- a/examples/research_projects/bertology/run_prune_gpt.py +++ b/examples/research_projects/bertology/run_prune_gpt.py @@ -1,388 +1,388 @@ -#!/usr/bin/env python3 -""" This script is adapted from the Bertology pruning code (https://github.com/huggingface/transformers/blob/783d7d2629e97c5f0c5f9ef01b8c66410275c204/examples/research_projects/bertology/run_bertology.py) -to prune GPT-like models. The author is @altsoph. -""" - -import argparse -import logging -import os -from datetime import datetime - -import numpy as np -import torch -from torch.utils.data import DataLoader, RandomSampler, TensorDataset -from tqdm import tqdm - -from transformers import GPT2LMHeadModel - - -logger = logging.getLogger(__name__) - - -def save_model(model, dirpath): - # save results - if os.path.exists(dirpath): - if os.path.exists(os.path.join(dirpath, "config.json")) and os.path.isfile( - os.path.join(dirpath, "config.json") - ): - os.remove(os.path.join(dirpath, "config.json")) - if os.path.exists(os.path.join(dirpath, "pytorch_model.bin")) and os.path.isfile( - os.path.join(dirpath, "pytorch_model.bin") - ): - os.remove(os.path.join(dirpath, "pytorch_model.bin")) - else: - os.makedirs(dirpath) - model.save_pretrained(dirpath) - - -def entropy(p, unlogit=False): - """ Compute the entropy of a probability distribution """ - exponent = 2 - if unlogit: - p = torch.pow(p, exponent) - plogp = p * torch.log(p) - plogp[p == 0] = 0 - return -plogp.sum(dim=-1) - - -def print_2d_tensor(tensor): - """ Print a 2D tensor """ - logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor)))) - for row in range(len(tensor)): - if tensor.dtype != torch.long: - logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data)) - else: - logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data)) - - -def compute_heads_importance( - args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False -): - """This method shows how to compute: - - head attention entropy - - head importance scores according to http://arxiv.org/abs/1905.10650 - """ - # Prepare our tensors - n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads - head_importance = torch.zeros(n_layers, n_heads).to(args.device) - attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) - - if head_mask is None: - head_mask = torch.ones(n_layers, n_heads).to(args.device) - - head_mask.requires_grad_(requires_grad=True) - # If actually pruned attention multi-head, set head mask to None to avoid shape mismatch - if actually_pruned: - head_mask = None - - tot_tokens = 0.0 - total_loss = 0.0 - for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): - inputs = tuple(t.to(args.device) for t in inputs) - (input_ids,) = inputs - - # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) - outputs = model(input_ids, labels=input_ids, head_mask=head_mask) - # (loss), lm_logits, presents, (all hidden_states), (attentions) - loss, _, all_attentions = ( - 
outputs[0], - outputs[1], - outputs[-1], - ) # Loss and logits are the first, attention the last - loss.backward() # Backpropagate to populate the gradients in the head mask - total_loss += loss.detach().cpu().numpy() - if compute_entropy: - for layer, attn in enumerate(all_attentions): - masked_entropy = entropy(attn.detach(), True) - attn_entropy[layer] += masked_entropy.sum(-1).sum(0).sum(0).detach() - - if compute_importance: - head_importance += head_mask.grad.abs().detach() - tot_tokens += torch.ones_like(input_ids).float().detach().sum().data - - # Normalize - attn_entropy /= tot_tokens - head_importance /= tot_tokens - # Layerwise importance normalization - if not args.dont_normalize_importance_by_layer: - exponent = 2 - norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent) - head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20 - - if not args.dont_normalize_global_importance: - head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min()) - - # Print matrices - if compute_entropy: - logger.info("Attention entropies") - print_2d_tensor(attn_entropy) - if compute_importance: - logger.info("Head importance scores") - print_2d_tensor(head_importance) - logger.info("Head ranked by importance scores") - head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device) - head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange( - head_importance.numel(), device=args.device - ) - head_ranks = head_ranks.view_as(head_importance) - print_2d_tensor(head_ranks) - return attn_entropy, head_importance, total_loss - - -def mask_heads(args, model, eval_dataloader): - """This method shows how to mask head (set some heads to zero), to test the effect on the network, - based on the head importance scores, as described in Michel et al. 
(http://arxiv.org/abs/1905.10650) - """ - _, head_importance, loss = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False) - original_score = 1 / loss # instead of downsteam score use the LM loss - logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold) - - new_head_mask = torch.ones_like(head_importance) - num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount)) - - current_score = original_score - while current_score >= original_score * args.masking_threshold: - head_mask = new_head_mask.clone().detach() # save current head mask - # heads from least important to most - keep only not-masked heads - head_importance[head_mask == 0.0] = float("Inf") - current_heads_to_mask = head_importance.view(-1).sort()[1] - - if len(current_heads_to_mask) <= num_to_mask: - print("BREAK BY num_to_mask") - break - - # mask heads - current_heads_to_mask = current_heads_to_mask[:num_to_mask] - logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist())) - new_head_mask = new_head_mask.view(-1) - new_head_mask[current_heads_to_mask] = 0.0 - new_head_mask = new_head_mask.view_as(head_mask) - new_head_mask = new_head_mask.clone().detach() - print_2d_tensor(new_head_mask) - - # Compute metric and head importance again - _, head_importance, loss = compute_heads_importance( - args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask - ) - current_score = 1 / loss - logger.info( - "Masking: current score: %f, remaining heads %d (%.1f percents)", - current_score, - new_head_mask.sum(), - new_head_mask.sum() / new_head_mask.numel() * 100, - ) - - logger.info("Final head mask") - print_2d_tensor(head_mask) - np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy()) - - return head_mask - - -def prune_heads(args, model, eval_dataloader, head_mask): - """This method shows how to prune head (remove heads weights) based on - the head importance scores as described in Michel et al. 
(http://arxiv.org/abs/1905.10650) - """ - # Try pruning and test time speedup - # Pruning is like masking but we actually remove the masked weights - before_time = datetime.now() - _, _, loss = compute_heads_importance( - args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask - ) - score_masking = 1 / loss - original_time = datetime.now() - before_time - - original_num_params = sum(p.numel() for p in model.parameters()) - heads_to_prune = dict( - (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask)) - ) - - for k, v in heads_to_prune.items(): - if isinstance(v, int): - heads_to_prune[k] = [ - v, - ] - - assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item() - model.prune_heads(heads_to_prune) - pruned_num_params = sum(p.numel() for p in model.parameters()) - - before_time = datetime.now() - _, _, loss = compute_heads_importance( - args, - model, - eval_dataloader, - compute_entropy=False, - compute_importance=False, - head_mask=None, - actually_pruned=True, - ) - - score_pruning = 1 / loss - new_time = datetime.now() - before_time - - logger.info( - "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", - original_num_params, - pruned_num_params, - pruned_num_params / original_num_params * 100, - ) - logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning) - logger.info("Pruning: speed ratio (original timing / new timing): %f percents", original_time / new_time * 100) - save_model(model, args.output_dir) - - -def main(): - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--data_dir", - default=None, - type=str, - required=True, - help="The input data dir. Should contain the .tsv files (or other data files) for the task.", - ) - parser.add_argument( - "--model_name_or_path", - default=None, - type=str, - required=True, - help="Path to pretrained model or model identifier from huggingface.co/models", - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - required=True, - help="The output directory where the model predictions and checkpoints will be written.", - ) - - # Other parameters - parser.add_argument( - "--config_name", - default="", - type=str, - help="Pretrained config name or path if not the same as model_name_or_path", - ) - parser.add_argument( - "--tokenizer_name", - default="", - type=str, - help="Pretrained tokenizer name or path if not the same as model_name_or_path", - ) - parser.add_argument( - "--cache_dir", - default=None, - type=str, - help="Where do you want to store the pre-trained models downloaded from s3", - ) - parser.add_argument( - "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances." - ) - parser.add_argument( - "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory" - ) - parser.add_argument( - "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" - ) - - parser.add_argument( - "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers" - ) - parser.add_argument( - "--dont_normalize_global_importance", - action="store_true", - help="Don't normalize all importance scores between 0 and 1", - ) - - parser.add_argument( - "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy." 
- ) - parser.add_argument( - "--masking_threshold", - default=0.9, - type=float, - help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).", - ) - parser.add_argument( - "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step." - ) - parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.") - - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after WordPiece tokenization. \n" - "Sequences longer than this will be truncated, sequences shorter padded.", - ) - parser.add_argument("--batch_size", default=1, type=int, help="Batch size.") - - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") - parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") - parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") - parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") - args = parser.parse_args() - - if args.server_ip and args.server_port: - # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script - import ptvsd - - print("Waiting for debugger attach") - ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) - ptvsd.wait_for_attach() - - # Setup devices and distributed training - if args.local_rank == -1 or args.no_cuda: - args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() - else: - torch.cuda.set_device(args.local_rank) - args.device = torch.device("cuda", args.local_rank) - args.n_gpu = 1 - torch.distributed.init_process_group(backend="nccl") # Initializes the distributed backend - - # Setup logging - logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) - logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1))) - - model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path) - - # Distributed and parallel training - model.to(args.device) - if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True - ) - elif args.n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Print/save training arguments - os.makedirs(args.output_dir, exist_ok=True) - torch.save(args, os.path.join(args.output_dir, "run_args.bin")) - logger.info("Training/evaluation parameters %s", args) - - # Prepare dataset - numpy_data = np.concatenate( - [ - np.loadtxt(args.data_dir, dtype=np.int64), - ] - ) - train_tensor_dataset = (torch.from_numpy(numpy_data),) - train_data = TensorDataset(*train_tensor_dataset) - train_sampler = RandomSampler(train_data) - eval_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) - - # Compute head entropy and importance score - compute_heads_importance(args, model, eval_dataloader) - - # Try head masking (set heads to zero until the score goes under a threshole) - # and head pruning (remove masked heads and see the effect on the network) - if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0: 
- head_mask = mask_heads(args, model, eval_dataloader) - prune_heads(args, model, eval_dataloader, head_mask) - - -if __name__ == "__main__": - main() +#!/usr/bin/env python3 +""" This script is adapted from the Bertology pruning code (https://github.com/huggingface/transformers/blob/783d7d2629e97c5f0c5f9ef01b8c66410275c204/examples/research_projects/bertology/run_bertology.py) +to prune GPT-like models. The author is @altsoph. +""" + +import argparse +import logging +import os +from datetime import datetime + +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, TensorDataset +from tqdm import tqdm + +from transformers import GPT2LMHeadModel + + +logger = logging.getLogger(__name__) + + +def save_model(model, dirpath): + # save results + if os.path.exists(dirpath): + if os.path.exists(os.path.join(dirpath, "config.json")) and os.path.isfile( + os.path.join(dirpath, "config.json") + ): + os.remove(os.path.join(dirpath, "config.json")) + if os.path.exists(os.path.join(dirpath, "pytorch_model.bin")) and os.path.isfile( + os.path.join(dirpath, "pytorch_model.bin") + ): + os.remove(os.path.join(dirpath, "pytorch_model.bin")) + else: + os.makedirs(dirpath) + model.save_pretrained(dirpath) + + +def entropy(p, unlogit=False): + """ Compute the entropy of a probability distribution """ + exponent = 2 + if unlogit: + p = torch.pow(p, exponent) + plogp = p * torch.log(p) + plogp[p == 0] = 0 + return -plogp.sum(dim=-1) + + +def print_2d_tensor(tensor): + """ Print a 2D tensor """ + logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor)))) + for row in range(len(tensor)): + if tensor.dtype != torch.long: + logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data)) + else: + logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data)) + + +def compute_heads_importance( + args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False +): + """This method shows how to compute: + - head attention entropy + - head importance scores according to http://arxiv.org/abs/1905.10650 + """ + # Prepare our tensors + n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads + head_importance = torch.zeros(n_layers, n_heads).to(args.device) + attn_entropy = torch.zeros(n_layers, n_heads).to(args.device) + + if head_mask is None: + head_mask = torch.ones(n_layers, n_heads).to(args.device) + + head_mask.requires_grad_(requires_grad=True) + # If actually pruned attention multi-head, set head mask to None to avoid shape mismatch + if actually_pruned: + head_mask = None + + tot_tokens = 0.0 + total_loss = 0.0 + for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): + inputs = tuple(t.to(args.device) for t in inputs) + (input_ids,) = inputs + + # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below) + outputs = model(input_ids, labels=input_ids, head_mask=head_mask) + # (loss), lm_logits, presents, (all hidden_states), (attentions) + loss, _, all_attentions = ( + outputs[0], + outputs[1], + outputs[-1], + ) # Loss and logits are the first, attention the last + loss.backward() # Backpropagate to populate the gradients in the head mask + total_loss += loss.detach().cpu().numpy() + if compute_entropy: + for layer, attn in enumerate(all_attentions): + masked_entropy = entropy(attn.detach(), True) + attn_entropy[layer] += 
masked_entropy.sum(-1).sum(0).sum(0).detach()
+
+        if compute_importance:
+            head_importance += head_mask.grad.abs().detach()
+        tot_tokens += torch.ones_like(input_ids).float().detach().sum().data
+
+    # Normalize
+    attn_entropy /= tot_tokens
+    head_importance /= tot_tokens
+    # Layerwise importance normalization
+    if not args.dont_normalize_importance_by_layer:
+        exponent = 2
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
+        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
+
+    if not args.dont_normalize_global_importance:
+        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
+
+    # Print matrices
+    if compute_entropy:
+        logger.info("Attention entropies")
+        print_2d_tensor(attn_entropy)
+    if compute_importance:
+        logger.info("Head importance scores")
+        print_2d_tensor(head_importance)
+        logger.info("Head ranked by importance scores")
+        head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
+        head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
+            head_importance.numel(), device=args.device
+        )
+        head_ranks = head_ranks.view_as(head_importance)
+        print_2d_tensor(head_ranks)
+    return attn_entropy, head_importance, total_loss
+
+
+def mask_heads(args, model, eval_dataloader):
+    """This method shows how to mask heads (set some heads to zero), to test the effect on the network,
+    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    _, head_importance, loss = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
+    original_score = 1 / loss  # instead of a downstream score, use the inverse LM loss
+    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
+
+    new_head_mask = torch.ones_like(head_importance)
+    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
+
+    current_score = original_score
+    while current_score >= original_score * args.masking_threshold:
+        head_mask = new_head_mask.clone().detach()  # save current head mask
+        # sort heads from least important to most - keep only non-masked heads
+        head_importance[head_mask == 0.0] = float("Inf")
+        current_heads_to_mask = head_importance.view(-1).sort()[1]
+
+        if len(current_heads_to_mask) <= num_to_mask:
+            print("BREAK BY num_to_mask")
+            break
+
+        # mask heads
+        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
+        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
+        new_head_mask = new_head_mask.view(-1)
+        new_head_mask[current_heads_to_mask] = 0.0
+        new_head_mask = new_head_mask.view_as(head_mask)
+        new_head_mask = new_head_mask.clone().detach()
+        print_2d_tensor(new_head_mask)
+
+        # Compute metric and head importance again
+        _, head_importance, loss = compute_heads_importance(
+            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
+        )
+        current_score = 1 / loss
+        logger.info(
+            "Masking: current score: %f, remaining heads %d (%.1f percent)",
+            current_score,
+            new_head_mask.sum(),
+            new_head_mask.sum() / new_head_mask.numel() * 100,
+        )
+
+    logger.info("Final head mask")
+    print_2d_tensor(head_mask)
+    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
+
+    return head_mask
+
+
+def prune_heads(args, model, eval_dataloader, head_mask):
+    """This method shows how to prune heads (remove head weights) based on
+    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    # Try pruning and test time speedup
+    # Pruning is like masking but we actually remove the masked weights
+    before_time = datetime.now()
+    _, _, loss = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
+    )
+    score_masking = 1 / loss
+    original_time = datetime.now() - before_time
+
+    original_num_params = sum(p.numel() for p in model.parameters())
+    heads_to_prune = dict(
+        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
+    )
+
+    for k, v in heads_to_prune.items():
+        if isinstance(v, int):
+            heads_to_prune[k] = [
+                v,
+            ]
+
+    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
+    model.prune_heads(heads_to_prune)
+    pruned_num_params = sum(p.numel() for p in model.parameters())
+
+    before_time = datetime.now()
+    _, _, loss = compute_heads_importance(
+        args,
+        model,
+        eval_dataloader,
+        compute_entropy=False,
+        compute_importance=False,
+        head_mask=None,
+        actually_pruned=True,
+    )
+
+    score_pruning = 1 / loss
+    new_time = datetime.now() - before_time
+
+    logger.info(
+        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percent)",
+        original_num_params,
+        pruned_num_params,
+        pruned_num_params / original_num_params * 100,
+    )
+    logger.info("Pruning: score with masking: %f, score with pruning: %f", score_masking, score_pruning)
+    logger.info("Pruning: speed ratio (original timing / new timing): %f percent", original_time / new_time * 100)
+    save_model(model, args.output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to the input data file: whitespace-separated token ids loadable with np.loadtxt.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default=None,
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
+    )
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+
+    parser.add_argument(
+        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
+    )
+    parser.add_argument(
+        "--dont_normalize_global_importance",
+        action="store_true",
+        help="Don't normalize all importance scores between 0 and 1",
+    )
+
+    parser.add_argument(
+        "--try_masking", action="store_true", help="Whether to try masking heads until the score falls below a threshold."
+    )
+    parser.add_argument(
+        "--masking_threshold",
+        default=0.9,
+        type=float,
+        help="Masking threshold in terms of metrics (stop masking when metric < threshold * original metric value).",
+    )
+    parser.add_argument(
+        "--masking_amount", default=0.1, type=float, help="Fraction of heads to mask at each masking step."
+    )
+    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
+
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after tokenization. \n"
+        "Sequences longer than this will be truncated, sequences shorter padded.",
+    )
+    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
+
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup devices and distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        args.device = torch.device("cuda", args.local_rank)
+        args.n_gpu = 1
+        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend
+
+    # Setup logging
+    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
+
+    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
+
+    # Distributed and parallel training
+    model.to(args.device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Print/save training arguments
+    os.makedirs(args.output_dir, exist_ok=True)
+    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Prepare dataset
+    numpy_data = np.concatenate(
+        [
+            np.loadtxt(args.data_dir, dtype=np.int64),
+        ]
+    )
+    train_tensor_dataset = (torch.from_numpy(numpy_data),)
+    train_data = TensorDataset(*train_tensor_dataset)
+    train_sampler = RandomSampler(train_data)
+    eval_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)
+
+    # Compute head entropy and importance score
+    compute_heads_importance(args, model, eval_dataloader)
+
+    # Try head masking (set heads to zero until the score goes under a threshold)
+    # and head pruning (remove masked heads and see the effect on the network)
+    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
+ head_mask = mask_heads(args, model, eval_dataloader) + prune_heads(args, model, eval_dataloader, head_mask) + + +if __name__ == "__main__": + main() diff --git a/tests/test_modeling_deberta.py b/tests/test_modeling_deberta.py index cc76c725195..f2af7ce4308 100644 --- a/tests/test_modeling_deberta.py +++ b/tests/test_modeling_deberta.py @@ -1,290 +1,290 @@ -# coding=utf-8 -# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import random -import unittest - -import numpy as np - -from transformers import is_torch_available -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device - -from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, ids_tensor - - -if is_torch_available(): - import torch - - from transformers import ( - DebertaConfig, - DebertaForMaskedLM, - DebertaForQuestionAnswering, - DebertaForSequenceClassification, - DebertaForTokenClassification, - DebertaModel, - ) - from transformers.models.deberta.modeling_deberta import DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST - - -@require_torch -class DebertaModelTest(ModelTesterMixin, unittest.TestCase): - - all_model_classes = ( - ( - DebertaModel, - DebertaForMaskedLM, - DebertaForSequenceClassification, - DebertaForTokenClassification, - DebertaForQuestionAnswering, - ) - if is_torch_available() - else () - ) - - test_torchscript = False - test_pruning = False - test_head_masking = False - is_encoder_decoder = False - - class DebertaModelTester(object): - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - relative_attention=False, - position_biased_input=True, - pos_att_type="None", - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - 
self.relative_attention = relative_attention - self.position_biased_input = position_biased_input - self.pos_att_type = pos_att_type - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = DebertaConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - relative_attention=self.relative_attention, - position_biased_input=self.position_biased_input, - pos_att_type=self.pos_att_type, - ) - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def check_loss_output(self, result): - self.parent.assertListEqual(list(result.loss.size()), []) - - def create_and_check_deberta_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaModel(config=config) - model.to(torch_device) - model.eval() - sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] - sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] - sequence_output = model(input_ids)[0] - - self.parent.assertListEqual( - list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size] - ) - - def create_and_check_deberta_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaForMaskedLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_deberta_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DebertaForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) - self.check_loss_output(result) - - def create_and_check_deberta_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = DebertaForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = 
model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_deberta_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = DebertaForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - def setUp(self): - self.model_tester = DebertaModelTest.DebertaModelTester(self) - self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_deberta_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_model(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = DebertaModel.from_pretrained(model_name) - self.assertIsNotNone(model) - - -@require_torch -@require_sentencepiece -@require_tokenizers -class DebertaModelIntegrationTest(unittest.TestCase): - @unittest.skip(reason="Model not available yet") - def test_inference_masked_lm(self): - pass - - @slow - def test_inference_no_head(self): - random.seed(0) - np.random.seed(0) - torch.manual_seed(0) - torch.cuda.manual_seed_all(0) - model = DebertaModel.from_pretrained("microsoft/deberta-base") - - input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) - output = model(input_ids)[0] - # compare the actual values for a slice. - expected_slice = torch.tensor( - [[[-0.0218, -0.6641, -0.3665], [-0.3907, -0.4716, -0.6640], [0.7461, 1.2570, -0.9063]]] - ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4), f"{output[:, :3, :3]}") +# coding=utf-8 +# Copyright 2018 Microsoft Authors and the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import random +import unittest + +import numpy as np + +from transformers import is_torch_available +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor + + +if is_torch_available(): + import torch + + from transformers import ( + DebertaConfig, + DebertaForMaskedLM, + DebertaForQuestionAnswering, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaModel, + ) + from transformers.models.deberta.modeling_deberta import DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST + + +@require_torch +class DebertaModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + DebertaModel, + DebertaForMaskedLM, + DebertaForSequenceClassification, + DebertaForTokenClassification, + DebertaForQuestionAnswering, + ) + if is_torch_available() + else () + ) + + test_torchscript = False + test_pruning = False + test_head_masking = False + is_encoder_decoder = False + + class DebertaModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + relative_attention=False, + position_biased_input=True, + pos_att_type="None", + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.relative_attention = relative_attention + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if 
self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = DebertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + relative_attention=self.relative_attention, + position_biased_input=self.position_biased_input, + pos_att_type=self.pos_att_type, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def check_loss_output(self, result): + self.parent.assertListEqual(list(result.loss.size()), []) + + def create_and_check_deberta_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaModel(config=config) + model.to(torch_device) + model.eval() + sequence_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] + sequence_output = model(input_ids)[0] + + self.parent.assertListEqual( + list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_deberta_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_deberta_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) + self.check_loss_output(result) + + def create_and_check_deberta_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = DebertaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_deberta_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = DebertaForQuestionAnswering(config=config) + 
model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = DebertaModelTest.DebertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_deberta_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_sequence_classification(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_question_answering(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_deberta_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DebertaModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +@require_sentencepiece +@require_tokenizers +class DebertaModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + model = DebertaModel.from_pretrained("microsoft/deberta-base") + + input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + output = model(input_ids)[0] + # compare the actual values for a slice. + expected_slice = torch.tensor( + [[[-0.0218, -0.6641, -0.3665], [-0.3907, -0.4716, -0.6640], [0.7461, 1.2570, -0.9063]]] + ) + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4), f"{output[:, :3, :3]}")
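
Reviewer note: aside from the new .gitattributes rules, this diff is a line-ending normalization, so the Python content of both files carries over essentially unchanged. For anyone trying out run_prune_gpt.py, here is a minimal, self-contained sketch of the technique the script wraps: estimating per-head importance from the gradient of the LM loss with respect to a differentiable head mask (Michel et al., http://arxiv.org/abs/1905.10650), then zeroing the least important heads. The "gpt2" checkpoint, the toy input sentence, and the 10% masking fraction are illustrative assumptions, not part of this patch, and a reasonably recent transformers version is assumed.

# Illustrative sketch, not part of the patch.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("gpt2")  # placeholder checkpoint
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model.eval()

n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
# Differentiable mask over every head: 1.0 keeps a head active, 0.0 silences it.
head_mask = torch.ones(n_layers, n_heads, requires_grad=True)

input_ids = tokenizer("Hello world, this is a test.", return_tensors="pt").input_ids
outputs = model(input_ids, labels=input_ids, head_mask=head_mask)
loss = outputs[0]
loss.backward()  # populates head_mask.grad

# |d(loss)/d(mask)| is the per-head importance proxy used by the script.
importance = head_mask.grad.abs()

# Zero out the least important ~10% of heads (mirrors --masking_amount 0.1).
num_to_mask = max(1, int(importance.numel() * 0.1))
least_important = importance.view(-1).sort()[1][:num_to_mask]
new_mask = torch.ones(n_layers * n_heads)
new_mask[least_important] = 0.0
new_mask = new_mask.view(n_layers, n_heads)
print(new_mask)  # pass back via model(..., head_mask=new_mask) to re-score

The full script repeats this step in a loop, re-estimating importance under each new mask, stops once 1 / LM-loss drops below --masking_threshold times its original value, and finally calls model.prune_heads() to physically remove the masked weights.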