Fix doc errors and typos across the board (#8139)

* Fix doc errors and typos across the board
* Fix a typo
* Fix the CI
* Fix more typos
* Fix CI
* More fixes
* Fix CI
* More fixes
* More fixes

parent 4731a00c3e
commit 969859d5f6
@@ -96,7 +96,7 @@ folder.

## Start contributing! (Pull Requests)

-Before writing code, we strongly advise you to search through the exising PRs or
+Before writing code, we strongly advise you to search through the existing PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.

@@ -235,7 +235,7 @@ Follow these steps to start contributing:

### Checklist

1. The title of your pull request should be a summary of its contribution;
-2. If your pull request adresses an issue, please mention the issue number in
+2. If your pull request addresses an issue, please mention the issue number in
   the pull request description to make sure they are linked (and people
   consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`. These
@@ -80,9 +80,9 @@ cache home followed by ``/transformers/`` (even if you don't have PyTorch instal

So if you don't have any specific environment variable set, the cache directory will be at
``~/.cache/torch/transformers/``.

-**Note:** If you have set a shell enviromnent variable for one of the predecessors of this library
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
-enviromnent variable for ``TRANSFORMERS_CACHE``.
+environment variable for ``TRANSFORMERS_CACHE``.

### Note on model downloads (Continuous Integration or large-scale deployments)
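To make the precedence described in this hunk concrete, here is a minimal sketch (not the library's own code) of how a cache directory could be resolved while honoring the legacy variables; the default path and variable names are the ones given in the text.

```python
import os

def resolve_cache_dir() -> str:
    """Prefer TRANSFORMERS_CACHE, then the legacy variables, then the default path."""
    default = os.path.join(os.path.expanduser("~"), ".cache", "torch", "transformers")
    for var in ("TRANSFORMERS_CACHE", "PYTORCH_TRANSFORMERS_CACHE", "PYTORCH_PRETRAINED_BERT_CACHE"):
        if os.environ.get(var):
            return os.environ[var]
    return default
```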
@@ -20,7 +20,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt

The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.

-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).

In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
@@ -109,7 +109,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()

-### In 🤗 Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
### and used like this:
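To see the corrected sentence in context, here is a minimal end-to-end sketch of the pattern this hunk documents; the hyperparameter values are illustrative assumptions, and `model` / `train_data` are assumed to be defined as in the surrounding migration guide.

```python
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

# Illustrative hyperparameters -- not prescribed by the guide.
lr, num_warmup_steps, num_training_steps, max_grad_norm = 3e-5, 100, 1000, 1.0

optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

for batch in train_data:  # `model` and `train_data` come from the guide's earlier snippet
    loss = model(**batch)[0]
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()
    scheduler.step()  # update the learning rate schedule
    optimizer.zero_grad()
```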
@@ -119,7 +119,7 @@ Other files can safely be deleted.

Upload your model with the CLI
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Now go in a terminal and run the following command. It should be in the virtual enviromnent where you installed 🤗
+Now go in a terminal and run the following command. It should be in the virtual environment where you installed 🤗
Transformers, since that command :obj:`transformers-cli` comes from the library.

.. code-block::
@@ -510,8 +510,8 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config

Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am
-concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overriden in the pipeline,
-as is shown above for the argument ``max_length``.
+concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overridden in the
+pipeline, as is shown above for the argument ``max_length``.

Here is an example of text generation using ``XLNet`` and its tokenzier.
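The point about overriding ``generate()`` defaults through the pipeline can be shown with a short, self-contained sketch; the checkpoint name and prompt below are illustrative assumptions, not taken from the documentation page being patched.

```python
from transformers import pipeline

# A text-generation pipeline; "gpt2" is used purely as an example checkpoint.
generator = pipeline("text-generation", model="gpt2")

# Arguments of `PreTrainedModel.generate()` (e.g. `max_length`, `do_sample`)
# can be overridden directly in the pipeline call.
print(generator("As far as I am concerned, I will", max_length=50, do_sample=True))
```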
@@ -291,10 +291,9 @@ def hans_convert_examples_to_features(

    Args:
        examples: List of ``InputExamples`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples.
        max_length: Maximum example length.
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
-        max_length: Maximum example length.
-        tokenizer: Instance of a tokenizer that will tokenize the examples.

    Returns:
        A list of task-specific ``InputFeatures`` which can be fed to the model.
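Based only on the argument list documented in this hunk, a call to the helper might look like the sketch below; the processor class, data directory, and argument order are assumptions made for illustration.

```python
from transformers import AutoTokenizer

# Hypothetical usage sketch inferred from the docstring above; names and paths are assumptions.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
processor = HansProcessor()                        # assumed HANS data processor
examples = processor.get_train_examples("./hans")  # assumed data directory
features = hans_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=128,
    label_list=processor.get_labels(),
    output_mode="classification",
)
```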
@@ -155,7 +155,7 @@ class BertModelWithPabee(BertModel):
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
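For context on the comment being fixed, a standalone sketch of how a 2D padding mask is usually made broadcastable to ``[batch_size, num_heads, seq_length, seq_length]`` is shown below; it mirrors the additive-mask pattern visible elsewhere in this diff rather than this file's own code.

```python
import torch

def make_broadcastable(attention_mask: torch.Tensor) -> torch.Tensor:
    """Turn a (batch_size, seq_length) mask of 1s/0s into an additive mask
    broadcastable to (batch_size, num_heads, seq_length, seq_length)."""
    extended = attention_mask[:, None, None, :].float()  # (bs, 1, 1, seq_len)
    return (1.0 - extended) * -10000.0                   # 0 where attended, -10000 where masked

mask = torch.tensor([[1, 1, 1, 0]])
print(make_broadcastable(mask).shape)  # torch.Size([1, 1, 1, 4])
```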
@@ -198,7 +198,7 @@ class DeeBertModel(BertPreTrainedModel):
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_attention_mask.dim() == 3:
            encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
        if encoder_attention_mask.dim() == 2:

@@ -260,7 +260,7 @@ class BertHighway(nn.Module):

        # BertModel
        bmodel_output = (pooler_input, pooler_output) + encoder_outputs[1:]
-        # "return" bodel_output
+        # "return" bmodel_output

        # Dropout and classification
        pooled_output = bmodel_output[1]

@@ -265,7 +265,7 @@ class Distiller:
        -------
        token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
        attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
+        clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
        """
        token_ids, lengths = batch
        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -401,9 +401,9 @@ class Distiller:
        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
-            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_length, voc_size)
        else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_length, voc_size)
        s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
        t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
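For readers unfamiliar with this distillation detail, the masking pattern in the hunk above can be reproduced in isolation as follows; shapes and values are illustrative.

```python
import torch

bs, seq_len, voc_size = 2, 4, 6
s_logits = torch.randn(bs, seq_len, voc_size)       # student logits
t_logits = torch.randn(bs, seq_len, voc_size)       # teacher logits
lm_labels = torch.tensor([[5, -100, 2, -100],
                          [-100, 1, -100, -100]])   # -100 = nothing to predict

# Keep only the positions that carry an MLM label, as in restrict_ce_to_mask.
mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)         # (bs, seq_len, voc_size)
s_slct = torch.masked_select(s_logits, mask).view(-1, voc_size)   # (n_masked, voc_size)
t_slct = torch.masked_select(t_logits, mask).view(-1, voc_size)
print(s_slct.shape, t_slct.shape)
```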
@@ -61,7 +61,7 @@ class LmSeqsDataset(Dataset):

    def remove_long_sequences(self):
        """
-        Sequences that are too long are splitted by chunk of max_model_input_size.
+        Sequences that are too long are split by chunk of max_model_input_size.
        """
        max_len = self.params.max_model_input_size
        indices = self.lengths > max_len
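The behaviour the corrected docstring describes (splitting over-long sequences into chunks of at most ``max_model_input_size`` tokens) can be illustrated on plain lists; this is a sketch, not the dataset's actual implementation, which also handles special tokens.

```python
def split_into_chunks(token_ids, max_model_input_size):
    """Split a sequence that is too long into consecutive chunks."""
    return [token_ids[i : i + max_model_input_size]
            for i in range(0, len(token_ids), max_model_input_size)]

print(split_into_chunks(list(range(10)), max_model_input_size=4))
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```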
@@ -138,8 +138,8 @@ class LmSeqsDataset(Dataset):
        # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')

        # unk_idx = self.params.special_tok_ids['unk_token']
-        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
-        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
+        # nb_unknown = sum([(t==unk_idx).sum() for t in self.token_ids])
+        # logger.info(f'{nb_unknown} unknown tokens (covering {100*nb_unknown/data_len:.2f}% of the data)')

    def batch_sequences(self, batch):
        """

@@ -96,7 +96,7 @@ if __name__ == "__main__":
        compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]

    print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
+    print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")

-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
+    print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
    torch.save(compressed_sd, args.dump_checkpoint)
@@ -266,14 +266,14 @@ def find_top_rpn_proposals(
):
    """Args:
        proposals (list[Tensor]): (L, N, Hi*Wi*A, 4).
-        pred_objectness_logits: tensors of lenngth L.
+        pred_objectness_logits: tensors of length L.
        nms_thresh (float): IoU threshold to use for NMS
        pre_nms_topk (int): before nms
        post_nms_topk (int): after nms
        min_box_side_len (float): minimum proposal box side
        training (bool): True if proposals are to be used in training,
    Returns:
-        resuls (List[Dict]): stores post_nms_topk object proposals for image i.
+        results (List[Dict]): stores post_nms_topk object proposals for image i.
    """
    num_images = len(images)
    device = proposals[0].device

@@ -648,7 +648,7 @@ class RPNOutputs(object):
        images (ImageList): :class:`ImageList` instance representing N input images
        pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A, Hi, W)
        pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A*4, Hi, Wi)
-        anchors (list[torch.Tensor]): nested list ofboxes. anchors[i][j] at (n, l) stores anchor array for feature map l
+        anchors (list[torch.Tensor]): nested list of boxes. anchors[i][j] at (n, l) stores anchor array for feature map l
        boundary_threshold (int): if >= 0, then anchors that extend beyond the image boundary by more than boundary_thresh are not used in training.
        gt_boxes (list[Boxes], optional): A list of N elements.
        smooth_l1_beta (float): The transition point between L1 and L2 lossn. When set to 0, the loss becomes L1. When +inf, it is ignored

@@ -1186,7 +1186,7 @@ class ROIOutputs(object):
        attr_probs_all, attrs_all = self._predict_attrs(attr_logits, preds_per_image)
        features = features.split(preds_per_image, dim=0)

-        # fun for each image too, also I can expirement and do multiple images
+        # fun for each image too, also I can experiment and do multiple images
        final_results = []
        zipped = zip(boxes_all, obj_scores_all, attr_probs_all, attrs_all, sizes)
        for i, (boxes, obj_scores, attr_probs, attrs, size) in enumerate(zipped):

@@ -1412,7 +1412,7 @@ class AnchorGenerator(nn.Module):

    def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
        """
-        anchors are continious geometric rectangles
+        anchors are continuous geometric rectangles
        centered on one feature map point sample.
        We can later build the set of anchors
        for the entire feature map by tiling these tensors

@@ -1865,7 +1865,7 @@ class GeneralizedRCNN(nn.Module):
        scales_yx=None,
        **kwargs,
    ):
-        # run images through bacbone
+        # run images through backbone
        original_sizes = image_shapes * scales_yx
        features = self.backbone(images)
@@ -116,7 +116,7 @@ class Preprocess:
        images = self.aug(images)
        # transpose images and convert to torch tensors
        # images = [torch.as_tensor(i.astype("float32")).permute(2, 0, 1).to(self.device) for i in images]
-        # now normalize before pad to aoid useless arithmatic
+        # now normalize before pad to avoid useless arithmetic
        images = [self.normalizer(x) for x in images]
        # now pad them to do the following operations
        images, sizes = self.pad(images)

@@ -236,7 +236,7 @@ def compare(in_tensor):
        ), f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} % element-wise mismatch"
    raise Exception("tensors are all good")

-# Hugging face functiions below
+# Hugging face functions below


def is_remote_url(url_or_filename):

@@ -520,7 +520,7 @@ def get_image_from_url(url):
    return img


-# to load legace frcnn checkpoint from detectron
+# to load legacy frcnn checkpoint from detectron
def load_frcnn_pkl_from_url(url):
    fn = url.split("/")[-1]
    if fn not in os.listdir(os.getcwd()):
@@ -33,7 +33,7 @@ def main(args):
    remaining_count = 0 # Number of remaining (not pruned) params in the encoder
    encoder_count = 0 # Number of params in the encoder

-    print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
+    print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
    for name, param in st.items():
        if "encoder" not in name:
            continue

@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
            ) # We can specify head_mask for each layer
            head_mask = head_mask.to(
                dtype=next(self.parameters()).dtype
-            ) # switch to fload if need + fp16 compatibility
+            ) # switch to float if need + fp16 compatibility
        else:
            head_mask = [None] * self.config.num_hidden_layers
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
-    set_seed(args) # Added here for reproductibility
+    set_seed(args) # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

@@ -705,7 +705,7 @@ def main():
        "--final_lambda",
        default=0.0,
        type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
    )

    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")

@@ -816,7 +816,7 @@ def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")

@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
-    # Added here for reproductibility
+    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:

@@ -824,7 +824,7 @@ def main():
        "--final_lambda",
        default=0.0,
        type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
    )

    parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")

@@ -977,7 +977,7 @@ def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
class RagPyTorchDistributedRetriever(RagRetriever):
    """
    A distributed retriever built on top of the ``torch.distributed`` communication package. During training all workers
-    initalize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
+    initialize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
    in cpu memory. The index will also work well in a non-distributed setup.

    Args:

@@ -45,7 +45,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):

    def init_retrieval(self, distributed_port: int):
        """
-        Retriever initalization function, needs to be called from the training process. The function sets some common parameters
+        Retriever initialization function, needs to be called from the training process. The function sets some common parameters
        and environment variables. On top of that, (only) the main process in the process group loads the index into memory.

        Args:

@@ -56,7 +56,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):

        logger.info("initializing retrieval")

-        # initializing a separate process group for retrievel as the default
+        # initializing a separate process group for retrieval as the default
        # nccl backend doesn't support gather/scatter operations while gloo
        # is too slow to replace nccl for the core gpu communication
        if dist.is_initialized():

@@ -101,7 +101,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
            n_docs (:obj:`int`):
                The number of docs retrieved per query.

-        Ouput:
+        Output:
            retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`
                The retrieval embeddings of the retrieved docs per query.
            doc_ids (:obj:`np.ndarray` of shape :obj:`batch_size, n_docs`)
@@ -176,7 +176,7 @@ def get_args():
        choices=["e2e", "retrieval"],
        default="e2e",
        type=str,
-        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calulates precision@k.",
+        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calculates precision@k.",
    )
    parser.add_argument("--k", default=1, type=int, help="k for the precision@k calculation")
    parser.add_argument(

@@ -206,7 +206,7 @@ def get_args():
        "--predictions_path",
        type=str,
        default="predictions.txt",
-        help="Name of the predictions file, to be stored in the checkpoints directry",
+        help="Name of the predictions file, to be stored in the checkpoints directory",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
@@ -26,7 +26,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"


def split_text(text: str, n=100, character=" ") -> List[str]:
-    """Split the text every ``n``-th occurence of ``character``"""
+    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]
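Since ``split_text`` appears here in full, a quick usage example shows what the corrected docstring describes (the helper is repeated verbatim so the snippet runs on its own):

```python
from typing import List

def split_text(text: str, n=100, character=" ") -> List[str]:
    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]

print(split_text("one two three four five", n=2))
# ['one two', 'three four', 'five']
```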
@@ -44,7 +44,7 @@ class BertAbsConfig(PretrainedConfig):
        enc_ff_size: int
            The size of the encoder's feed-forward layers.
        enc_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the encoder.
        dec_layer: int

@@ -56,7 +56,7 @@ class BertAbsConfig(PretrainedConfig):
        dec_ff_size: int
            The size of the decoder's feed-forward layers.
        dec_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
            embeddings, layers, pooler and also the attention probabilities in
            the decoder.
    """

@@ -152,7 +152,7 @@ class TransformerDecoder(nn.Module):
        dropout (float): dropout parameters
        embeddings (:obj:`onmt.modules.Embeddings`):
            embeddings to use, should have positional encodings
-        attn_type (str): if using a seperate copy attention
+        attn_type (str): if using a separate copy attention
    """

    def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):

@@ -817,11 +817,7 @@ class Translator(object):

        Args:
            batch (:obj:`Batch`): a batch from a dataset object
-            data (:obj:`Dataset`): the dataset object
-            fast (bool): enables fast beam search (may not support all features)

-        Todo:
-            Shouldn't need the original dataset.
        """
        with torch.no_grad():
            return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
@@ -12,7 +12,7 @@ def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None
    state_dict = torch.load(src_path, map_location=map_location)
    for k, v in tqdm(state_dict.items()):
        if not isinstance(v, torch.Tensor):
-            raise TypeError("FP16 conversion only works on paths that are saved state dics, like pytorch_model.bin")
+            raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
        state_dict[k] = v.half()
    if save_path is None: # overwrite src_path
        save_path = src_path
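Putting the visible pieces of this conversion utility together, a self-contained sketch of the same idea is given below; the function name and the final save step are reconstructed for illustration, not copied verbatim from the script.

```python
import torch
from tqdm import tqdm

def convert_to_fp16(src_path: str, save_path: str = None, map_location: str = "cpu"):
    """Halve every tensor in a saved state dict (e.g. pytorch_model.bin)."""
    state_dict = torch.load(src_path, map_location=map_location)
    for k, v in tqdm(state_dict.items()):
        if not isinstance(v, torch.Tensor):
            raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
        state_dict[k] = v.half()
    if save_path is None:  # overwrite src_path
        save_path = src_path
    torch.save(state_dict, save_path)
```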
@@ -7,7 +7,7 @@ language: ar

**AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT PAPER](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)

-There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).

The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publically available large scale raw arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/))

@@ -7,7 +7,7 @@ language: ar

**AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT PAPER](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)

-There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).

The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publically available large scale raw arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/))

@@ -4,7 +4,7 @@ tags:
---

## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
The training set used to fine-tune this model is the same as
the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,

@@ -34,7 +34,7 @@ model = AutoModelWithLMHead.from_pretrained("jannesg/takalane_afr_roberta")

#### Limitations and bias

-Updates will be added continously to improve performance.
+Updates will be added continuously to improve performance.

## Training data

@@ -94,7 +94,7 @@ fill_mask(PYTHON_CODE3)

> Great! 🎉

-## This work is heavely inspired on [CodeBERTa](https://github.com/huggingface/transformers/blob/master/model_cards/huggingface/CodeBERTa-small-v1/README.md) by huggingface team
+## This work is heavily inspired on [CodeBERTa](https://github.com/huggingface/transformers/blob/master/model_cards/huggingface/CodeBERTa-small-v1/README.md) by huggingface team

<br>

@@ -11,7 +11,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp

- [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora)

-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)

| Dataset | # Examples |
| ---------------------- | ----- |
@@ -65,7 +65,7 @@ Citation:

</details>

-As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:

| Dataset | # samples |
| ----------- | --------- |

@@ -65,7 +65,7 @@ Citation:

</details>

-As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:

| Dataset | # samples |
| ----------- | --------- |

@@ -11,7 +11,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp

- [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora)

-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)

| Dataset | # Examples |
| ---------------------- | ----- |

@@ -11,7 +11,7 @@ This model is a fine-tuned on Spanish [CONLL CORPORA](https://www.kaggle.com/nlt

- [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) with data augmentation techniques

-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)

| Dataset | # Examples |
| ---------------------- | ----- |

@@ -44,7 +44,7 @@ python transformers/examples/question-answering/run_squad.py \
  --save_steps 1000
```

-It is importatnt to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
+It is important to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.

## Test set Results 🧾
@@ -44,7 +44,7 @@ python transformers/examples/question-answering/run_squad.py \
  --version_2_with_negative
```

-It is importatnt to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
+It is important to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.

## Test set Results 🧾

@@ -48,7 +48,7 @@ python code/run_squad.py \
| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |

-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.

## Model in action

@@ -54,7 +54,7 @@ python code/run_squad.py \
| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |

-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.

## Model in action

@@ -45,7 +45,7 @@ python code/run_tacred.py \
| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) |

-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.

> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)

@@ -48,7 +48,7 @@ python code/run_squad.py \
| SpanBERT (large) | **94.6** (this) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |

-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.

## Model in action

@@ -54,7 +54,7 @@ python code/run_squad.py \
| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | **88.7** (this) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |

-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.

## Model in action

@@ -45,7 +45,7 @@ python code/run_tacred.py \
| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | **70.8** (this one) |

-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.

> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql-to-en")

def get_explanation(query):
-  input_text = "translante Sql to English: %s </s>" % query
+  input_text = "translate Sql to English: %s </s>" % query
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'],
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

def get_sql(query):
-  input_text = "translante English to SQL: %s </s>" % query
+  input_text = "translate English to SQL: %s </s>" % query
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'],
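Because the hunk cuts off mid-snippet, here is a hedged, self-contained version of the card's helper so the corrected prompt string can be seen in context; the generation arguments and decoding step are assumptions, not shown in this diff.

```python
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")

def get_sql(query):
    input_text = "translate English to SQL: %s </s>" % query
    features = tokenizer([input_text], return_tensors="pt")
    # Generation settings below are illustrative; the card's original values are not visible here.
    output = model.generate(input_ids=features["input_ids"],
                            attention_mask=features["attention_mask"],
                            max_length=64)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(get_sql("How many models were finetuned using BERT as base model?"))
```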
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-small-finetuned-wikiSQL")
model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-small-finetuned-wikiSQL")

def get_sql(query):
-  input_text = "translante English to SQL: %s </s>" % query
+  input_text = "translate English to SQL: %s </s>" % query
  features = tokenizer([input_text], return_tensors='pt')

  output = model.generate(input_ids=features['input_ids'],

@@ -71,7 +71,7 @@ Citation:

</details>

-As XQuAD is just an evaluation dataset, I used Data augmentation techniques (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As XQuAD is just an evaluation dataset, I used Data augmentation techniques (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:

| Dataset | # samples |
| ----------- | --------- |
@@ -172,7 +172,7 @@ class MemorySummary(NamedTuple):
        `MemorySummary` namedtuple otherwise with the fields:

        - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
-          substracting the memory after executing each line from the memory before executing said line.
+          subtracting the memory after executing each line from the memory before executing said line.
        - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
          obtained by summing repeated memory increase for a line if it's executed several times. The list is sorted
          from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory

@@ -208,7 +208,7 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i

    Returns:

-        - `max_memory`: (`int`) cosumed memory peak in Bytes
+        - `max_memory`: (`int`) consumed memory peak in Bytes
    """

    def get_cpu_memory(process_id: int) -> int:

@@ -221,7 +221,7 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i

        Returns

-            - `memory`: (`int`) cosumed memory in Bytes
+            - `memory`: (`int`) consumed memory in Bytes
        """
        process = psutil.Process(process_id)
        try:

@@ -367,7 +367,7 @@ def start_memory_tracing(
            devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
            nvml.nvmlShutdown()
        except (OSError, nvml.NVMLError):
-            logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
+            logger.warning("Error while initializing communication with GPU. " "We won't perform GPU memory tracing.")
            log_gpu = False
        else:
            log_gpu = is_torch_available() or is_tf_available()

@@ -472,9 +472,10 @@ def stop_memory_tracing(

    Args:

-        - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
-        - `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total
-          memory
+        `memory_trace` (optional output of start_memory_tracing, default: None):
+            memory trace to convert in summary
+        `ignore_released_memory` (boolean, default: None):
+            if True we only sum memory increase to compute total memory

    Return:

@@ -482,7 +483,7 @@ def stop_memory_tracing(
        - `MemorySummary` namedtuple otherwise with the fields:

            - `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
-              substracting the memory after executing each line from the memory before executing said line.
+              subtracting the memory after executing each line from the memory before executing said line.
            - `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each
              line obtained by summing repeated memory increase for a line if it's executed several times. The list is
              sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative
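These benchmark-utility hunks all touch the memory-tracing helpers; a hedged usage sketch follows. The module path and the ``total`` field are assumptions about where these helpers live in this era of the library, not details shown in the diff.

```python
# Hedged sketch: import path assumed, not taken from the diff.
from transformers.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing

trace = start_memory_tracing("transformers")  # trace lines executed inside the `transformers` package
# ... run the code whose memory you want to profile ...
summary = stop_memory_tracing(trace)
if summary is not None:
    print(summary.total)       # overall memory increase (assumed field)
    print(summary.sequential)  # per-line MemoryState entries, as described above
```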
@@ -41,7 +41,7 @@ class ConvertCommand(BaseTransformersCLICommand):
            "--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
        )
        train_parser.add_argument(
-            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output."
+            "--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
        )
        train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
        train_parser.add_argument(

@@ -61,7 +61,7 @@ class BartConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
@@ -76,7 +76,7 @@ class BertConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
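The parameter descriptions corrected in this and the neighbouring config hunks map directly onto constructor arguments; a minimal example with ``BertConfig`` (values shown are the documented defaults):

```python
from transformers import BertConfig, BertModel

# The corrected docstring entries correspond to constructor arguments like these.
config = BertConfig(hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512)
model = BertModel(config)
```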
@@ -42,7 +42,7 @@ class BertGenerationConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):

@@ -62,7 +62,7 @@ class BertGenerationConfig(PretrainedConfig):
        >>> # Initializing a BertGeneration config
        >>> configuration = BertGenerationConfig()

-        >>> # Initializing a modelfrom the config
+        >>> # Initializing a model from the config
        >>> model = BertGenerationEncoder(configuration)

        >>> # Accessing the model configuration

@@ -58,7 +58,7 @@ class BlenderbotConfig(BartConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -55,7 +55,7 @@ class DebertaConfig(PretrainedConfig):
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
            :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):

@@ -61,7 +61,7 @@ class DistilBertConfig(PretrainedConfig):
        hidden_dim (:obj:`int`, `optional`, defaults to 3072):
            The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):

@@ -57,7 +57,7 @@ class DPRConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):

@@ -62,7 +62,7 @@ class ElectraConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):

@@ -59,11 +59,11 @@ class FlaubertConfig(XLMConfig):
        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for the attention mechanism
        gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to use a `gelu` actibation instead of `relu`.
+            Whether or not to use a `gelu` activation instead of `relu`.
        sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
        causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model shoul behave in a causal manner. Causal models use a triangular attention mask in
+            Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
            order to only attend to the left-side context instead if a bidirectional context.
        asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction

@@ -73,7 +73,7 @@ class FSMTConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -68,7 +68,7 @@ class FunnelConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -54,7 +54,7 @@ class LayoutLMConfig(BertConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):

@@ -57,7 +57,7 @@ class LxmertConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
@@ -95,10 +95,9 @@ class LxmertConfig(PretrainedConfig):
            Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
            objective.
        task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to add object predicition, attribute predicition and feature regression to the loss
-            objective.
+            Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
        task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to add the question-asnwering loss to the objective
+            Whether or not to add the question-answering loss to the objective
        visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to calculate the object-prediction loss objective
        visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):

@@ -106,10 +105,10 @@ class LxmertConfig(PretrainedConfig):
        visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to calculate the feature-regression loss objective
        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model should return the attentions from the vision, langauge, and cross-modality layers
+            Whether or not the model should return the attentions from the vision, language, and cross-modality layers
            should be returned.
        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model should return the hidden states from the vision, langauge, and cross-modality
+            Whether or not the model should return the hidden states from the vision, language, and cross-modality
            layers should be returned.
    """
@@ -52,7 +52,7 @@ class MarianConfig(BartConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -57,7 +57,7 @@ class MBartConfig(BartConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -96,7 +96,7 @@ class PegasusConfig(BartConfig):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):

@@ -60,7 +60,7 @@ class ProphetNetConfig(PretrainedConfig):
        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).

@@ -30,7 +30,7 @@ RAG_CONFIG_DOC = r"""
            Separator inserted between the title and the text of the retrieved document when calling
            :class:`~transformers.RagRetriever`.
        doc_sep (:obj:`str`, `optional`, defaults to ``" // "``):
-            Separator inserted between the the text of the retrieved document and the original input when calliang
+            Separator inserted between the the text of the retrieved document and the original input when calling
            :class:`~transformers.RagRetriever`.
        n_docs (:obj:`int`, `optional`, defaults to 5):
            Number of documents to retrieve.

@@ -39,7 +39,7 @@ RAG_CONFIG_DOC = r"""
        retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
        retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
-            Retrieval batch size, defined as the number of queries issues concurrently to the faiss index excapsulated
+            Retrieval batch size, defined as the number of queries issues concurrently to the faiss index encapsulated
            :class:`~transformers.RagRetriever`.
        dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
            A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids

@@ -82,7 +82,7 @@ class ReformerConfig(PretrainedConfig):
            The non-linear activation function (function or string) in the feed forward layer in the residual attention
            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
-            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        hidden_size (:obj:`int`, `optional`, defaults to 256):
            Dimensionality of the output hidden states of the residual attention blocks.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
@ -20,7 +20,7 @@ from .utils import logging
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
# TODO: uploadto AWS
|
||||
# TODO: upload to AWS
|
||||
RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
||||
}
|
||||
@ -51,7 +51,7 @@ class RetriBertConfig(PretrainedConfig):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||
|
@ -52,7 +52,7 @@ class SqueezeBertConfig(PretrainedConfig):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout ratio for the attention probabilities.
|
||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||
|
@ -77,7 +77,7 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not to use adaptive softmax.
|
||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
dropatt (:obj:`float`, `optional`, defaults to 0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
|
||||
|
@ -83,7 +83,7 @@ def generate_identified_filename(filename: Path, identifier: str) -> Path:
|
||||
filename: pathlib.Path The actual path object we would like to add an identifier suffix
|
||||
identifier: The suffix to add
|
||||
|
||||
Returns: String with concatenated indentifier at the end of the filename
|
||||
Returns: String with concatenated identifier at the end of the filename
|
||||
"""
|
||||
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
|
||||
|
||||
|
@ -30,7 +30,7 @@ class LightningModel(pl.LightningModule):
|
||||
self.num_labels = 2
|
||||
self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
|
||||
|
||||
# implement only because lighning requires to do so
|
||||
# implement only because lightning requires to do so
|
||||
def forward(self):
|
||||
pass
|
||||
|
||||
@ -57,7 +57,7 @@ def convert_longformer_qa_checkpoint_to_pytorch(
|
||||
# save model
|
||||
longformer_for_qa.save_pretrained(pytorch_dump_folder_path)
|
||||
|
||||
print("Conversion succesful. Model saved under {}".format(pytorch_dump_folder_path))
|
||||
print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -75,7 +75,7 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path the official PyTorch Lighning Checkpoint.",
|
||||
help="Path the official PyTorch Lightning Checkpoint.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
|
||||
|
@ -34,7 +34,7 @@ class TatoebaConverter:
|
||||
|
||||
1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
|
||||
2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
|
||||
one existes. e.g. aav-eng -> aav-en, heb-eng -> he-en
|
||||
one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en
|
||||
3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group
|
||||
members.
|
||||
"""
|
||||
|
@ -123,7 +123,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--force_download",
|
||||
action="store_true",
|
||||
help="Re-dowload checkpoints.",
|
||||
help="Re-download checkpoints.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
@ -104,7 +104,7 @@ if __name__ == "__main__":
|
||||
"--finetuning_task",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Name of a task on which the XLNet TensorFloaw model was fine-tuned",
|
||||
help="Name of a task on which the XLNet TensorFlow model was fine-tuned",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
@ -330,7 +330,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
||||
input_ids, labels, attention_mask = self.mask_tokens(input_ids)
|
||||
|
||||
token_type_ids = [example["token_type_ids"] for example in examples]
|
||||
# size of segment_ids varied because randomness, padding zero to the end as the orignal implementation
|
||||
# size of segment_ids varied because randomness, padding zero to the end as the original implementation
|
||||
token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
|
||||
|
||||
sop_label_list = [example["sentence_order_label"] for example in examples]
|
||||
|
@ -71,7 +71,7 @@ class TextDataset(Dataset):
|
||||
tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
|
||||
)
|
||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||
# If your dataset is small, first you should loook for a bigger one :-) and second you
|
||||
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||
# can change this behavior by adding (model specific) padding.
|
||||
|
||||
start = time.time()
|
||||
|
@ -327,7 +327,7 @@ def squad_convert_examples_to_features(
|
||||
padding_strategy: Default to "max_length". Which padding strategy to use
|
||||
return_dataset: Default False. Either 'pt' or 'tf'.
|
||||
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
|
||||
threads: multiple processing threadsa-smi
|
||||
threads: multiple processing threads.
|
||||
|
||||
|
||||
Returns:
|
||||
@ -527,7 +527,7 @@ def squad_convert_examples_to_features(
|
||||
|
||||
class SquadProcessor(DataProcessor):
|
||||
"""
|
||||
Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
|
||||
Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
|
||||
version 2.0 of SQuAD, respectively.
|
||||
"""
|
||||
|
||||
|
@ -245,9 +245,6 @@ class SingleSentenceClassificationProcessor(DataProcessor):
|
||||
Args:
|
||||
tokenizer: Instance of a tokenizer that will tokenize the examples
|
||||
max_length: Maximum example length
|
||||
task: GLUE task
|
||||
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
|
||||
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
|
||||
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
|
||||
pad_token: Padding token
|
||||
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
|
||||
|
@ -89,7 +89,7 @@ try:
|
||||
# Check we're not importing a "datasets" directory somewhere
|
||||
_datasets_available = hasattr(datasets, "__version__") and hasattr(datasets, "load_dataset")
|
||||
if _datasets_available:
|
||||
logger.debug(f"Succesfully imported datasets version {datasets.__version__}")
|
||||
logger.debug(f"Successfully imported datasets version {datasets.__version__}")
|
||||
else:
|
||||
logger.debug("Imported a datasets object but this doesn't seem to be the 🤗 datasets library.")
|
||||
|
||||
@ -147,7 +147,7 @@ try:
|
||||
import faiss # noqa: F401
|
||||
|
||||
_faiss_available = True
|
||||
logger.debug(f"Succesfully imported faiss version {faiss.__version__}")
|
||||
logger.debug(f"Successfully imported faiss version {faiss.__version__}")
|
||||
except ImportError:
|
||||
_faiss_available = False
|
||||
|
||||
@ -290,7 +290,7 @@ def torch_only_method(fn):
|
||||
|
||||
# docstyle-ignore
|
||||
DATASETS_IMPORT_ERROR = """
|
||||
{0} requires the 🤗 Datasets library but it was not found in your enviromnent. You can install it with:
|
||||
{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
|
||||
```
|
||||
pip install datasets
|
||||
```
|
||||
@ -308,7 +308,7 @@ that python file if that's the case.
|
||||
|
||||
# docstyle-ignore
|
||||
TOKENIZERS_IMPORT_ERROR = """
|
||||
{0} requires the 🤗 Tokenizers library but it was not found in your enviromnent. You can install it with:
|
||||
{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
|
||||
```
|
||||
pip install tokenizers
|
||||
```
|
||||
@ -321,30 +321,30 @@ In a notebook or a colab, you can install it by executing a cell with
|
||||
|
||||
# docstyle-ignore
|
||||
SENTENCEPIECE_IMPORT_ERROR = """
|
||||
{0} requires the SentencePiece library but it was not found in your enviromnent. Checkout the instructions on the
|
||||
{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
|
||||
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
|
||||
that match your enviromnent.
|
||||
that match your environment.
|
||||
"""
|
||||
|
||||
|
||||
# docstyle-ignore
|
||||
FAISS_IMPORT_ERROR = """
|
||||
{0} requires the faiss library but it was not found in your enviromnent. Checkout the instructions on the
|
||||
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
|
||||
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
|
||||
that match your enviromnent.
|
||||
that match your environment.
|
||||
"""
|
||||
|
||||
|
||||
# docstyle-ignore
|
||||
PYTORCH_IMPORT_ERROR = """
|
||||
{0} requires the PyTorch library but it was not found in your enviromnent. Checkout the instructions on the
|
||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your enviromnent.
|
||||
{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
|
||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
||||
"""
|
||||
|
||||
|
||||
# docstyle-ignore
|
||||
SKLEARN_IMPORT_ERROR = """
|
||||
{0} requires the scikit-learn library but it was not found in your enviromnent. You can install it with:
|
||||
{0} requires the scikit-learn library but it was not found in your environment. You can install it with:
|
||||
```
|
||||
pip install -U scikit-learn
|
||||
```
|
||||
@ -357,15 +357,15 @@ In a notebook or a colab, you can install it by executing a cell with
|
||||
|
||||
# docstyle-ignore
|
||||
TENSORFLOW_IMPORT_ERROR = """
|
||||
{0} requires the TensorFlow library but it was not found in your enviromnent. Checkout the instructions on the
|
||||
installation page: https://www.tensorflow.org/install and follow the ones that match your enviromnent.
|
||||
{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
|
||||
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
|
||||
"""
|
||||
|
||||
|
||||
# docstyle-ignore
|
||||
FLAX_IMPORT_ERROR = """
|
||||
{0} requires the FLAX library but it was not found in your enviromnent. Checkout the instructions on the
|
||||
installation page: https://github.com/google/flax and follow the ones that match your enviromnent.
|
||||
{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
|
||||
installation page: https://github.com/google/flax and follow the ones that match your environment.
|
||||
"""
|
||||
|
||||
|
||||
@ -918,13 +918,13 @@ def cached_path(
|
||||
|
||||
Args:
|
||||
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
||||
force_download: if True, re-dowload the file even if it's already cached in the cache dir.
|
||||
resume_download: if True, resume the download if incompletly recieved file is found.
|
||||
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
||||
resume_download: if True, resume the download if incompletely received file is found.
|
||||
user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
|
||||
extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed
|
||||
file in a folder along the archive.
|
||||
force_extract: if True when extract_compressed_file is True and the archive was already extracted,
|
||||
re-extract the archive and overide the folder where it was extracted.
|
||||
re-extract the archive and override the folder where it was extracted.
|
||||
|
||||
Return:
|
||||
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
|
||||
|
@ -25,14 +25,14 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
class TFGenerationMixin:
|
||||
"""
|
||||
A class contraining all of the functions supporting generation, to be used as a mixin in
|
||||
:class:`~transfomers.TFPreTrainedModel`.
|
||||
A class containing all of the functions supporting generation, to be used as a mixin in
|
||||
:class:`~transformers.TFPreTrainedModel`.
|
||||
"""
|
||||
|
||||
def prepare_inputs_for_generation(self, inputs, **kwargs):
|
||||
"""
|
||||
Implement in subclasses of :class:`~transfomers.TFPreTrainedModel` for custom behavior to prepare inputs in the
|
||||
generate method.
|
||||
Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
|
||||
the generate method.
|
||||
"""
|
||||
return {"inputs": inputs}
|
||||
|
||||
@ -216,17 +216,17 @@ class TFGenerationMixin:
|
||||
)
|
||||
|
||||
if input_ids is not None:
|
||||
batch_size = shape_list(input_ids)[0] # overriden by the input batch_size
|
||||
batch_size = shape_list(input_ids)[0] # overridden by the input batch_size
|
||||
else:
|
||||
batch_size = 1
|
||||
|
||||
assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
|
||||
assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
|
||||
assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
|
||||
assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
|
||||
assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
|
||||
assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
|
||||
assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
|
||||
assert temperature > 0, "`temperature` should be strictely positive."
|
||||
assert temperature > 0, "`temperature` should be strictly positive."
|
||||
assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
|
||||
assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
|
||||
assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
|
||||
@ -239,10 +239,10 @@ class TFGenerationMixin:
|
||||
assert (eos_token_id is None) or (
|
||||
isinstance(eos_token_id, int) and (eos_token_id >= 0)
|
||||
), "`eos_token_id` should be a positive integer."
|
||||
assert length_penalty > 0, "`length_penalty` should be strictely positive."
|
||||
assert length_penalty > 0, "`length_penalty` should be strictly positive."
|
||||
assert (
|
||||
isinstance(num_return_sequences, int) and num_return_sequences > 0
|
||||
), "`num_return_sequences` should be a strictely positive integer."
|
||||
), "`num_return_sequences` should be a strictly positive integer."
|
||||
assert (
|
||||
bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
|
||||
), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
|
||||
@ -722,7 +722,7 @@ class TFGenerationMixin:
|
||||
beam_scores[:, None], (batch_size * num_beams, vocab_size)
|
||||
) # (batch_size * num_beams, vocab_size)
|
||||
|
||||
# re-organize to group the beam together (we are keeping top hypothesis accross beams)
|
||||
# re-organize to group the beam together (we are keeping top hypothesis across beams)
|
||||
next_scores = tf.reshape(
|
||||
next_scores, (batch_size, num_beams * vocab_size)
|
||||
) # (batch_size, num_beams * vocab_size)
|
||||
@ -897,7 +897,7 @@ class TFGenerationMixin:
|
||||
|
||||
def adjust_logits_during_generation(self, logits, **kwargs):
|
||||
"""
|
||||
Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
|
||||
Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
|
||||
the generate method.
|
||||
"""
|
||||
return logits
|
||||
@ -978,7 +978,7 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):
|
||||
|
||||
def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
|
||||
"""
|
||||
Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
|
||||
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
|
||||
|
||||
Args:
|
||||
logits: logits distribution shape (batch size, vocabulary size)
|
||||
@ -1047,7 +1047,7 @@ def set_tensor_by_indices_to_value(tensor, indices, value):
|
||||
|
||||
def sample_without_replacement(logits, num_samples):
|
||||
"""
|
||||
categorical sampling witouth replacement is currently not implemented the gumbel-max trick will do for now see
|
||||
categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see
|
||||
https://github.com/tensorflow/tensorflow/issues/9260 for more info
|
||||
"""
|
||||
z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))
|
||||
|
@ -29,20 +29,20 @@ logger = logging.get_logger(__name__)
|
||||
|
||||
class GenerationMixin:
|
||||
"""
|
||||
A class contraining all of the functions supporting generation, to be used as a mixin in
|
||||
:class:`~transfomers.PreTrainedModel`.
|
||||
A class containing all of the functions supporting generation, to be used as a mixin in
|
||||
:class:`~transformers.PreTrainedModel`.
|
||||
"""
|
||||
|
||||
def prepare_inputs_for_generation(self, input_ids, **kwargs):
|
||||
"""
|
||||
Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to prepare inputs in the
|
||||
Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
|
||||
generate method.
|
||||
"""
|
||||
return {"input_ids": input_ids}
|
||||
|
||||
def adjust_logits_during_generation(self, logits, **kwargs):
|
||||
"""
|
||||
Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
|
||||
Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
|
||||
the generate method.
|
||||
"""
|
||||
return logits
|
||||
@ -285,7 +285,7 @@ class GenerationMixin:
|
||||
)
|
||||
|
||||
if input_ids is not None:
|
||||
batch_size = input_ids.shape[0] # overriden by the input batch_size
|
||||
batch_size = input_ids.shape[0] # overridden by the input batch_size
|
||||
else:
|
||||
batch_size = 1
|
||||
|
||||
@ -533,7 +533,7 @@ class GenerationMixin:
|
||||
):
|
||||
"""
|
||||
Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
|
||||
independantly.
|
||||
independently.
|
||||
"""
|
||||
# length of generated sentences / unfinished sentences
|
||||
unfinished_sents = input_ids.new(batch_size).fill_(1)
|
||||
@ -600,7 +600,7 @@ class GenerationMixin:
|
||||
# unfinished_sents is set to zero if eos in sentence
|
||||
unfinished_sents.mul_((~eos_in_sents).long())
|
||||
|
||||
# stop when there is a </s> in each sentence, or if we exceed the maximul length
|
||||
# stop when there is a </s> in each sentence, or if we exceed the maximum length
|
||||
if unfinished_sents.max() == 0:
|
||||
break
|
||||
|
||||
@ -724,7 +724,7 @@ class GenerationMixin:
|
||||
else:
|
||||
next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
|
||||
|
||||
# re-organize to group the beam together (we are keeping top hypothesis accross beams)
|
||||
# re-organize to group the beam together (we are keeping top hypothesis across beams)
|
||||
next_scores = next_scores.view(
|
||||
batch_size, num_beams * vocab_size
|
||||
) # (batch_size, num_beams * vocab_size)
|
||||
@ -969,7 +969,7 @@ def top_k_top_p_filtering(
|
||||
min_tokens_to_keep: int = 1,
|
||||
) -> Tensor:
|
||||
"""
|
||||
Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
|
||||
Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
|
||||
|
||||
Args:
|
||||
logits: logits distribution shape (batch size, vocabulary size)
|
||||
|
@ -49,7 +49,7 @@ class ModelCard:
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
# Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
|
||||
# Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
|
||||
self.model_details = kwargs.pop("model_details", {})
|
||||
self.intended_use = kwargs.pop("intended_use", {})
|
||||
self.factors = kwargs.pop("factors", {})
|
||||
|
@ -488,7 +488,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
|
||||
model_args (additional positional arguments, `optional`):
|
||||
Will be passed along to the underlying model ``__init__()`` method.
|
||||
config (:class:`~transformers.PretrainedConfig`, `optional`):
|
||||
Configuration for the model to use instead of an automatically loaded configuation. Configuration can
|
||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
|
||||
be automatically loaded when:
|
||||
|
||||
- The model is a model provided by the library (loaded with the `shortcut name` string of a
|
||||
@ -522,7 +522,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
|
||||
output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
|
||||
local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to only look at local files (e.g., not try doanloading the model).
|
||||
Whether or not to only look at local files (e.g., not try downloading the model).
|
||||
use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
|
||||
our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.
|
||||
@ -1424,7 +1424,7 @@ class AutoModelForTokenClassification:
|
||||
class AutoModelForMultipleChoice:
|
||||
r"""
|
||||
This is a generic model class that will be instantiated as one of the model classes of the library---with a
|
||||
multiple choice classifcation head---when created with the when created with the
|
||||
multiple choice classification head---when created with the when created with the
|
||||
:meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` class method or the
|
||||
:meth:`~transformers.AutoModelForMultipleChoice.from_config` class method.
|
||||
|
||||
|
@ -906,7 +906,7 @@ class BartModel(PretrainedBartModel):
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
|
||||
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
|
||||
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
|
||||
encoder_outputs = BaseModelOutput(
|
||||
last_hidden_state=encoder_outputs[0],
|
||||
|
@ -69,8 +69,8 @@ class XSoftmax(torch.autograd.Function):
|
||||
|
||||
Args:
|
||||
input (:obj:`torch.tensor`): The input tensor that will apply softmax.
|
||||
mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
|
||||
dim (int): The dimenssion that will apply softmax
|
||||
mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
|
||||
dim (int): The dimension that will apply softmax
|
||||
|
||||
Example::
|
||||
import torch
|
||||
@ -540,16 +540,16 @@ class DisentangledSelfAttention(torch.nn.Module):
|
||||
|
||||
Args:
|
||||
hidden_states (:obj:`torch.FloatTensor`):
|
||||
Input states to the module usally the output from previous layer, it will be the Q,K and V in
|
||||
Input states to the module usually the output from previous layer, it will be the Q,K and V in
|
||||
`Attention(Q,K,V)`
|
||||
|
||||
attention_mask (:obj:`torch.ByteTensor`):
|
||||
An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium
|
||||
An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
|
||||
sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
|
||||
th token.
|
||||
|
||||
return_att (:obj:`bool`, optional):
|
||||
Whether return the attention maxitrix.
|
||||
Whether return the attention matrix.
|
||||
|
||||
query_states (:obj:`torch.FloatTensor`, optional):
|
||||
The `Q` state in `Attention(Q,K,V)`.
|
||||
@ -627,7 +627,7 @@ class DisentangledSelfAttention(torch.nn.Module):
|
||||
relative_pos = relative_pos.unsqueeze(1)
|
||||
# bxhxqxk
|
||||
elif relative_pos.dim() != 4:
|
||||
raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
|
||||
raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
|
||||
|
||||
att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
|
||||
relative_pos = relative_pos.long().to(query_layer.device)
|
||||
@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
|
||||
The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
|
||||
<https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
|
||||
BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
|
||||
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-trianing data.
|
||||
improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.
|
||||
|
||||
This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
|
||||
subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
|
||||
|
@ -290,7 +290,7 @@ class Transformer(nn.Module):
|
||||
attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.
|
||||
|
||||
Returns:
|
||||
hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top)
|
||||
hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
|
||||
layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
|
||||
Tuple of length n_layers with the hidden states from each layer.
|
||||
Optional: only if output_hidden_states=True
|
||||
|
@ -418,7 +418,7 @@ DPR_READER_INPUTS_DOCSTRING = r"""
|
||||
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (:obj:`bool`, `optional`):
|
||||
Whether or not to rturn the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
|
||||
more detail.
|
||||
return_dict (:obj:`bool`, `optional`):
|
||||
Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
|
||||
|
@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
|
||||
_CONFIG_FOR_DOC = "EncoderDecoderConfig"
|
||||
|
||||
ENCODER_DECODER_START_DOCSTRING = r"""
|
||||
This class can be used to inialize a sequence-to-sequnece model with any pretrained autoencoding model as the
|
||||
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
|
||||
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
|
||||
:meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
|
||||
:meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added
|
||||
|
@ -99,7 +99,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
|
||||
`What are position IDs? <../glossary.html#position-ids>`_
|
||||
lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
|
||||
Length of each sentence that can be used to avoid performing attention on padding token indices. You can
|
||||
also use :obj:`attention_mask` for the same result (see above), kept here for compatbility. Indices
|
||||
also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
|
||||
selected in ``[0, ..., input_ids.size(-1)]``:
|
||||
cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
|
||||
Dictionary strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the
|
||||
|
@ -124,18 +124,18 @@ class FlaxAutoModel(object):
|
||||
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
|
||||
|
||||
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
|
||||
Configuration for the model to use instead of an automatically loaded configuation. Configuration can
|
||||
Configuration for the model to use instead of an automatically loaded configuration. Configuration can
|
||||
be automatically loaded when:
|
||||
|
||||
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
|
||||
pretrained model), or
|
||||
- the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded
|
||||
by suppling the save directory.
|
||||
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
|
||||
by supplying the save directory.
|
||||
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
|
||||
configuration JSON file named `config.json` is found in the directory.
|
||||
|
||||
state_dict: (`optional`) dict:
|
||||
an optional state dictionnary for the model to use instead of a state dictionary loaded from saved
|
||||
an optional state dictionary for the model to use instead of a state dictionary loaded from saved
|
||||
weights file. This option can be used if you want to create a model from a pretrained configuration but
|
||||
load your own weights. In this case though, you should check if using
|
||||
:func:`~transformers.FlaxPreTrainedModel.save_pretrained` and
|
||||
@ -150,14 +150,14 @@ class FlaxAutoModel(object):
|
||||
they exists.
|
||||
|
||||
resume_download: (`optional`) boolean, default False:
|
||||
Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
|
||||
Do not delete incompletely received file. Attempt to resume the download if such a file exists.
|
||||
|
||||
proxies: (`optional`) dict, default None:
|
||||
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
|
||||
'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
|
||||
|
||||
output_loading_info: (`optional`) boolean:
|
||||
Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error
|
||||
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error
|
||||
messages.
|
||||
|
||||
kwargs: (`optional`) Remaining dictionary of keyword arguments:
|
||||
|
@ -64,7 +64,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **maked**.
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
|
||||
|
@ -226,7 +226,7 @@ class FunnelAttentionStructure(nn.Module):
|
||||
d_model = self.config.d_model
|
||||
if self.config.attention_type == "factorized":
|
||||
# Notations from the paper, appending A.2.2, final formula.
|
||||
# We need to create and return the matrics phi, psi, pi and omega.
|
||||
# We need to create and return the matrices phi, psi, pi and omega.
|
||||
pos_seq = torch.arange(0, seq_len, 1.0, dtype=dtype, device=device)
|
||||
freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=dtype, device=device)
|
||||
inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
|
||||
@ -1226,7 +1226,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
|
||||
|
||||
@add_start_docstrings(
|
||||
"""
|
||||
Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of the
|
||||
Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
|
||||
first timestep of the last hidden state) e.g. for GLUE tasks.
|
||||
""",
|
||||
FUNNEL_START_DOCSTRING,
|
||||
|
@ -588,7 +588,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
|
||||
# If a 2D ou 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.add_cross_attention and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
@ -708,7 +708,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
position_ids = kwargs.get("position_ids", None)
|
||||
|
||||
if attention_mask is not None and position_ids is None:
|
||||
# create postion_ids on the fly for batch generation
|
||||
# create position_ids on the fly for batch generation
|
||||
position_ids = attention_mask.long().cumsum(-1) - 1
|
||||
position_ids.masked_fill_(attention_mask == 0, 1)
|
||||
if past:
|
||||
@ -1050,7 +1050,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
|
||||
sequence_lengths = -1
|
||||
logger.warning(
|
||||
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
|
||||
f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
|
||||
f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
|
||||
)
|
||||
|
||||
pooled_logits = logits[range(batch_size), sequence_lengths]
|
||||
|
@ -382,7 +382,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
# batch_size x num_heads x max_num_global_attention_tokens x sequence_length
|
||||
# which is the attention weights from tokens with global attention to all tokens
|
||||
# It doesn't not return local attention
|
||||
# In case of variable number of global attantion in the rows of a batch,
|
||||
# In case of variable number of global attention in the rows of a batch,
|
||||
# attn_probs are padded with -10000.0 attention scores
|
||||
attn_probs = attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
|
||||
else:
|
||||
@ -416,7 +416,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
-0.7584, 0.4206, -0.0405, 0.1599,
|
||||
2.0514, -1.1600, 0.5372, 0.2629 ]
|
||||
window_overlap = num_rows = 4
|
||||
(pad & diagonilize) =>
|
||||
(pad & diagonalize) =>
|
||||
[ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
|
||||
0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
|
||||
0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000
|
||||
@ -440,7 +440,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
|
||||
@staticmethod
|
||||
def _chunk(hidden_states, window_overlap):
|
||||
"""convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
|
||||
"""convert into overlapping chunks. Chunk size = 2w, overlap size = w"""
|
||||
|
||||
# non-overlapping chunks of size = 2w
|
||||
hidden_states = hidden_states.view(
|
||||
@ -491,7 +491,7 @@ class LongformerSelfAttention(nn.Module):
|
||||
chunked_query = self._chunk(query, window_overlap)
|
||||
chunked_key = self._chunk(key, window_overlap)
|
||||
|
||||
# matrix multipication
|
||||
# matrix multiplication
|
||||
# bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
|
||||
# bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
|
||||
# bcxy: batch_size * num_heads x chunks x 2window_overlap x window_overlap
|
||||
@ -1030,7 +1030,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
|
||||
Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
|
||||
Mask to decide the attention given on each token, local attention or global attention. Tokens with global
|
||||
attention attends to all other tokens, and all other tokens attend to them. This is important for
|
||||
task-specific finetuning because it makes the model more flexible at representing the task. For example,
|
||||
for classification, the <s> token should be given global attention. For QA, all question tokens should also
|
||||
|
@ -58,7 +58,7 @@ class GeLU(nn.Module):
|
||||
@dataclass
|
||||
class LxmertModelOutput(ModelOutput):
|
||||
"""
|
||||
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
|
||||
Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
|
||||
visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
|
||||
encoder")
|
||||
|
||||
@ -405,7 +405,7 @@ class LxmertSelfAttentionLayer(nn.Module):
|
||||
self.output = LxmertAttentionOutput(config)
|
||||
|
||||
def forward(self, input_tensor, attention_mask, output_attentions=False):
|
||||
# Self attention attends to itself, thus keys and querys are the same (input_tensor).
|
||||
# Self attention attends to itself, thus keys and queries are the same (input_tensor).
|
||||
output = self.self(
|
||||
input_tensor,
|
||||
input_tensor,
|
||||
@ -799,7 +799,7 @@ LXMERT_START_DOCSTRING = r"""
|
||||
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
|
||||
pretrained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
|
||||
using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
|
||||
question answering attribute prediction, and object tag predicition.
|
||||
question answering attribute prediction, and object tag prediction.
|
||||
|
||||
This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
|
||||
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
|
||||
@ -1076,12 +1076,10 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
|
||||
will add newly initialized weights. Reducing the size will remove weights from the end
|
||||
|
||||
Args:
|
||||
cur_qa_logit_layer (:obj:`torch.nn.Linear`):
|
||||
Old linear layer to be resized.
|
||||
num_labels (:obj:`int`, `optional`):
|
||||
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
|
||||
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
|
||||
just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
|
||||
just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
|
||||
anything.
|
||||
|
||||
Return:
|
||||
@ -1298,12 +1296,10 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
|
||||
will add newly initialized weights. Reducing the size will remove weights from the end
|
||||
|
||||
Args:
|
||||
cur_qa_logit_layer (:obj:`torch.nn.Linear`):
|
||||
Old linear layer to be resized.
|
||||
num_labels (:obj:`int`, `optional`):
|
||||
New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
|
||||
weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
|
||||
just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
|
||||
just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
|
||||
anything.
|
||||
|
||||
Return:
|
||||
|
@ -887,7 +887,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
|
||||
)
|
||||
|
||||
# If a 2D ou 3D attention mask is provided for the cross-attention
|
||||
# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
|
||||
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
||||
if self.config.is_decoder and encoder_hidden_states is not None:
|
||||
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
|
||||
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
||||
|
@ -40,7 +40,7 @@ class RetrievAugLMMarginOutput(ModelOutput):
|
||||
|
||||
Args:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Languaged modeling loss.
|
||||
Language modeling loss.
|
||||
logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
|
||||
each vocabulary token.
|
||||
@ -413,7 +413,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
|
||||
|
||||
Used by the (:class:`~transformers.RagModel`) model during decoding.
|
||||
decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Provide for generation tasks. `None` by default, constuct as per instructions for the generator model
|
||||
Provide for generation tasks. `None` by default, construct as per instructions for the generator model
|
||||
you're using with your RAG instance.
|
||||
decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
|
||||
Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
|
||||
@ -424,7 +424,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
|
||||
:obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
|
||||
decoding.
|
||||
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
|
||||
Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
|
||||
Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
|
||||
:obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
|
||||
:obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
|
||||
:obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more
|
||||
@ -660,7 +660,7 @@ class RagModel(RagPreTrainedModel):
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
"""
|
||||
A RAG-sequence model impementation. It performs RAG-sequence specific marginalization in the forward pass.
|
||||
A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
|
||||
""",
|
||||
RAG_START_DOCSTRING,
|
||||
)
|
||||
@ -736,7 +736,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
|
||||
>>> input_ids = input_dict["input_ids"]
|
||||
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
|
||||
|
||||
>>> # or use retriever seperately
|
||||
>>> # or use retriever separately
|
||||
>>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
|
||||
>>> # 1. Encode
|
||||
>>> question_hidden_states = model.question_encoder(input_ids)[0]
|
||||
@ -940,13 +940,13 @@ class RagSequenceForGeneration(RagPreTrainedModel):
|
||||
) # batch_size x n_docs x tgt_len x dim
|
||||
doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)
|
||||
|
||||
# RAG-sequence marginaliation
|
||||
# RAG-sequence marginalization
|
||||
first_token_scores = seq_logprobs[:, :, :1, :]
|
||||
second_token_scores = seq_logprobs[:, :, 1:2, :]
|
||||
remainder = seq_logprobs[:, :, 2:, :]
|
||||
rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)
|
||||
|
||||
# calcualate loss
|
||||
# calculate loss
|
||||
target = target.unsqueeze(1).unsqueeze(-1).repeat(1, n_docs, 1, 1)
|
||||
assert target.dim() == rag_logprobs.dim()
|
||||
|
||||
@ -986,7 +986,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
|
||||
|
||||
@add_start_docstrings_to_model_forward(
|
||||
"""
|
||||
A RAG-token model impementation. It performs RAG-token specific marginalization in the forward pass.
|
||||
A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
|
||||
""",
|
||||
RAG_START_DOCSTRING,
|
||||
)
|
||||
@ -1129,7 +1129,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
|
||||
>>> input_ids = input_dict["input_ids"]
|
||||
>>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])
|
||||
|
||||
>>> # or use retriever seperately
|
||||
>>> # or use retriever separately
|
||||
>>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
|
||||
>>> # 1. Encode
|
||||
>>> question_hidden_states = model.question_encoder(input_ids)[0]
|
||||
@ -1257,7 +1257,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
|
||||
to the forward pass. :obj:`context_input_ids` are returned by
|
||||
:meth:`~transformers.RagRetriever.__call__`.
|
||||
doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
|
||||
Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
|
||||
Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
|
||||
:obj:`question_encoder_last_hidden_state`.
|
||||
|
||||
If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided
|
||||
|
@ -986,7 +986,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
|
||||
class ReverseSort(Function):
|
||||
"""
|
||||
After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized
|
||||
backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here.
|
||||
backward function is used for Reformer, the gradients of the output vectors have to be explicitly sorted here.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
@ -2075,7 +2075,7 @@ class ReformerModel(ReformerPreTrainedModel):
|
||||
device=device,
|
||||
)
|
||||
|
||||
# start index for postion encoding depends on incremental decoding
|
||||
# start index for position encoding depends on incremental decoding
|
||||
if past_buckets_states is not None:
|
||||
start_idx_pos_encodings = past_buckets_states[0][1].shape[1]
|
||||
else:
|
||||
|
@ -79,7 +79,7 @@ RETRIBERT_START_DOCSTRING = r"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""Bert Based model to embed queries or document for document retreival. """,
|
||||
"""Bert Based model to embed queries or document for document retrieval. """,
|
||||
RETRIBERT_START_DOCSTRING,
|
||||
)
|
||||
class RetriBertModel(RetriBertPreTrainedModel):
|
||||
@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
|
||||
attention_mask, input_shape, device
|
||||
)
|
||||
|
||||
# define function for cehckpointing
|
||||
# define function for checkpointing
|
||||
def partial_encode(*inputs):
|
||||
encoder_outputs = sent_encoder.encoder(
|
||||
inputs[0],
|
||||
@ -200,7 +200,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
|
||||
|
||||
Return:
|
||||
:obj:`torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
|
||||
its corresponding document and each cocument to its corresponding query in the batch
|
||||
its corresponding document and each document to its corresponding query in the batch
|
||||
"""
|
||||
device = input_ids_query.device
|
||||
q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
|
||||
|
Some files were not shown because too many files have changed in this diff.