Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-04 05:10:06 +06:00)
Fix doc errors and typos across the board (#8139)
* Fix doc errors and typos across the board
* Fix a typo
* Fix the CI
* Fix more typos
* Fix CI
* More fixes
* Fix CI
* More fixes
* More fixes
This commit is contained in:
parent
4731a00c3e
commit
969859d5f6
@@ -96,7 +96,7 @@ folder.
 
 ## Start contributing! (Pull Requests)
 
-Before writing code, we strongly advise you to search through the exising PRs or
+Before writing code, we strongly advise you to search through the existing PRs or
 issues to make sure that nobody is already working on the same thing. If you are
 unsure, it is always a good idea to open an issue to get some feedback.
 
@@ -235,7 +235,7 @@ Follow these steps to start contributing:
 ### Checklist
 
 1. The title of your pull request should be a summary of its contribution;
-2. If your pull request adresses an issue, please mention the issue number in
+2. If your pull request addresses an issue, please mention the issue number in
    the pull request description to make sure they are linked (and people
    consulting the issue know you are working on it);
 3. To indicate a work in progress please prefix the title with `[WIP]`. These
@@ -80,9 +80,9 @@ cache home followed by ``/transformers/`` (even if you don't have PyTorch instal
 So if you don't have any specific environment variable set, the cache directory will be at
 ``~/.cache/torch/transformers/``.
 
-**Note:** If you have set a shell enviromnent variable for one of the predecessors of this library
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
-enviromnent variable for ``TRANSFORMERS_CACHE``.
+environment variable for ``TRANSFORMERS_CACHE``.
 
 ### Note on model downloads (Continuous Integration or large-scale deployments)
 
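The hunk above documents where the cache lives and which environment variable wins. As a hedged illustration that is not part of the commit, pointing the cache somewhere else can look like this; the path is made up:

```python
# Illustrative sketch: set TRANSFORMERS_CACHE before the library is used so
# downloads land in a custom directory (the path below is hypothetical).
import os

os.environ["TRANSFORMERS_CACHE"] = "/data/hf-cache"

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-uncased")  # cached under /data/hf-cache
```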
@@ -20,7 +20,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
 
 The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
 
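For readers following the migration note above, a minimal sketch of the tuple-output behaviour it describes; the model choice and input are illustrative, and it assumes a library version from this era in which the forward pass returns a plain tuple:

```python
# Illustrative sketch of the tuple output described in the hunk above.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)        # a tuple, not a single tensor
last_hidden_states = outputs[0]      # the first element replaces the old single output
```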
@@ -109,7 +109,7 @@ for batch in train_data:
     loss.backward()
     optimizer.step()
 
-### In 🤗 Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
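The hunk stops at the "### and used like this:" comment. A hedged sketch of the usage step that typically follows; the variables `model`, `train_data` and `max_grad_norm` are assumed from the surrounding guide and are not shown in this diff:

```python
# Hedged sketch: optimizer step followed by scheduler step, as the comment above announces.
for batch in train_data:
    loss = model(**batch)[0]        # tuple output: the loss comes first when labels are passed
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # clipping is no longer inside the optimizer
    optimizer.step()
    scheduler.step()                # update the learning-rate schedule
    optimizer.zero_grad()
```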
@@ -119,7 +119,7 @@ Other files can safely be deleted.
 Upload your model with the CLI
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Now go in a terminal and run the following command. It should be in the virtual enviromnent where you installed 🤗
+Now go in a terminal and run the following command. It should be in the virtual environment where you installed 🤗
 Transformers, since that command :obj:`transformers-cli` comes from the library.
 
 .. code-block::
@@ -510,8 +510,8 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config
 
 
 Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am
-concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overriden in the pipeline,
-as is shown above for the argument ``max_length``.
+concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overridden in the
+pipeline, as is shown above for the argument ``max_length``.
 
 Here is an example of text generation using ``XLNet`` and its tokenzier.
 
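Because the hunk refers to overriding ``generate()`` defaults through the pipeline, here is a minimal sketch of such a call; the prompt mirrors the one quoted above, and which checkpoint the default pipeline downloads is an assumption:

```python
# Illustrative sketch: keyword arguments passed to the pipeline call are
# forwarded to generate(), so defaults such as max_length can be overridden.
from transformers import pipeline

text_generator = pipeline("text-generation")
print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
```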
@@ -291,10 +291,9 @@ def hans_convert_examples_to_features(
 
     Args:
         examples: List of ``InputExamples`` containing the examples.
-        tokenizer: Instance of a tokenizer that will tokenize the examples.
-        max_length: Maximum example length.
         label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
+        max_length: Maximum example length.
+        tokenizer: Instance of a tokenizer that will tokenize the examples.
 
     Returns:
         A list of task-specific ``InputFeatures`` which can be fed to the model.
@@ -155,7 +155,7 @@ class BertModelWithPabee(BertModel):
         extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.config.is_decoder and encoder_hidden_states is not None:
             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -198,7 +198,7 @@ class DeeBertModel(BertPreTrainedModel):
         extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if encoder_attention_mask.dim() == 3:
             encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
         if encoder_attention_mask.dim() == 2:
@@ -260,7 +260,7 @@ class BertHighway(nn.Module):
 
         # BertModel
         bmodel_output = (pooler_input, pooler_output) + encoder_outputs[1:]
-        # "return" bodel_output
+        # "return" bmodel_output
 
         # Dropout and classification
         pooled_output = bmodel_output[1]
@@ -265,7 +265,7 @@ class Distiller:
         -------
         token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
         attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-        clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
+        clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -401,9 +401,9 @@ class Distiller:
         # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
         # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
         if self.params.restrict_ce_to_mask:
-            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
         else:
-            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_lenth, voc_size)
+            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
         s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
         s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
         t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
@@ -61,7 +61,7 @@ class LmSeqsDataset(Dataset):
 
     def remove_long_sequences(self):
         """
-        Sequences that are too long are splitted by chunk of max_model_input_size.
+        Sequences that are too long are split by chunk of max_model_input_size.
         """
         max_len = self.params.max_model_input_size
         indices = self.lengths > max_len
@@ -138,8 +138,8 @@ class LmSeqsDataset(Dataset):
         # logger.info(f'{data_len} tokens ({nb_unique_tokens} unique)')
 
         # unk_idx = self.params.special_tok_ids['unk_token']
-        # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
-        # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
+        # nb_unknown = sum([(t==unk_idx).sum() for t in self.token_ids])
+        # logger.info(f'{nb_unknown} unknown tokens (covering {100*nb_unknown/data_len:.2f}% of the data)')
 
     def batch_sequences(self, batch):
         """
@@ -96,7 +96,7 @@ if __name__ == "__main__":
     compressed_sd["lm_head.weight"] = state_dict["lm_head.weight"]
 
     print(f"N layers selected for distillation: {std_idx}")
-    print(f"Number of params transfered for distillation: {len(compressed_sd.keys())}")
+    print(f"Number of params transferred for distillation: {len(compressed_sd.keys())}")
 
-    print(f"Save transfered checkpoint to {args.dump_checkpoint}.")
+    print(f"Save transferred checkpoint to {args.dump_checkpoint}.")
     torch.save(compressed_sd, args.dump_checkpoint)
@@ -266,14 +266,14 @@ def find_top_rpn_proposals(
 ):
     """Args:
         proposals (list[Tensor]): (L, N, Hi*Wi*A, 4).
-        pred_objectness_logits: tensors of lenngth L.
+        pred_objectness_logits: tensors of length L.
         nms_thresh (float): IoU threshold to use for NMS
         pre_nms_topk (int): before nms
         post_nms_topk (int): after nms
         min_box_side_len (float): minimum proposal box side
         training (bool): True if proposals are to be used in training,
     Returns:
-        resuls (List[Dict]): stores post_nms_topk object proposals for image i.
+        results (List[Dict]): stores post_nms_topk object proposals for image i.
     """
     num_images = len(images)
     device = proposals[0].device
@@ -648,7 +648,7 @@ class RPNOutputs(object):
             images (ImageList): :class:`ImageList` instance representing N input images
             pred_objectness_logits (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A, Hi, W)
             pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape (N, A*4, Hi, Wi)
-            anchors (list[torch.Tensor]): nested list ofboxes. anchors[i][j] at (n, l) stores anchor array for feature map l
+            anchors (list[torch.Tensor]): nested list of boxes. anchors[i][j] at (n, l) stores anchor array for feature map l
             boundary_threshold (int): if >= 0, then anchors that extend beyond the image boundary by more than boundary_thresh are not used in training.
             gt_boxes (list[Boxes], optional): A list of N elements.
             smooth_l1_beta (float): The transition point between L1 and L2 lossn. When set to 0, the loss becomes L1. When +inf, it is ignored
@@ -1186,7 +1186,7 @@ class ROIOutputs(object):
         attr_probs_all, attrs_all = self._predict_attrs(attr_logits, preds_per_image)
         features = features.split(preds_per_image, dim=0)
 
-        # fun for each image too, also I can expirement and do multiple images
+        # fun for each image too, also I can experiment and do multiple images
         final_results = []
         zipped = zip(boxes_all, obj_scores_all, attr_probs_all, attrs_all, sizes)
         for i, (boxes, obj_scores, attr_probs, attrs, size) in enumerate(zipped):
@@ -1412,7 +1412,7 @@ class AnchorGenerator(nn.Module):
 
     def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)):
         """
-        anchors are continious geometric rectangles
+        anchors are continuous geometric rectangles
         centered on one feature map point sample.
         We can later build the set of anchors
         for the entire feature map by tiling these tensors
@@ -1865,7 +1865,7 @@ class GeneralizedRCNN(nn.Module):
         scales_yx=None,
         **kwargs,
     ):
-        # run images through bacbone
+        # run images through backbone
         original_sizes = image_shapes * scales_yx
         features = self.backbone(images)
 
@@ -116,7 +116,7 @@ class Preprocess:
         images = self.aug(images)
         # transpose images and convert to torch tensors
         # images = [torch.as_tensor(i.astype("float32")).permute(2, 0, 1).to(self.device) for i in images]
-        # now normalize before pad to aoid useless arithmatic
+        # now normalize before pad to avoid useless arithmetic
         images = [self.normalizer(x) for x in images]
         # now pad them to do the following operations
         images, sizes = self.pad(images)
@@ -236,7 +236,7 @@ def compare(in_tensor):
     ), f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} % element-wise mismatch"
     raise Exception("tensors are all good")
 
-# Hugging face functiions below
+# Hugging face functions below
 
 
 def is_remote_url(url_or_filename):
@@ -520,7 +520,7 @@ def get_image_from_url(url):
     return img
 
 
-# to load legace frcnn checkpoint from detectron
+# to load legacy frcnn checkpoint from detectron
 def load_frcnn_pkl_from_url(url):
     fn = url.split("/")[-1]
     if fn not in os.listdir(os.getcwd()):
@@ -33,7 +33,7 @@ def main(args):
     remaining_count = 0  # Number of remaining (not pruned) params in the encoder
     encoder_count = 0  # Number of params in the encoder
 
-    print("name".ljust(60, " "), "Remaining Weights %", "Remaning Weight")
+    print("name".ljust(60, " "), "Remaining Weights %", "Remaining Weight")
     for name, param in st.items():
         if "encoder" not in name:
             continue
@@ -591,7 +591,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
         # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if self.config.is_decoder and encoder_hidden_states is not None:
             encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
@@ -631,7 +631,7 @@ class MaskedBertModel(MaskedBertPreTrainedModel):
             )  # We can specify head_mask for each layer
             head_mask = head_mask.to(
                 dtype=next(self.parameters()).dtype
-            )  # switch to fload if need + fp16 compatibility
+            )  # switch to float if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
@@ -225,7 +225,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         desc="Epoch",
         disable=args.local_rank not in [-1, 0],
     )
-    set_seed(args)  # Added here for reproductibility
+    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
@@ -705,7 +705,7 @@ def main():
         "--final_lambda",
         default=0.0,
         type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
     )
 
     parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -816,7 +816,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
@@ -231,7 +231,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    # Added here for reproductibility
+    # Added here for reproducibility
     set_seed(args)
 
     for _ in train_iterator:
@@ -824,7 +824,7 @@ def main():
         "--final_lambda",
         default=0.0,
         type=float,
-        help="Regularization intensity (used in conjunction with `regulariation`.",
+        help="Regularization intensity (used in conjunction with `regularization`.",
     )
 
     parser.add_argument("--global_topk", action="store_true", help="Global TopK on the Scores.")
@@ -977,7 +977,7 @@ def main():
     if args.local_rank == -1 or args.no_cuda:
         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
         args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
         torch.cuda.set_device(args.local_rank)
         device = torch.device("cuda", args.local_rank)
         torch.distributed.init_process_group(backend="nccl")
@@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
 class RagPyTorchDistributedRetriever(RagRetriever):
     """
     A distributed retriever built on top of the ``torch.distributed`` communication package. During training all workers
-    initalize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
+    initialize their own instance of the retriever, however, only the main worker loads the index into memory. The index is stored
     in cpu memory. The index will also work well in a non-distributed setup.
 
     Args:
@@ -45,7 +45,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
 
     def init_retrieval(self, distributed_port: int):
         """
-        Retriever initalization function, needs to be called from the training process. The function sets some common parameters
+        Retriever initialization function, needs to be called from the training process. The function sets some common parameters
         and environment variables. On top of that, (only) the main process in the process group loads the index into memory.
 
         Args:
@@ -56,7 +56,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
 
         logger.info("initializing retrieval")
 
-        # initializing a separate process group for retrievel as the default
+        # initializing a separate process group for retrieval as the default
         # nccl backend doesn't support gather/scatter operations while gloo
         # is too slow to replace nccl for the core gpu communication
         if dist.is_initialized():
@@ -101,7 +101,7 @@ class RagPyTorchDistributedRetriever(RagRetriever):
             n_docs (:obj:`int`):
                 The number of docs retrieved per query.
 
-        Ouput:
+        Output:
             retrieved_doc_embeds (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`
                 The retrieval embeddings of the retrieved docs per query.
             doc_ids (:obj:`np.ndarray` of shape :obj:`batch_size, n_docs`)
@@ -176,7 +176,7 @@ def get_args():
         choices=["e2e", "retrieval"],
         default="e2e",
         type=str,
-        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calulates precision@k.",
+        help="Evaluation mode, e2e calculates exact match and F1 of the downstream task, retrieval calculates precision@k.",
     )
     parser.add_argument("--k", default=1, type=int, help="k for the precision@k calculation")
     parser.add_argument(
@@ -206,7 +206,7 @@ def get_args():
         "--predictions_path",
         type=str,
         default="predictions.txt",
-        help="Name of the predictions file, to be stored in the checkpoints directry",
+        help="Name of the predictions file, to be stored in the checkpoints directory",
     )
     parser.add_argument(
         "--eval_all_checkpoints",
@@ -26,7 +26,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 def split_text(text: str, n=100, character=" ") -> List[str]:
-    """Split the text every ``n``-th occurence of ``character``"""
+    """Split the text every ``n``-th occurrence of ``character``"""
     text = text.split(character)
     return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]
 
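Since ``split_text`` appears here in full, a quick hypothetical call (not part of the commit) shows the chunking it performs:

```python
# Hypothetical usage of the split_text helper quoted above:
# every n whitespace-separated tokens become one chunk.
print(split_text("one two three four five", n=2))
# ['one two', 'three four', 'five']
```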
@@ -44,7 +44,7 @@ class BertAbsConfig(PretrainedConfig):
         enc_ff_size: int
             The size of the encoder's feed-forward layers.
         enc_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
             embeddings, layers, pooler and also the attention probabilities in
             the encoder.
         dec_layer: int
@@ -56,7 +56,7 @@ class BertAbsConfig(PretrainedConfig):
         dec_ff_size: int
             The size of the decoder's feed-forward layers.
         dec_dropout: int
-            The dropout probabilitiy for all fully connected layers in the
+            The dropout probability for all fully connected layers in the
             embeddings, layers, pooler and also the attention probabilities in
             the decoder.
     """
@@ -152,7 +152,7 @@ class TransformerDecoder(nn.Module):
        dropout (float): dropout parameters
        embeddings (:obj:`onmt.modules.Embeddings`):
           embeddings to use, should have positional encodings
-       attn_type (str): if using a seperate copy attention
+       attn_type (str): if using a separate copy attention
     """
 
     def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
@@ -817,11 +817,7 @@ class Translator(object):
 
         Args:
            batch (:obj:`Batch`): a batch from a dataset object
-           data (:obj:`Dataset`): the dataset object
            fast (bool): enables fast beam search (may not support all features)
-
-        Todo:
-           Shouldn't need the original dataset.
         """
         with torch.no_grad():
             return self._fast_translate_batch(batch, self.max_length, min_length=self.min_length)
@@ -12,7 +12,7 @@ def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None
     state_dict = torch.load(src_path, map_location=map_location)
     for k, v in tqdm(state_dict.items()):
         if not isinstance(v, torch.Tensor):
-            raise TypeError("FP16 conversion only works on paths that are saved state dics, like pytorch_model.bin")
+            raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
         state_dict[k] = v.half()
     if save_path is None:  # overwrite src_path
         save_path = src_path
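For context only, the signature in the hunk header suggests an invocation like the following; the file name is illustrative and not taken from the diff:

```python
# Hypothetical call to the convert() helper above: cast every tensor of a saved
# state dict to fp16 and, with save_path omitted, overwrite the original file.
convert("pytorch_model.bin", map_location="cpu")
```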
@@ -7,7 +7,7 @@ language: ar
 
 **AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT PAPER](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)
 
-There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
 
 The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publically available large scale raw arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/))
 
@@ -7,7 +7,7 @@ language: ar
 
 **AraBERT** is an Arabic pretrained lanaguage model based on [Google's BERT architechture](https://github.com/google-research/bert). AraBERT uses the same BERT-Base config. More details are available in the [AraBERT PAPER](https://arxiv.org/abs/2003.00104v2) and in the [AraBERT Meetup](https://github.com/WissamAntoun/pydata_khobar_meetup)
 
-There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were splitted using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
+There are two version off the model AraBERTv0.1 and AraBERTv1, with the difference being that AraBERTv1 uses pre-segmented text where prefixes and suffixes were split using the [Farasa Segmenter](http://alt.qcri.org/farasa/segmenter.html).
 
 The model was trained on ~70M sentences or ~23GB of Arabic text with ~3B words. The training corpora are a collection of publically available large scale raw arabic text ([Arabic Wikidumps](https://archive.org/details/arwiki-20190201), [The 1.5B words Arabic Corpus](https://www.semanticscholar.org/paper/1.5-billion-words-Arabic-Corpus-El-Khair/f3eeef4afb81223df96575adadf808fe7fe440b4), [The OSIAN Corpus](https://www.aclweb.org/anthology/W19-4619), Assafir news articles, and 4 other manually crawled news websites (Al-Akhbar, Annahar, AL-Ahram, AL-Wafd) from [the Wayback Machine](http://web.archive.org/))
 
@@ -4,7 +4,7 @@ tags:
 ---
 
 ## CS224n SQuAD2.0 Project Dataset
-The goal of this model is to save CS224n students GPU time when establising
+The goal of this model is to save CS224n students GPU time when establishing
 baselines to beat for the [Default Final Project](http://web.stanford.edu/class/cs224n/project/default-final-project-handout.pdf).
 The training set used to fine-tune this model is the same as
 the [official one](https://rajpurkar.github.io/SQuAD-explorer/); however,
@@ -34,7 +34,7 @@ model = AutoModelWithLMHead.from_pretrained("jannesg/takalane_afr_roberta")
 
 #### Limitations and bias
 
-Updates will be added continously to improve performance.
+Updates will be added continuously to improve performance.
 
 ## Training data
 
@@ -94,7 +94,7 @@ fill_mask(PYTHON_CODE3)
 
 > Great! 🎉
 
-## This work is heavely inspired on [CodeBERTa](https://github.com/huggingface/transformers/blob/master/model_cards/huggingface/CodeBERTa-small-v1/README.md) by huggingface team
+## This work is heavily inspired on [CodeBERTa](https://github.com/huggingface/transformers/blob/master/model_cards/huggingface/CodeBERTa-small-v1/README.md) by huggingface team
 
 <br>
 
@@ -11,7 +11,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp
 
 - [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora)
 
-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)
 
 | Dataset | # Examples |
 | ---------------------- | ----- |
@@ -65,7 +65,7 @@ Citation:
 
 </details>
 
-As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
 
 | Dataset     | # samples |
 | ----------- | --------- |
@@ -65,7 +65,7 @@ Citation:
 
 </details>
 
-As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As **XQuAD** is just an evaluation dataset, I used `Data augmentation techniques` (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
 
 | Dataset     | # samples |
 | ----------- | --------- |
@@ -11,7 +11,7 @@ This model is a fine-tuned on [NER-C](https://www.kaggle.com/nltkdata/conll-corp
 
 - [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora)
 
-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)
 
 | Dataset | # Examples |
 | ---------------------- | ----- |
@@ -11,7 +11,7 @@ This model is a fine-tuned on Spanish [CONLL CORPORA](https://www.kaggle.com/nlt
 
 - [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) with data augmentation techniques
 
-I preprocessed the dataset and splitted it as train / dev (80/20)
+I preprocessed the dataset and split it as train / dev (80/20)
 
 | Dataset | # Examples |
 | ---------------------- | ----- |
@@ -44,7 +44,7 @@ python transformers/examples/question-answering/run_squad.py \
   --save_steps 1000
 ```
 
-It is importatnt to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
+It is important to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
 
 ## Test set Results 🧾
 
@@ -44,7 +44,7 @@ python transformers/examples/question-answering/run_squad.py \
   --version_2_with_negative
 ```
 
-It is importatnt to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
+It is important to say that this models converges much faster than other ones. So, it is also cheap to fine-tune.
 
 ## Test set Results 🧾
 
@@ -48,7 +48,7 @@ python code/run_squad.py \
 | SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
@@ -54,7 +54,7 @@ python code/run_squad.py \
 | SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
@@ -45,7 +45,7 @@ python code/run_tacred.py \
 | SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
@@ -48,7 +48,7 @@ python code/run_squad.py \
 | SpanBERT (large) | **94.6** (this) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
@@ -54,7 +54,7 @@ python code/run_squad.py \
 | SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | **88.7** (this) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 ## Model in action
 
@@ -45,7 +45,7 @@ python code/run_tacred.py \
 | SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | **70.8** (this one) |
 
 
-Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
+Note: The numbers marked as * are evaluated on the development sets because those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers.
 
 
 > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488)
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql
 model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql-to-en")
 
 def get_explanation(query):
-  input_text = "translante Sql to English: %s </s>" % query
+  input_text = "translate Sql to English: %s </s>" % query
   features = tokenizer([input_text], return_tensors='pt')
 
   output = model.generate(input_ids=features['input_ids'],
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
 model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL")
 
 def get_sql(query):
-  input_text = "translante English to SQL: %s </s>" % query
+  input_text = "translate English to SQL: %s </s>" % query
   features = tokenizer([input_text], return_tensors='pt')
 
   output = model.generate(input_ids=features['input_ids'],
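The quoted snippet is cut off at the ``generate()`` call. A hedged sketch of how such a helper is usually completed; the ``attention_mask`` argument and the decoding line are assumptions, not shown in this diff:

```python
# Hypothetical completion of the truncated get_sql() helper above.
def get_sql(query):
    input_text = "translate English to SQL: %s </s>" % query
    features = tokenizer([input_text], return_tensors='pt')
    output = model.generate(input_ids=features['input_ids'],
                            attention_mask=features['attention_mask'])
    return tokenizer.decode(output[0])
```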
@@ -50,7 +50,7 @@ tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-small-finetuned-wikiSQL")
 model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-small-finetuned-wikiSQL")
 
 def get_sql(query):
-  input_text = "translante English to SQL: %s </s>" % query
+  input_text = "translate English to SQL: %s </s>" % query
   features = tokenizer([input_text], return_tensors='pt')
 
   output = model.generate(input_ids=features['input_ids'],
@@ -71,7 +71,7 @@ Citation:
 
 </details>
 
-As XQuAD is just an evaluation dataset, I used Data augmentation techniques (scraping, neural machine translation, etc) to obtain more samples and splited the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
+As XQuAD is just an evaluation dataset, I used Data augmentation techniques (scraping, neural machine translation, etc) to obtain more samples and split the dataset in order to have a train and test set. The test set was created in a way that contains the same number of samples for each language. Finally, I got:
 
 | Dataset     | # samples |
 | ----------- | --------- |
@ -172,7 +172,7 @@ class MemorySummary(NamedTuple):
|
|||||||
`MemorySummary` namedtuple otherwise with the fields:
|
`MemorySummary` namedtuple otherwise with the fields:
|
||||||
|
|
||||||
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
|
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
|
||||||
substracting the memory after executing each line from the memory before executing said line.
|
subtracting the memory after executing each line from the memory before executing said line.
|
||||||
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
|
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each line
|
||||||
obtained by summing repeated memory increase for a line if it's executed several times. The list is sorted
|
obtained by summing repeated memory increase for a line if it's executed several times. The list is sorted
|
||||||
from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory
|
from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory
|
||||||
@ -208,7 +208,7 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
- `max_memory`: (`int`) cosumed memory peak in Bytes
|
- `max_memory`: (`int`) consumed memory peak in Bytes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def get_cpu_memory(process_id: int) -> int:
|
def get_cpu_memory(process_id: int) -> int:
|
||||||
@ -221,7 +221,7 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
|
|
||||||
- `memory`: (`int`) cosumed memory in Bytes
|
- `memory`: (`int`) consumed memory in Bytes
|
||||||
"""
|
"""
|
||||||
process = psutil.Process(process_id)
|
process = psutil.Process(process_id)
|
||||||
try:
|
try:
|
||||||
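Since the two hunks above only show fragments of `measure_peak_memory_cpu` and its `get_cpu_memory` helper, here is a rough, self-contained sketch of what such a helper does. It assumes `psutil` is installed and that the measurement is based on the resident set size; both are assumptions, not the library's exact implementation.

```python
import os

import psutil

def get_cpu_memory(process_id: int) -> int:
    # Return the consumed memory of the process in bytes (resident set size)
    process = psutil.Process(process_id)
    return process.memory_info().rss

# Measure the current process as a quick sanity check
print(f"{get_cpu_memory(os.getpid())} bytes")
```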
@ -367,7 +367,7 @@ def start_memory_tracing(
|
|||||||
devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
|
devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
|
||||||
nvml.nvmlShutdown()
|
nvml.nvmlShutdown()
|
||||||
except (OSError, nvml.NVMLError):
|
except (OSError, nvml.NVMLError):
|
||||||
logger.warning("Error while initializing comunication with GPU. " "We won't perform GPU memory tracing.")
|
logger.warning("Error while initializing communication with GPU. " "We won't perform GPU memory tracing.")
|
||||||
log_gpu = False
|
log_gpu = False
|
||||||
else:
|
else:
|
||||||
log_gpu = is_torch_available() or is_tf_available()
|
log_gpu = is_torch_available() or is_tf_available()
|
||||||
@ -472,9 +472,10 @@ def stop_memory_tracing(
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
||||||
- `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert in summary
|
`memory_trace` (optional output of start_memory_tracing, default: None):
|
||||||
- `ignore_released_memory` (boolean, default: None): if True we only sum memory increase to compute total
|
memory trace to convert in summary
|
||||||
memory
|
`ignore_released_memory` (boolean, default: None):
|
||||||
|
if True we only sum memory increase to compute total memory
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
|
|
||||||
@ -482,7 +483,7 @@ def stop_memory_tracing(
|
|||||||
- `MemorySummary` namedtuple otherwise with the fields:
|
- `MemorySummary` namedtuple otherwise with the fields:
|
||||||
|
|
||||||
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
|
- `sequential`: a list of `MemoryState` namedtuple (see below) computed from the provided `memory_trace` by
|
||||||
substracting the memory after executing each line from the memory before executing said line.
|
subtracting the memory after executing each line from the memory before executing said line.
|
||||||
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each
|
- `cumulative`: a list of `MemoryState` namedtuple (see below) with cumulative increase in memory for each
|
||||||
line obtained by summing repeated memory increase for a line if it's executed several times. The list is
|
line obtained by summing repeated memory increase for a line if it's executed several times. The list is
|
||||||
sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative
|
sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative
|
||||||
|
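For readers following the `start_memory_tracing` / `stop_memory_tracing` docstring fixes above, a hedged usage sketch is shown below. The import path and the `MemoryState` field names (`frame`, `cpu_gpu`) are assumptions based on these docstrings, not guaranteed API.

```python
from transformers import BertModel
from transformers.benchmark.benchmark_utils import start_memory_tracing, stop_memory_tracing

# Trace line-by-line memory usage inside the transformers package (assumed argument)
trace = start_memory_tracing("transformers")
model = BertModel.from_pretrained("bert-base-uncased")
summary = stop_memory_tracing(trace, ignore_released_memory=True)

# `sequential` and `cumulative` are the MemorySummary fields described above
for state in summary.cumulative[:5]:
    print(state.frame.filename, state.frame.line_number, state.cpu_gpu)
```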
@ -41,7 +41,7 @@ class ConvertCommand(BaseTransformersCLICommand):
|
|||||||
"--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
|
"--tf_checkpoint", type=str, required=True, help="TensorFlow checkpoint path or folder."
|
||||||
)
|
)
|
||||||
train_parser.add_argument(
|
train_parser.add_argument(
|
||||||
"--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch savd model output."
|
"--pytorch_dump_output", type=str, required=True, help="Path to the PyTorch saved model output."
|
||||||
)
|
)
|
||||||
train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
|
train_parser.add_argument("--config", type=str, default="", help="Configuration file path or folder.")
|
||||||
train_parser.add_argument(
|
train_parser.add_argument(
|
||||||
|
@ -61,7 +61,7 @@ class BartConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -76,7 +76,7 @@ class BertConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
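Since several of the following hunks touch the same configuration docstring wording, a short illustration of the parameters being documented may help; the values are just the defaults mentioned in the docstring, not recommendations.

```python
from transformers import BertConfig, BertModel

config = BertConfig(
    hidden_act="gelu",                 # one of "gelu", "relu", "swish", "gelu_new"
    hidden_dropout_prob=0.1,           # dropout for embeddings, encoder and pooler
    attention_probs_dropout_prob=0.1,  # dropout for the attention probabilities
    max_position_embeddings=512,       # maximum sequence length the model supports
)
model = BertModel(config)
print(model.config.hidden_act)
```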
@ -42,7 +42,7 @@ class BertGenerationConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
@ -62,7 +62,7 @@ class BertGenerationConfig(PretrainedConfig):
|
|||||||
>>> # Initializing a BertGeneration config
|
>>> # Initializing a BertGeneration config
|
||||||
>>> configuration = BertGenerationConfig()
|
>>> configuration = BertGenerationConfig()
|
||||||
|
|
||||||
>>> # Initializing a modelfrom the config
|
>>> # Initializing a model from the config
|
||||||
>>> model = BertGenerationEncoder(configuration)
|
>>> model = BertGenerationEncoder(configuration)
|
||||||
|
|
||||||
>>> # Accessing the model configuration
|
>>> # Accessing the model configuration
|
||||||
|
@ -58,7 +58,7 @@ class BlenderbotConfig(BartConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -55,7 +55,7 @@ class DebertaConfig(PretrainedConfig):
|
|||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
|
||||||
:obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
|
:obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
@ -61,7 +61,7 @@ class DistilBertConfig(PretrainedConfig):
|
|||||||
hidden_dim (:obj:`int`, `optional`, defaults to 3072):
|
hidden_dim (:obj:`int`, `optional`, defaults to 3072):
|
||||||
The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
|
activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
|
||||||
|
@ -57,7 +57,7 @@ class DPRConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
@ -62,7 +62,7 @@ class ElectraConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
@ -59,11 +59,11 @@ class FlaubertConfig(XLMConfig):
|
|||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probability for the attention mechanism
|
The dropout probability for the attention mechanism
|
||||||
gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to use a `gelu` actibation instead of `relu`.
|
Whether or not to use a `gelu` activation instead of `relu`.
|
||||||
sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
|
Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
|
||||||
causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not the model shoul behave in a causal manner. Causal models use a triangular attention mask in
|
Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
|
||||||
order to only attend to the left-side context instead if a bidirectional context.
|
order to only attend to the left-side context instead of a bidirectional context.
|
||||||
asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
|
Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
|
||||||
|
@ -73,7 +73,7 @@ class FSMTConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -68,7 +68,7 @@ class FunnelConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probability for the attention probabilities.
|
The dropout probability for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -54,7 +54,7 @@ class LayoutLMConfig(BertConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
@ -57,7 +57,7 @@ class LxmertConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
@ -95,10 +95,9 @@ class LxmertConfig(PretrainedConfig):
|
|||||||
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
|
Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
|
||||||
objective.
|
objective.
|
||||||
task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to add object predicition, attribute predicition and feature regression to the loss
|
Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
|
||||||
objective.
|
|
||||||
task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to add the question-asnwering loss to the objective
|
Whether or not to add the question-answering loss to the objective
|
||||||
visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to calculate the object-prediction loss objective
|
Whether or not to calculate the object-prediction loss objective
|
||||||
visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
@ -106,10 +105,10 @@ class LxmertConfig(PretrainedConfig):
|
|||||||
visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to calculate the feature-regression loss objective
|
Whether or not to calculate the feature-regression loss objective
|
||||||
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not the model should return the attentions from the vision, langauge, and cross-modality layers
|
Whether or not the model should return the attentions from the vision, language, and cross-modality layers
|
||||||
should be returned.
|
should be returned.
|
||||||
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not the model should return the hidden states from the vision, langauge, and cross-modality
|
Whether or not the model should return the hidden states from the vision, language, and cross-modality
|
||||||
layers should be returned.
|
layers should be returned.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ class MarianConfig(BartConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -57,7 +57,7 @@ class MBartConfig(BartConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -96,7 +96,7 @@ class PegasusConfig(BartConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
@ -60,7 +60,7 @@ class ProphetNetConfig(PretrainedConfig):
|
|||||||
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
The maximum sequence length that this model might ever be used with. Typically set this to something large
|
||||||
just in case (e.g., 512 or 1024 or 2048).
|
just in case (e.g., 512 or 1024 or 2048).
|
||||||
|
@ -30,7 +30,7 @@ RAG_CONFIG_DOC = r"""
|
|||||||
Separator inserted between the title and the text of the retrieved document when calling
|
Separator inserted between the title and the text of the retrieved document when calling
|
||||||
:class:`~transformers.RagRetriever`.
|
:class:`~transformers.RagRetriever`.
|
||||||
doc_sep (:obj:`str`, `optional`, defaults to ``" // "``):
|
doc_sep (:obj:`str`, `optional`, defaults to ``" // "``):
|
||||||
Separator inserted between the the text of the retrieved document and the original input when calliang
|
Separator inserted between the text of the retrieved document and the original input when calling
|
||||||
:class:`~transformers.RagRetriever`.
|
:class:`~transformers.RagRetriever`.
|
||||||
n_docs (:obj:`int`, `optional`, defaults to 5):
|
n_docs (:obj:`int`, `optional`, defaults to 5):
|
||||||
Number of documents to retrieve.
|
Number of documents to retrieve.
|
||||||
@ -39,7 +39,7 @@ RAG_CONFIG_DOC = r"""
|
|||||||
retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
|
retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
|
||||||
Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
|
Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
|
||||||
retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
|
retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
|
||||||
Retrieval batch size, defined as the number of queries issues concurrently to the faiss index excapsulated
|
Retrieval batch size, defined as the number of queries issued concurrently to the faiss index encapsulated by
|
||||||
:class:`~transformers.RagRetriever`.
|
:class:`~transformers.RagRetriever`.
|
||||||
dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
|
dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
|
||||||
A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
|
A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
|
||||||
|
@ -82,7 +82,7 @@ class ReformerConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the feed forward layer in the residual attention
|
The non-linear activation function (function or string) in the feed forward layer in the residual attention
|
||||||
block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
hidden_size (:obj:`int`, `optional`, defaults to 256):
|
hidden_size (:obj:`int`, `optional`, defaults to 256):
|
||||||
Dimensionality of the output hidden states of the residual attention blocks.
|
Dimensionality of the output hidden states of the residual attention blocks.
|
||||||
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
||||||
|
@ -20,7 +20,7 @@ from .utils import logging
|
|||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
# TODO: uploadto AWS
|
# TODO: upload to AWS
|
||||||
RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
"retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
"retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
|
||||||
}
|
}
|
||||||
@ -51,7 +51,7 @@ class RetriBertConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
@ -52,7 +52,7 @@ class SqueezeBertConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
The non-linear activation function (function or string) in the encoder and pooler. If string,
|
||||||
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
:obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
|
||||||
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
||||||
|
@ -77,7 +77,7 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
|
adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to use adaptive softmax.
|
Whether or not to use adaptive softmax.
|
||||||
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
dropout (:obj:`float`, `optional`, defaults to 0.1):
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||||
dropatt (:obj:`float`, `optional`, defaults to 0):
|
dropatt (:obj:`float`, `optional`, defaults to 0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
|
untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
|
||||||
|
@ -83,7 +83,7 @@ def generate_identified_filename(filename: Path, identifier: str) -> Path:
|
|||||||
filename: pathlib.Path The actual path object we would like to add an identifier suffix
|
filename: pathlib.Path The actual path object we would like to add an identifier suffix
|
||||||
identifier: The suffix to add
|
identifier: The suffix to add
|
||||||
|
|
||||||
Returns: String with concatenated indentifier at the end of the filename
|
Returns: String with concatenated identifier at the end of the filename
|
||||||
"""
|
"""
|
||||||
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
|
return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)
|
||||||
|
|
||||||
|
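The implementation line shown in this hunk is short enough to illustrate directly; the example path below is made up.

```python
from pathlib import Path

def generate_identified_filename(filename: Path, identifier: str) -> Path:
    # Insert the identifier between the file stem and its suffix
    return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix)

print(generate_identified_filename(Path("models/bert.onnx"), "-optimized"))
# models/bert-optimized.onnx
```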
@ -30,7 +30,7 @@ class LightningModel(pl.LightningModule):
|
|||||||
self.num_labels = 2
|
self.num_labels = 2
|
||||||
self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
|
self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels)
|
||||||
|
|
||||||
# implement only because lighning requires to do so
|
# implement only because lightning requires to do so
|
||||||
def forward(self):
|
def forward(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -57,7 +57,7 @@ def convert_longformer_qa_checkpoint_to_pytorch(
|
|||||||
# save model
|
# save model
|
||||||
longformer_for_qa.save_pretrained(pytorch_dump_folder_path)
|
longformer_for_qa.save_pretrained(pytorch_dump_folder_path)
|
||||||
|
|
||||||
print("Conversion succesful. Model saved under {}".format(pytorch_dump_folder_path))
|
print("Conversion successful. Model saved under {}".format(pytorch_dump_folder_path))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -75,7 +75,7 @@ if __name__ == "__main__":
|
|||||||
default=None,
|
default=None,
|
||||||
type=str,
|
type=str,
|
||||||
required=True,
|
required=True,
|
||||||
help="Path the official PyTorch Lighning Checkpoint.",
|
help="Path the official PyTorch Lightning Checkpoint.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
|
"--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
|
||||||
|
@ -34,7 +34,7 @@ class TatoebaConverter:
|
|||||||
|
|
||||||
1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
|
1. convert numpy state dict to hf format (same code as OPUS-MT-Train conversion).
|
||||||
2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
|
2. rename opus model to huggingface format. This means replace each alpha3 code with an alpha2 code if a unique
|
||||||
one existes. e.g. aav-eng -> aav-en, heb-eng -> he-en
|
one exists. e.g. aav-eng -> aav-en, heb-eng -> he-en
|
||||||
3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group
|
3. write a model card containing the original Tatoeba-Challenge/README.md and extra info about alpha3 group
|
||||||
members.
|
members.
|
||||||
"""
|
"""
|
||||||
|
@ -123,7 +123,7 @@ if __name__ == "__main__":
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--force_download",
|
"--force_download",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Re-dowload checkpoints.",
|
help="Re-download checkpoints.",
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
@ -104,7 +104,7 @@ if __name__ == "__main__":
|
|||||||
"--finetuning_task",
|
"--finetuning_task",
|
||||||
default=None,
|
default=None,
|
||||||
type=str,
|
type=str,
|
||||||
help="Name of a task on which the XLNet TensorFloaw model was fine-tuned",
|
help="Name of a task on which the XLNet TensorFlow model was fine-tuned",
|
||||||
)
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
print(args)
|
print(args)
|
||||||
|
@ -330,7 +330,7 @@ class DataCollatorForSOP(DataCollatorForLanguageModeling):
|
|||||||
input_ids, labels, attention_mask = self.mask_tokens(input_ids)
|
input_ids, labels, attention_mask = self.mask_tokens(input_ids)
|
||||||
|
|
||||||
token_type_ids = [example["token_type_ids"] for example in examples]
|
token_type_ids = [example["token_type_ids"] for example in examples]
|
||||||
# size of segment_ids varied because randomness, padding zero to the end as the orignal implementation
|
# size of segment_ids varied because randomness, padding zero to the end as the original implementation
|
||||||
token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
|
token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
|
||||||
|
|
||||||
sop_label_list = [example["sentence_order_label"] for example in examples]
|
sop_label_list = [example["sentence_order_label"] for example in examples]
|
||||||
|
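The comment fix above sits next to a `pad_sequence` call; a standalone illustration of that padding step, with made-up tensors and padding value 0, is sketched below (assuming PyTorch).

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Segment ids of two examples with different lengths, as in DataCollatorForSOP
token_type_ids = [torch.tensor([0, 0, 1, 1]), torch.tensor([0, 1])]
padded = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
print(padded)
# tensor([[0, 0, 1, 1],
#         [0, 1, 0, 0]])
```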
@ -71,7 +71,7 @@ class TextDataset(Dataset):
|
|||||||
tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
|
tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
|
||||||
)
|
)
|
||||||
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
# Note that we are losing the last truncated example here for the sake of simplicity (no padding)
|
||||||
# If your dataset is small, first you should loook for a bigger one :-) and second you
|
# If your dataset is small, first you should look for a bigger one :-) and second you
|
||||||
# can change this behavior by adding (model specific) padding.
|
# can change this behavior by adding (model specific) padding.
|
||||||
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
@ -327,7 +327,7 @@ def squad_convert_examples_to_features(
|
|||||||
padding_strategy: Default to "max_length". Which padding strategy to use
|
padding_strategy: Default to "max_length". Which padding strategy to use
|
||||||
return_dataset: Default False. Either 'pt' or 'tf'.
|
return_dataset: Default False. Either 'pt' or 'tf'.
|
||||||
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
|
if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset
|
||||||
threads: multiple processing threadsa-smi
|
threads: multiple processing threads.
|
||||||
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
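For context on the `padding_strategy`, `return_dataset` and `threads` arguments documented above, a hedged usage sketch follows; the data directory and checkpoint name are placeholders.

```python
from transformers import AutoTokenizer, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = SquadV2Processor().get_dev_examples("path/to/squad")  # placeholder directory

features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    padding_strategy="max_length",  # documented default
    return_dataset="pt",            # returns a torch TensorDataset alongside the features
    threads=4,                      # multiple processing threads
)
```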
@ -527,7 +527,7 @@ def squad_convert_examples_to_features(
|
|||||||
|
|
||||||
class SquadProcessor(DataProcessor):
|
class SquadProcessor(DataProcessor):
|
||||||
"""
|
"""
|
||||||
Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
|
Processor for the SQuAD data set. Overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
|
||||||
version 2.0 of SQuAD, respectively.
|
version 2.0 of SQuAD, respectively.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -245,9 +245,6 @@ class SingleSentenceClassificationProcessor(DataProcessor):
|
|||||||
Args:
|
Args:
|
||||||
tokenizer: Instance of a tokenizer that will tokenize the examples
|
tokenizer: Instance of a tokenizer that will tokenize the examples
|
||||||
max_length: Maximum example length
|
max_length: Maximum example length
|
||||||
task: GLUE task
|
|
||||||
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
|
|
||||||
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
|
|
||||||
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
|
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
|
||||||
pad_token: Padding token
|
pad_token: Padding token
|
||||||
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
|
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
|
||||||
|
@ -89,7 +89,7 @@ try:
|
|||||||
# Check we're not importing a "datasets" directory somewhere
|
# Check we're not importing a "datasets" directory somewhere
|
||||||
_datasets_available = hasattr(datasets, "__version__") and hasattr(datasets, "load_dataset")
|
_datasets_available = hasattr(datasets, "__version__") and hasattr(datasets, "load_dataset")
|
||||||
if _datasets_available:
|
if _datasets_available:
|
||||||
logger.debug(f"Succesfully imported datasets version {datasets.__version__}")
|
logger.debug(f"Successfully imported datasets version {datasets.__version__}")
|
||||||
else:
|
else:
|
||||||
logger.debug("Imported a datasets object but this doesn't seem to be the 🤗 datasets library.")
|
logger.debug("Imported a datasets object but this doesn't seem to be the 🤗 datasets library.")
|
||||||
|
|
||||||
@ -147,7 +147,7 @@ try:
|
|||||||
import faiss # noqa: F401
|
import faiss # noqa: F401
|
||||||
|
|
||||||
_faiss_available = True
|
_faiss_available = True
|
||||||
logger.debug(f"Succesfully imported faiss version {faiss.__version__}")
|
logger.debug(f"Successfully imported faiss version {faiss.__version__}")
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_faiss_available = False
|
_faiss_available = False
|
||||||
|
|
||||||
@ -290,7 +290,7 @@ def torch_only_method(fn):
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
DATASETS_IMPORT_ERROR = """
|
DATASETS_IMPORT_ERROR = """
|
||||||
{0} requires the 🤗 Datasets library but it was not found in your enviromnent. You can install it with:
|
{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
|
||||||
```
|
```
|
||||||
pip install datasets
|
pip install datasets
|
||||||
```
|
```
|
||||||
@ -308,7 +308,7 @@ that python file if that's the case.
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
TOKENIZERS_IMPORT_ERROR = """
|
TOKENIZERS_IMPORT_ERROR = """
|
||||||
{0} requires the 🤗 Tokenizers library but it was not found in your enviromnent. You can install it with:
|
{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with:
|
||||||
```
|
```
|
||||||
pip install tokenizers
|
pip install tokenizers
|
||||||
```
|
```
|
||||||
@ -321,30 +321,30 @@ In a notebook or a colab, you can install it by executing a cell with
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
SENTENCEPIECE_IMPORT_ERROR = """
|
SENTENCEPIECE_IMPORT_ERROR = """
|
||||||
{0} requires the SentencePiece library but it was not found in your enviromnent. Checkout the instructions on the
|
{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
|
||||||
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
|
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
|
||||||
that match your enviromnent.
|
that match your environment.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
FAISS_IMPORT_ERROR = """
|
FAISS_IMPORT_ERROR = """
|
||||||
{0} requires the faiss library but it was not found in your enviromnent. Checkout the instructions on the
|
{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the
|
||||||
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
|
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
|
||||||
that match your enviromnent.
|
that match your environment.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
PYTORCH_IMPORT_ERROR = """
|
PYTORCH_IMPORT_ERROR = """
|
||||||
{0} requires the PyTorch library but it was not found in your enviromnent. Checkout the instructions on the
|
{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
|
||||||
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your enviromnent.
|
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
SKLEARN_IMPORT_ERROR = """
|
SKLEARN_IMPORT_ERROR = """
|
||||||
{0} requires the scikit-learn library but it was not found in your enviromnent. You can install it with:
|
{0} requires the scikit-learn library but it was not found in your environment. You can install it with:
|
||||||
```
|
```
|
||||||
pip install -U scikit-learn
|
pip install -U scikit-learn
|
||||||
```
|
```
|
||||||
@ -357,15 +357,15 @@ In a notebook or a colab, you can install it by executing a cell with
|
|||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
TENSORFLOW_IMPORT_ERROR = """
|
TENSORFLOW_IMPORT_ERROR = """
|
||||||
{0} requires the TensorFlow library but it was not found in your enviromnent. Checkout the instructions on the
|
{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
|
||||||
installation page: https://www.tensorflow.org/install and follow the ones that match your enviromnent.
|
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
# docstyle-ignore
|
# docstyle-ignore
|
||||||
FLAX_IMPORT_ERROR = """
|
FLAX_IMPORT_ERROR = """
|
||||||
{0} requires the FLAX library but it was not found in your enviromnent. Checkout the instructions on the
|
{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
|
||||||
installation page: https://github.com/google/flax and follow the ones that match your enviromnent.
|
installation page: https://github.com/google/flax and follow the ones that match your environment.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -918,13 +918,13 @@ def cached_path(
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
|
||||||
force_download: if True, re-dowload the file even if it's already cached in the cache dir.
|
force_download: if True, re-download the file even if it's already cached in the cache dir.
|
||||||
resume_download: if True, resume the download if incompletly recieved file is found.
|
resume_download: if True, resume the download if an incompletely received file is found.
|
||||||
user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
|
user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
|
||||||
extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed
|
extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed
|
||||||
file in a folder along the archive.
|
file in a folder along the archive.
|
||||||
force_extract: if True when extract_compressed_file is True and the archive was already extracted,
|
force_extract: if True when extract_compressed_file is True and the archive was already extracted,
|
||||||
re-extract the archive and overide the folder where it was extracted.
|
re-extract the archive and override the folder where it was extracted.
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
|
None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). Local path (string)
|
||||||
|
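A hedged sketch of how the `cached_path` arguments documented above fit together; the URL is a placeholder and the import path assumes this version's `file_utils` module.

```python
from transformers.file_utils import cached_path

local_path = cached_path(
    "https://example.com/archive.tar.gz",  # placeholder URL
    cache_dir=None,                # fall back to the default cache directory
    force_download=False,          # True would re-download even if already cached
    resume_download=True,          # resume an incompletely received file
    extract_compressed_file=True,  # unpack zip/tar archives into a sibling folder
    force_extract=False,           # True would re-extract and override that folder
)
print(local_path)
```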
@ -25,14 +25,14 @@ logger = logging.get_logger(__name__)
|
|||||||
|
|
||||||
class TFGenerationMixin:
|
class TFGenerationMixin:
|
||||||
"""
|
"""
|
||||||
A class contraining all of the functions supporting generation, to be used as a mixin in
|
A class containing all of the functions supporting generation, to be used as a mixin in
|
||||||
:class:`~transfomers.TFPreTrainedModel`.
|
:class:`~transformers.TFPreTrainedModel`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def prepare_inputs_for_generation(self, inputs, **kwargs):
|
def prepare_inputs_for_generation(self, inputs, **kwargs):
|
||||||
"""
|
"""
|
||||||
Implement in subclasses of :class:`~transfomers.TFPreTrainedModel` for custom behavior to prepare inputs in the
|
Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
|
||||||
generate method.
|
the generate method.
|
||||||
"""
|
"""
|
||||||
return {"inputs": inputs}
|
return {"inputs": inputs}
|
||||||
|
|
||||||
@@ -216,17 +216,17 @@ class TFGenerationMixin:
 )

 if input_ids is not None:
-batch_size = shape_list(input_ids)[0] # overriden by the input batch_size
+batch_size = shape_list(input_ids)[0] # overridden by the input batch_size
 else:
 batch_size = 1

-assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
 assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
 assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
 assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
 assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
 assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
-assert temperature > 0, "`temperature` should be strictely positive."
+assert temperature > 0, "`temperature` should be strictly positive."
 assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
 assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
 assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."

@@ -239,10 +239,10 @@ class TFGenerationMixin:
 assert (eos_token_id is None) or (
 isinstance(eos_token_id, int) and (eos_token_id >= 0)
 ), "`eos_token_id` should be a positive integer."
-assert length_penalty > 0, "`length_penalty` should be strictely positive."
+assert length_penalty > 0, "`length_penalty` should be strictly positive."
 assert (
 isinstance(num_return_sequences, int) and num_return_sequences > 0
-), "`num_return_sequences` should be a strictely positive integer."
+), "`num_return_sequences` should be a strictly positive integer."
 assert (
 bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
 ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"

@@ -722,7 +722,7 @@ class TFGenerationMixin:
 beam_scores[:, None], (batch_size * num_beams, vocab_size)
 ) # (batch_size * num_beams, vocab_size)

-# re-organize to group the beam together (we are keeping top hypothesis accross beams)
+# re-organize to group the beam together (we are keeping top hypothesis across beams)
 next_scores = tf.reshape(
 next_scores, (batch_size, num_beams * vocab_size)
 ) # (batch_size, num_beams * vocab_size)

@@ -897,7 +897,7 @@ class TFGenerationMixin:

 def adjust_logits_during_generation(self, logits, **kwargs):
 """
-Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
 the generate method.
 """
 return logits

@@ -978,7 +978,7 @@ def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids):

 def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1):
 """
-Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

 Args:
 logits: logits distribution shape (batch size, vocabulary size)
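A short usage illustration of the filtering described in this docstring; the top-level import is assumed to be available in the installed version::

    import tensorflow as tf
    from transformers import tf_top_k_top_p_filtering  # import location assumed

    logits = tf.constant([[4.0, 3.0, 2.0, 1.0, 0.0]])
    # Keep only the 2 highest-scoring tokens; every other logit is set to -inf,
    # so it receives zero probability after the softmax.
    filtered = tf_top_k_top_p_filtering(logits, top_k=2)
    probs = tf.nn.softmax(filtered, axis=-1)
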
@@ -1047,7 +1047,7 @@ def set_tensor_by_indices_to_value(tensor, indices, value):

 def sample_without_replacement(logits, num_samples):
 """
-categorical sampling witouth replacement is currently not implemented the gumbel-max trick will do for now see
+categorical sampling without replacement is currently not implemented the gumbel-max trick will do for now see
 https://github.com/tensorflow/tensorflow/issues/9260 for more info
 """
 z = -tf.math.log(tf.random.uniform(shape_list(logits), 0, 1))

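For readers unfamiliar with the Gumbel-max trick mentioned in this docstring, a standalone sketch of the Gumbel-top-k variant looks roughly like this; it is an illustration of the idea, not the library function itself::

    import tensorflow as tf

    def gumbel_top_k(logits, num_samples):
        # Perturb the logits with Gumbel(0, 1) noise; the indices of the
        # num_samples largest perturbed logits behave like categorical
        # samples drawn without replacement.
        uniform = tf.random.uniform(tf.shape(logits), minval=0, maxval=1)
        gumbel_noise = -tf.math.log(-tf.math.log(uniform))
        _, indices = tf.nn.top_k(logits + gumbel_noise, k=num_samples)
        return indices
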
@@ -29,20 +29,20 @@ logger = logging.get_logger(__name__)

 class GenerationMixin:
 """
-A class contraining all of the functions supporting generation, to be used as a mixin in
-:class:`~transfomers.PreTrainedModel`.
+A class containing all of the functions supporting generation, to be used as a mixin in
+:class:`~transformers.PreTrainedModel`.
 """

 def prepare_inputs_for_generation(self, input_ids, **kwargs):
 """
-Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to prepare inputs in the
+Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
 generate method.
 """
 return {"input_ids": input_ids}

 def adjust_logits_during_generation(self, logits, **kwargs):
 """
-Implement in subclasses of :class:`~transfomers.PreTrainedModel` for custom behavior to adjust the logits in
+Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
 the generate method.
 """
 return logits

@@ -285,7 +285,7 @@ class GenerationMixin:
 )

 if input_ids is not None:
-batch_size = input_ids.shape[0] # overriden by the input batch_size
+batch_size = input_ids.shape[0] # overridden by the input batch_size
 else:
 batch_size = 1

@@ -533,7 +533,7 @@ class GenerationMixin:
 ):
 """
 Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated
-independantly.
+independently.
 """
 # length of generated sentences / unfinished sentences
 unfinished_sents = input_ids.new(batch_size).fill_(1)

@@ -600,7 +600,7 @@ class GenerationMixin:
 # unfinished_sents is set to zero if eos in sentence
 unfinished_sents.mul_((~eos_in_sents).long())

-# stop when there is a </s> in each sentence, or if we exceed the maximul length
+# stop when there is a </s> in each sentence, or if we exceed the maximum length
 if unfinished_sents.max() == 0:
 break

@@ -724,7 +724,7 @@ class GenerationMixin:
 else:
 next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)

-# re-organize to group the beam together (we are keeping top hypothesis accross beams)
+# re-organize to group the beam together (we are keeping top hypothesis across beams)
 next_scores = next_scores.view(
 batch_size, num_beams * vocab_size
 ) # (batch_size, num_beams * vocab_size)

@@ -969,7 +969,7 @@ def top_k_top_p_filtering(
 min_tokens_to_keep: int = 1,
 ) -> Tensor:
 """
-Filter a distribution of logits using top-k and/or nucleus (top-p) filterin
+Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

 Args:
 logits: logits distribution shape (batch size, vocabulary size)

@@ -49,7 +49,7 @@ class ModelCard:
 """

 def __init__(self, **kwargs):
-# Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+# Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
 self.model_details = kwargs.pop("model_details", {})
 self.intended_use = kwargs.pop("intended_use", {})
 self.factors = kwargs.pop("factors", {})

@@ -488,7 +488,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
 model_args (additional positional arguments, `optional`):
 Will be passed along to the underlying model ``__init__()`` method.
 config (:class:`~transformers.PretrainedConfig`, `optional`):
-Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+Configuration for the model to use instead of an automatically loaded configuration. Configuration can
 be automatically loaded when:

 - The model is a model provided by the library (loaded with the `shortcut name` string of a

@@ -522,7 +522,7 @@ AUTO_MODEL_PRETRAINED_DOCSTRING = r"""
 output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
 local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
-Whether or not to only look at local files (e.g., not try doanloading the model).
+Whether or not to only look at local files (e.g., not try downloading the model).
 use_cdn(:obj:`bool`, `optional`, defaults to :obj:`True`):
 Whether or not to use Cloudfront (a Content Delivery Network, or CDN) when searching for the model on
 our S3 (faster). Should be set to :obj:`False` for checkpoints larger than 20GB.

@@ -1424,7 +1424,7 @@ class AutoModelForTokenClassification:
 class AutoModelForMultipleChoice:
 r"""
 This is a generic model class that will be instantiated as one of the model classes of the library---with a
-multiple choice classifcation head---when created with the when created with the
+multiple choice classification head---when created with the when created with the
 :meth:`~transformers.AutoModelForMultipleChoice.from_pretrained` class method or the
 :meth:`~transformers.AutoModelForMultipleChoice.from_config` class method.

@@ -906,7 +906,7 @@ class BartModel(PretrainedBartModel):
 output_hidden_states=output_hidden_states,
 return_dict=return_dict,
 )
-# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False
+# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
 elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
 encoder_outputs = BaseModelOutput(
 last_hidden_state=encoder_outputs[0],

@@ -69,8 +69,8 @@ class XSoftmax(torch.autograd.Function):

 Args:
 input (:obj:`torch.tensor`): The input tensor that will apply softmax.
-mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax caculation.
-dim (int): The dimenssion that will apply softmax
+mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
+dim (int): The dimension that will apply softmax

 Example::
 import torch

@@ -540,16 +540,16 @@ class DisentangledSelfAttention(torch.nn.Module):

 Args:
 hidden_states (:obj:`torch.FloatTensor`):
-Input states to the module usally the output from previous layer, it will be the Q,K and V in
+Input states to the module usually the output from previous layer, it will be the Q,K and V in
 `Attention(Q,K,V)`

 attention_mask (:obj:`torch.ByteTensor`):
-An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maxium
+An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum
 sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j`
 th token.

 return_att (:obj:`bool`, optional):
-Whether return the attention maxitrix.
+Whether return the attention matrix.

 query_states (:obj:`torch.FloatTensor`, optional):
 The `Q` state in `Attention(Q,K,V)`.
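To make the mask convention in this docstring concrete, a [`B`, `N`, `N`] matrix with that meaning can be built from an ordinary per-token padding mask; the tensors below are toy values, not library code::

    import torch

    padding_mask = torch.tensor([[1, 1, 1, 0],
                                 [1, 1, 0, 0]])  # (B, N), 1 = real token
    attention_mask = padding_mask.unsqueeze(1) * padding_mask.unsqueeze(2)  # (B, N, N)
    # attention_mask[b, i, j] == 1 exactly when tokens i and j are both real,
    # i.e. the i-th token is allowed to attend to the j-th token.
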
@@ -627,7 +627,7 @@ class DisentangledSelfAttention(torch.nn.Module):
 relative_pos = relative_pos.unsqueeze(1)
 # bxhxqxk
 elif relative_pos.dim() != 4:
-raise ValueError(f"Relative postion ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")
+raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. {relative_pos.dim()}")

 att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions)
 relative_pos = relative_pos.long().to(query_layer.device)

@@ -772,7 +772,7 @@ DEBERTA_START_DOCSTRING = r"""
 The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
 <https://arxiv.org/abs/2006.03654>`_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of
 BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
-improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-trianing data.
+improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pre-training data.

 This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
 subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to

@@ -290,7 +290,7 @@ class Transformer(nn.Module):
 attn_mask: torch.tensor(bs, seq_length) Attention mask on the sequence.

 Returns:
-hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hiddens states in the last (top)
+hidden_state: torch.tensor(bs, seq_length, dim) Sequence of hidden states in the last (top)
 layer all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
 Tuple of length n_layers with the hidden states from each layer.
 Optional: only if output_hidden_states=True

@@ -418,7 +418,7 @@ DPR_READER_INPUTS_DOCSTRING = r"""
 Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
 tensors for more detail.
 output_hidden_states (:obj:`bool`, `optional`):
-Whether or not to rturn the hidden states of all layers. See ``hidden_states`` under returned tensors for
+Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
 more detail.
 return_dict (:obj:`bool`, `optional`):
 Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.

@@ -30,7 +30,7 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "EncoderDecoderConfig"

 ENCODER_DECODER_START_DOCSTRING = r"""
-This class can be used to inialize a sequence-to-sequnece model with any pretrained autoencoding model as the
+This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
 encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
 :meth:`~transformers.AutoModel.from_pretrained` function and the decoder is loaded via
 :meth:`~transformers.AutoModelForCausalLM.from_pretrained` function. Cross-attention layers are automatically added

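The pattern this docstring describes is typically exercised along the following lines; the checkpoint names are placeholders::

    from transformers import EncoderDecoderModel

    # Build a seq2seq model from a pretrained encoder and a pretrained decoder;
    # cross-attention layers are added to the decoder automatically.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased"
    )
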
@@ -99,7 +99,7 @@ FLAUBERT_INPUTS_DOCSTRING = r"""
 `What are position IDs? <../glossary.html#position-ids>`_
 lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
 Length of each sentence that can be used to avoid performing attention on padding token indices. You can
-also use :obj:`attention_mask` for the same result (see above), kept here for compatbility. Indices
+also use :obj:`attention_mask` for the same result (see above), kept here for compatibility. Indices
 selected in ``[0, ..., input_ids.size(-1)]``:
 cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`):
 Dictionary strings to ``torch.FloatTensor`` that contains precomputed hidden-states (key and values in the

@@ -124,18 +124,18 @@ class FlaxAutoModel(object):
 All remaining positional arguments will be passed to the underlying model's ``__init__`` method

 config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
-Configuration for the model to use instead of an automatically loaded configuation. Configuration can
+Configuration for the model to use instead of an automatically loaded configuration. Configuration can
 be automatically loaded when:

 - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a
 pretrained model), or
 - the model was saved using :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and is reloaded
-by suppling the save directory.
-- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a
+by supplying the save directory.
+- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
 configuration JSON file named `config.json` is found in the directory.

 state_dict: (`optional`) dict:
-an optional state dictionnary for the model to use instead of a state dictionary loaded from saved
+an optional state dictionary for the model to use instead of a state dictionary loaded from saved
 weights file. This option can be used if you want to create a model from a pretrained configuration but
 load your own weights. In this case though, you should check if using
 :func:`~transformers.FlaxPreTrainedModel.save_pretrained` and

@@ -150,14 +150,14 @@ class FlaxAutoModel(object):
 they exists.

 resume_download: (`optional`) boolean, default False:
-Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
+Do not delete incompletely received file. Attempt to resume the download if such a file exists.

 proxies: (`optional`) dict, default None:
 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.

 output_loading_info: (`optional`) boolean:
-Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error
+Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error
 messages.

 kwargs: (`optional`) Remaining dictionary of keyword arguments:

@@ -64,7 +64,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
 Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

 - 1 for tokens that are **not masked**,
-- 0 for tokens that are **maked**.
+- 0 for tokens that are **masked**.

 `What are attention masks? <../glossary.html#attention-mask>`__
 token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):

@@ -226,7 +226,7 @@ class FunnelAttentionStructure(nn.Module):
 d_model = self.config.d_model
 if self.config.attention_type == "factorized":
 # Notations from the paper, appending A.2.2, final formula.
-# We need to create and return the matrics phi, psi, pi and omega.
+# We need to create and return the matrices phi, psi, pi and omega.
 pos_seq = torch.arange(0, seq_len, 1.0, dtype=dtype, device=device)
 freq_seq = torch.arange(0, d_model // 2, 1.0, dtype=dtype, device=device)
 inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))

@@ -1226,7 +1226,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):

 @add_start_docstrings(
 """
-Funnel Transfprmer Model with a sequence classification/regression head on top (two linear layer on top of the
+Funnel Transformer Model with a sequence classification/regression head on top (two linear layer on top of the
 first timestep of the last hidden state) e.g. for GLUE tasks.
 """,
 FUNNEL_START_DOCSTRING,

@@ -588,7 +588,7 @@ class GPT2Model(GPT2PreTrainedModel):
 attention_mask = (1.0 - attention_mask) * -10000.0

 # If a 2D ou 3D attention mask is provided for the cross-attention
-# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
 if self.config.add_cross_attention and encoder_hidden_states is not None:
 encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
 encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

@@ -708,7 +708,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 position_ids = kwargs.get("position_ids", None)

 if attention_mask is not None and position_ids is None:
-# create postion_ids on the fly for batch generation
+# create position_ids on the fly for batch generation
 position_ids = attention_mask.long().cumsum(-1) - 1
 position_ids.masked_fill_(attention_mask == 0, 1)
 if past:
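The two `position_ids` lines in this hunk are easy to sanity-check on a toy batch; the fill value 1 for padding positions is arbitrary, since those positions are masked out anyway::

    import torch

    attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                                   [1, 1, 1, 1, 1]])
    position_ids = attention_mask.long().cumsum(-1) - 1
    position_ids.masked_fill_(attention_mask == 0, 1)
    # tensor([[1, 1, 0, 1, 2],
    #         [0, 1, 2, 3, 4]])
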
@@ -1050,7 +1050,7 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel):
 sequence_lengths = -1
 logger.warning(
 f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
-f"unexpected if using padding tokens in conjuction with `inputs_embeds.`"
+f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
 )

 pooled_logits = logits[range(batch_size), sequence_lengths]

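The pooling in this hunk selects the logits of the last non-padding token of each sequence; a standalone illustration with made-up shapes and a made-up pad token id::

    import torch

    pad_token_id = 0  # assumption for the example
    input_ids = torch.tensor([[5, 7, 9, 0, 0],
                              [3, 4, 6, 8, 2]])
    logits = torch.randn(2, 5, 3)  # (batch_size, seq_len, num_labels)
    sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1  # tensor([2, 4])
    pooled_logits = logits[range(input_ids.shape[0]), sequence_lengths]  # (batch_size, num_labels)
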
@@ -382,7 +382,7 @@ class LongformerSelfAttention(nn.Module):
 # batch_size x num_heads x max_num_global_attention_tokens x sequence_length
 # which is the attention weights from tokens with global attention to all tokens
 # It doesn't not return local attention
-# In case of variable number of global attantion in the rows of a batch,
+# In case of variable number of global attention in the rows of a batch,
 # attn_probs are padded with -10000.0 attention scores
 attn_probs = attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
 else:

@@ -416,7 +416,7 @@ class LongformerSelfAttention(nn.Module):
 -0.7584, 0.4206, -0.0405, 0.1599,
 2.0514, -1.1600, 0.5372, 0.2629 ]
 window_overlap = num_rows = 4
-(pad & diagonilize) =>
+(pad & diagonalize) =>
 [ 0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000
 0.0000, -1.8348, 0.7672, 0.2986, 0.0285, 0.0000, 0.0000
 0.0000, 0.0000, -0.7584, 0.4206, -0.0405, 0.1599, 0.0000

@@ -440,7 +440,7 @@ class LongformerSelfAttention(nn.Module):

 @staticmethod
 def _chunk(hidden_states, window_overlap):
-"""convert into overlapping chunkings. Chunk size = 2w, overlap size = w"""
+"""convert into overlapping chunks. Chunk size = 2w, overlap size = w"""

 # non-overlapping chunks of size = 2w
 hidden_states = hidden_states.view(
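The overlapping layout described by the corrected docstring (chunks of size 2w that overlap by w) can be reproduced in isolation, e.g. with `Tensor.unfold`; this is only an illustration of the layout, not the library's implementation::

    import torch

    def overlapping_chunks(hidden_states, window_overlap):
        # (batch, seq_len, dim) -> (batch, n_chunks, 2 * w, dim); seq_len is
        # assumed to be a multiple of window_overlap.
        w = window_overlap
        return hidden_states.unfold(1, 2 * w, w).transpose(-1, -2)

    x = torch.arange(2 * 8 * 3, dtype=torch.float).view(2, 8, 3)
    print(overlapping_chunks(x, 2).shape)  # torch.Size([2, 3, 4, 3])
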
@@ -491,7 +491,7 @@ class LongformerSelfAttention(nn.Module):
 chunked_query = self._chunk(query, window_overlap)
 chunked_key = self._chunk(key, window_overlap)

-# matrix multipication
+# matrix multiplication
 # bcxd: batch_size * num_heads x chunks x 2window_overlap x head_dim
 # bcyd: batch_size * num_heads x chunks x 2window_overlap x head_dim
 # bcxy: batch_size * num_heads x chunks x 2window_overlap x window_overlap

@@ -1030,7 +1030,7 @@ LONGFORMER_INPUTS_DOCSTRING = r"""

 `What are attention masks? <../glossary.html#attention-mask>`__
 global_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
-Mask to decide the attention given on each token, local attention or global attenion. Tokens with global
+Mask to decide the attention given on each token, local attention or global attention. Tokens with global
 attention attends to all other tokens, and all other tokens attend to them. This is important for
 task-specific finetuning because it makes the model more flexible at representing the task. For example,
 for classification, the <s> token should be given global attention. For QA, all question tokens should also

@@ -58,7 +58,7 @@ class GeLU(nn.Module):
 @dataclass
 class LxmertModelOutput(ModelOutput):
 """
-Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilites for the language,
+Lxmert's outputs that contain the last hidden states, pooled outputs, and attention probabilities for the language,
 visual, and, cross-modality encoders. (note: the visual encoder in Lxmert is referred to as the "relation-ship"
 encoder")

@@ -405,7 +405,7 @@ class LxmertSelfAttentionLayer(nn.Module):
 self.output = LxmertAttentionOutput(config)

 def forward(self, input_tensor, attention_mask, output_attentions=False):
-# Self attention attends to itself, thus keys and querys are the same (input_tensor).
+# Self attention attends to itself, thus keys and queries are the same (input_tensor).
 output = self.self(
 input_tensor,
 input_tensor,

@@ -799,7 +799,7 @@ LXMERT_START_DOCSTRING = r"""
 <https://arxiv.org/abs/1908.07490>`__ by Hao Tan and Mohit Bansal. It's a vision and language transformer model,
 pretrained on a variety of multi-modal datasets comprising of GQA, VQAv2.0, MCSCOCO captions, and Visual genome,
 using a combination of masked language modeling, region of interest feature regression, cross entropy loss for
-question answering attribute prediction, and object tag predicition.
+question answering attribute prediction, and object tag prediction.

 This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
 methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,

@@ -1076,12 +1076,10 @@ class LxmertForPreTraining(LxmertPreTrainedModel):
 will add newly initialized weights. Reducing the size will remove weights from the end

 Args:
-cur_qa_logit_layer (:obj:`torch.nn.Linear`):
-Old linear layer to be resized.
 num_labels (:obj:`int`, `optional`):
 New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
 weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
-just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
 anything.

 Return:

@@ -1298,12 +1296,10 @@ class LxmertForQuestionAnswering(LxmertPreTrainedModel):
 will add newly initialized weights. Reducing the size will remove weights from the end

 Args:
-cur_qa_logit_layer (:obj:`torch.nn.Linear`):
-Old linear layer to be resized.
 num_labels (:obj:`int`, `optional`):
 New number of labels in the linear layer weight matrix. Increasing the size will add newly initialized
 weights at the end. Reducing the size will remove weights from the end. If not provided or :obj:`None`,
-just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model wihtout doing
+just returns a pointer to the qa labels :obj:`torch.nn.Linear`` module of the model without doing
 anything.

 Return:

@@ -887,7 +887,7 @@ class MobileBertModel(MobileBertPreTrainedModel):
 )

 # If a 2D ou 3D attention mask is provided for the cross-attention
-# we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
+# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
 if self.config.is_decoder and encoder_hidden_states is not None:
 encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
 encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

@@ -40,7 +40,7 @@ class RetrievAugLMMarginOutput(ModelOutput):

 Args:
 loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
-Languaged modeling loss.
+Language modeling loss.
 logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
 Prediction scores of the language modeling head. The score is possibly marginalized over all documents for
 each vocabulary token.

@@ -413,7 +413,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""

 Used by the (:class:`~transformers.RagModel`) model during decoding.
 decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
-Provide for generation tasks. `None` by default, constuct as per instructions for the generator model
+Provide for generation tasks. `None` by default, construct as per instructions for the generator model
 you're using with your RAG instance.
 decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
 Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will

@@ -424,7 +424,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
 :obj:`past_key_values` are used in the (:class:`~transformers.RagTokenForGeneration`) model during
 decoding.
 doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
-Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
 :obj:`question_encoder_last_hidden_state`. If the model has is not initialized with a ``retriever``
 :obj:`doc_scores` has to be provided to the forward pass. :obj:`doc_scores` can be computed via
 :obj:`question_encoder_last_hidden_state` and :obj:`retrieved_doc_embeds`, see examples for more

@@ -660,7 +660,7 @@ class RagModel(RagPreTrainedModel):

 @add_start_docstrings_to_model_forward(
 """
-A RAG-sequence model impementation. It performs RAG-sequence specific marginalization in the forward pass.
+A RAG-sequence model implementation. It performs RAG-sequence specific marginalization in the forward pass.
 """,
 RAG_START_DOCSTRING,
 )

@@ -736,7 +736,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
 >>> input_ids = input_dict["input_ids"]
 >>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])

->>> # or use retriever seperately
+>>> # or use retriever separately
 >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", use_dummy_dataset=True)
 >>> # 1. Encode
 >>> question_hidden_states = model.question_encoder(input_ids)[0]

@@ -940,13 +940,13 @@ class RagSequenceForGeneration(RagPreTrainedModel):
 ) # batch_size x n_docs x tgt_len x dim
 doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1)

-# RAG-sequence marginaliation
+# RAG-sequence marginalization
 first_token_scores = seq_logprobs[:, :, :1, :]
 second_token_scores = seq_logprobs[:, :, 1:2, :]
 remainder = seq_logprobs[:, :, 2:, :]
 rag_logprobs = torch.cat([first_token_scores, second_token_scores + doc_logprobs, remainder], dim=2)

-# calcualate loss
+# calculate loss
 target = target.unsqueeze(1).unsqueeze(-1).repeat(1, n_docs, 1, 1)
 assert target.dim() == rag_logprobs.dim()
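In formula form (following the RAG paper), the marginalization implemented in this hunk weights the per-document generator probabilities by the retrieval scores and sums over the retrieved documents:

    p_{\text{RAG-sequence}}(y \mid x) \;\approx\; \sum_{z \in \text{top-}k(p_\eta(\cdot \mid x))} p_\eta(z \mid x) \prod_{i} p_\theta(y_i \mid x, z, y_{1:i-1})
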
@@ -986,7 +986,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):

 @add_start_docstrings_to_model_forward(
 """
-A RAG-token model impementation. It performs RAG-token specific marginalization in the forward pass.
+A RAG-token model implementation. It performs RAG-token specific marginalization in the forward pass.
 """,
 RAG_START_DOCSTRING,
 )

@@ -1129,7 +1129,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
 >>> input_ids = input_dict["input_ids"]
 >>> outputs = model(input_ids=input_ids, labels=input_dict["labels"])

->>> # or use retriever seperately
+>>> # or use retriever separately
 >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
 >>> # 1. Encode
 >>> question_hidden_states = model.question_encoder(input_ids)[0]

@@ -1257,7 +1257,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
 to the forward pass. :obj:`context_input_ids` are returned by
 :meth:`~transformers.RagRetriever.__call__`.
 doc_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.n_docs)`):
-Score between each retrieved document embeddigs (see :obj:`retrieved_doc_embeds`) and
+Score between each retrieved document embeddings (see :obj:`retrieved_doc_embeds`) and
 :obj:`question_encoder_last_hidden_state`.

 If the model has is not initialized with a ``retriever``, :obj:`context_input_ids` has to be provided

@@ -986,7 +986,7 @@ class LSHSelfAttention(nn.Module, EfficientAttentionMixin):
 class ReverseSort(Function):
 """
 After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized
-backward function is used for Reformer, the gradients of the output vectors have to be explicitely sorted here.
+backward function is used for Reformer, the gradients of the output vectors have to be explicitly sorted here.
 """

 @staticmethod

@@ -2075,7 +2075,7 @@ class ReformerModel(ReformerPreTrainedModel):
 device=device,
 )

-# start index for postion encoding depends on incremental decoding
+# start index for position encoding depends on incremental decoding
 if past_buckets_states is not None:
 start_idx_pos_encodings = past_buckets_states[0][1].shape[1]
 else:

@@ -79,7 +79,7 @@ RETRIBERT_START_DOCSTRING = r"""


 @add_start_docstrings(
-"""Bert Based model to embed queries or document for document retreival. """,
+"""Bert Based model to embed queries or document for document retrieval. """,
 RETRIBERT_START_DOCSTRING,
 )
 class RetriBertModel(RetriBertPreTrainedModel):

@@ -117,7 +117,7 @@ class RetriBertModel(RetriBertPreTrainedModel):
 attention_mask, input_shape, device
 )

-# define function for cehckpointing
+# define function for checkpointing
 def partial_encode(*inputs):
 encoder_outputs = sent_encoder.encoder(
 inputs[0],

@@ -200,7 +200,7 @@ class RetriBertModel(RetriBertPreTrainedModel):

 Return:
 :obj:`torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
-its corresponding document and each cocument to its corresponding query in the batch
+its corresponding document and each document to its corresponding query in the batch
 """
 device = input_ids_query.device
 q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)