Mirror of https://github.com/huggingface/transformers.git, synced 2025-08-03 03:31:05 +06:00
Remove bert fast dependency from electra (#19520)
* Replaced ElectraTokenizerFast with BertTokenizer class
* Fixed styling issue

Co-authored-by: vishwaspai <vishwas.pai@emplay.net>
This commit is contained in:
parent 2720d5fc18
commit 72153ba611
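The diff below makes `ElectraTokenizerFast` a standalone subclass of `PreTrainedTokenizerFast` instead of a thin wrapper around `BertTokenizerFast`. The public behaviour is meant to stay the same; a minimal usage sketch (assuming the `google/electra-small-generator` checkpoint can be downloaded from the Hub):

```python
from transformers import ElectraTokenizerFast

# Loading and encoding are intended to work exactly as before the refactor.
tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-generator")

encoding = tokenizer("How are you?", "Fine, thanks.")
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))  # [CLS] ... [SEP] ... [SEP]
print(encoding["token_type_ids"])  # 0s for the first sentence, 1s for the second
```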
src/transformers/models/electra/tokenization_electra_fast.py
@@ -13,7 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..bert.tokenization_bert_fast import BertTokenizerFast
+import json
+from typing import List, Optional, Tuple
+
+from tokenizers import normalizers
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from .tokenization_electra import ElectraTokenizer
 
 
@@ -69,7 +74,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "google/electra-large-discriminator": 512,
 }
 
-
 PRETRAINED_INIT_CONFIGURATION = {
     "google/electra-small-generator": {"do_lower_case": True},
     "google/electra-base-generator": {"do_lower_case": True},
@@ -80,17 +84,148 @@ PRETRAINED_INIT_CONFIGURATION = {
 }
 
 
-class ElectraTokenizerFast(BertTokenizerFast):
+# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->Electra , BERT->ELECTRA
+class ElectraTokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library).
+    Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-    [`ElectraTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation
-    splitting and wordpiece.
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
-    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original ELECTRA).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
     """
+
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     slow_tokenizer_class = ElectraTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A ELECTRA sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ELECTRA
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
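For reference, here is a standalone sketch of what `build_inputs_with_special_tokens` and `create_token_type_ids_from_sequences` compute; the ids below are placeholders, not real vocabulary entries:

```python
# Hypothetical ids, for illustration only.
cls_id, sep_id = 101, 102         # stand-ins for [CLS] and [SEP]
token_ids_0 = [7592, 1010]        # stand-ins for segment A
token_ids_1 = [2129, 2024, 2017]  # stand-ins for segment B

# build_inputs_with_special_tokens: [CLS] A [SEP] B [SEP]
input_ids = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]

# create_token_type_ids_from_sequences: 0s over "[CLS] A [SEP]", 1s over "B [SEP]"
token_type_ids = [0] * (len(token_ids_0) + 2) + [1] * (len(token_ids_1) + 1)

assert len(input_ids) == len(token_type_ids)  # the mask always aligns with the input ids
```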
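The `__init__` above also rebuilds the backend normalizer whenever the requested `do_lower_case` / `strip_accents` / `tokenize_chinese_chars` options differ from the ones serialized in the tokenizer file. A rough sketch of that mechanism using the standalone `tokenizers` library (the toy tokenizer and option values are assumptions for illustration):

```python
import json

from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece

# Toy backend tokenizer whose stored normalizer lowercases input.
backend = Tokenizer(WordPiece(unk_token="[UNK]"))
backend.normalizer = normalizers.BertNormalizer(lowercase=True)

# Suppose the caller now asks for do_lower_case=False: read the serialized state,
# patch the relevant field, and rebuild a normalizer of the same type.
state = json.loads(backend.normalizer.__getstate__())
state["lowercase"] = False
normalizer_class = getattr(normalizers, state.pop("type"))
backend.normalizer = normalizer_class(**state)
```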