Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-24 23:08:57 +06:00)
refactor create_token_type_ids_from_sequences (#37681)
Some checks are pending
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Waiting to run
Build documentation / build (push) Waiting to run
Slow tests on important models (on Push - A10) / Get all modified files (push) Waiting to run
Slow tests on important models (on Push - A10) / Slow & FA2 tests (push) Blocked by required conditions
Self-hosted runner (push-caller) / Check if setup was changed (push) Waiting to run
Self-hosted runner (push-caller) / build-docker-containers (push) Blocked by required conditions
Self-hosted runner (push-caller) / Trigger Push CI (push) Blocked by required conditions
Secret Leaks / trufflehog (push) Waiting to run
Update Transformers metadata / build_and_package (push) Waiting to run
* rm build_input.. from old file
* refactor create_token_type_ids_from_sequences
* handle when cls_token_id is None
* updated fix
* markuplm
* refactoring rest of models
* copies
* revert funnel
* rm incorrect file
* ruff
* ruff
This commit is contained in:
parent 85f060e9b0
commit 324cc77dc3
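Context for the diff below: every hunk removes a per-model copy of `create_token_type_ids_from_sequences`, so the behaviour presumably moves into a single shared implementation on the tokenizer base class. That base-class version is not part of the excerpt shown here; the following is only a minimal sketch, assuming it mirrors the removed BERT-style copies and the "handle when cls_token_id is None" item from the commit message (class and attribute names are illustrative, not the real API):

```python
from typing import List, Optional


class _SketchTokenizerBase:
    """Hypothetical stand-in for the real base class; names are illustrative only."""

    sep_token_id: Optional[int] = None
    cls_token_id: Optional[int] = None

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        # Tokenizers without CLS/SEP special tokens contribute nothing to the mask
        # (the "cls_token_id is None" case mentioned in the commit message).
        sep = [self.sep_token_id] if self.sep_token_id is not None else []
        cls = [self.cls_token_id] if self.cls_token_id is not None else []
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
```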
@@ -299,36 +299,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
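For concreteness, a small worked example of the mask this removed BERT/ALBERT-style implementation produces; the token ids below are made up for illustration:

```python
# Illustrative values only: 2 and 3 stand in for cls_token_id and sep_token_id.
cls, sep = [2], [3]
token_ids_0 = [10, 11, 12]          # first sequence: 3 tokens
token_ids_1 = [20, 21]              # second sequence: 2 tokens

mask = len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# [CLS] A A A [SEP] -> five 0s; B B [SEP] -> three 1s
assert mask == [0, 0, 0, 0, 0, 1, 1, 1]
```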
@@ -155,36 +155,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(
@@ -236,35 +236,6 @@ class BertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
@@ -15,7 +15,7 @@
 """Fast Tokenization classes for Bert."""
 
 import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 from tokenizers import normalizers
@@ -138,35 +138,6 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
 
         return output
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
@@ -309,36 +309,6 @@ class BertJapaneseTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if os.path.isdir(save_directory):
             if self.subword_tokenizer_type == "sentencepiece":
@@ -299,28 +299,5 @@ class BigBirdTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"; in this file the docstring diagram is collapsed onto two lines]
 
 
 __all__ = ["BigBirdTokenizer"]
@@ -175,36 +175,6 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above; the docstring here still says "An ALBERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(
@@ -281,36 +281,6 @@ class BioGptTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
         return [1] + ([0] * len(token_ids_0))
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring as the AlbertTokenizer method above, describing "A FAIRSEQ Transformer sequence pair mask"]
-        sep = [self.sep_token_id]
-
-        # no bos used in fairseq
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0]
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
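The BioGPT copy (and the FSMT one further down) is the FAIRSEQ-style variant: no CLS token is prepended, so segment 0 covers only `token_ids_0 + sep`. A quick illustration with made-up ids:

```python
sep = [42]                          # hypothetical sep_token_id
token_ids_0 = [10, 11]
token_ids_1 = [20, 21, 22]

mask = len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
assert mask == [0, 0, 0, 1, 1, 1, 1]   # A A [SEP] -> 0s, B B B [SEP] -> 1s
```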
@@ -205,37 +205,6 @@ class CanineTokenizer(PreTrainedTokenizer):
             result += ([0] * len(token_ids_1)) + [1]
         return result
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring as the AlbertTokenizer method above, describing "A CANINE sequence pair mask"]
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        result = len(cls + token_ids_0 + sep) * [0]
-        if token_ids_1 is not None:
-            result += len(token_ids_1 + sep) * [1]
-        return result
 
     # CanineTokenizer has no vocab file
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
         return ()
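The CANINE copy builds the mask incrementally instead of in one expression, but the result matches the concatenation form used elsewhere; a short check with illustrative ids:

```python
cls, sep = [1], [2]                 # illustrative special-token ids
token_ids_0, token_ids_1 = [10], [20, 21]

result = len(cls + token_ids_0 + sep) * [0]
if token_ids_1 is not None:
    result += len(token_ids_1 + sep) * [1]

assert result == len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
assert result == [0, 0, 0, 1, 1, 1]
```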
@@ -276,35 +276,6 @@ class CodeGenTokenizer(PreTrainedTokenizer):
         text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring as the AlbertTokenizer method above, describing "A sequence pair mask"]
-        sep = [self.sep_token_id] if self.sep_token_id is not None else []
-        cls = [self.cls_token_id] if self.sep_token_id is not None else []
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
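The CodeGen copy guards against missing special tokens: `sep` and `cls` collapse to empty lists when `sep_token_id` is `None` (note that, in the removed code, both guards test `sep_token_id`). A sketch of the effect with assumed values:

```python
sep_token_id = None                 # CodeGen-style tokenizer without special tokens
cls_token_id = None
token_ids_0 = [5, 6, 7]

sep = [sep_token_id] if sep_token_id is not None else []
cls = [cls_token_id] if sep_token_id is not None else []   # guard mirrors the removed code

mask = len(cls + token_ids_0 + sep) * [0]
assert mask == [0, 0, 0]            # no specials, so the mask is just one 0 per token
```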
@@ -154,36 +154,6 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
 
         return super()._encode_plus(*args, **kwargs)
 
-    # Copied from transformers.models.codegen.tokenization_codegen.CodeGenTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the CodeGenTokenizer method above]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
@@ -239,35 +239,6 @@ class ConvBertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A ConvBERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
@@ -15,7 +15,7 @@
 """Tokenization classes for ConvBERT."""
 
 import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 from tokenizers import normalizers
@@ -139,35 +139,6 @@ class ConvBertTokenizerFast(PreTrainedTokenizerFast):
 
         return output
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A ConvBERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
@@ -298,36 +298,6 @@ class DebertaTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A DeBERTa sequence pair mask"]
 
     # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
     def _tokenize(self, text):
         """Tokenize a string."""
@@ -179,36 +179,6 @@ class DebertaTokenizerFast(PreTrainedTokenizerFast):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A DeBERTa sequence pair mask"]
 
     # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus
     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
         is_split_into_words = kwargs.get("is_split_into_words", False)
@@ -208,33 +208,6 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A DeBERTa sequence pair mask"; note the untyped signature in this file]
 
     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
         add_prefix_space = kwargs.pop("add_prefix_space", False)
         if is_split_into_words or add_prefix_space:
@@ -169,33 +169,6 @@ class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A DeBERTa sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(
@@ -304,35 +304,6 @@ class RealmTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A REALM sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
@@ -15,7 +15,7 @@
 """Fast Tokenization classes for REALM."""
 
 import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 from tokenizers import normalizers
@@ -215,35 +215,6 @@ class RealmTokenizerFast(PreTrainedTokenizerFast):
 
         return output
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A REALM sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
@@ -233,35 +233,6 @@ class RetriBertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
@@ -15,7 +15,7 @@
 """Tokenization classes for RetriBERT."""
 
 import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 from tokenizers import normalizers
@@ -142,35 +142,6 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
 
         return output
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
@@ -247,36 +247,6 @@ class DistilBertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
@@ -15,7 +15,7 @@
 """Tokenization classes for DistilBERT."""
 
 import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 from tokenizers import normalizers
@@ -140,36 +140,6 @@ class DistilBertTokenizerFast(PreTrainedTokenizerFast):
 
         return output
 
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "A BERT sequence pair mask"]
 
     # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
@@ -238,35 +238,6 @@ class ElectraTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "an Electra sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         index = 0
         if os.path.isdir(save_directory):
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 from tokenizers import normalizers
@@ -135,35 +135,6 @@ class ElectraTokenizerFast(PreTrainedTokenizerFast):
 
         return output
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "an ELECTRA sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
         return tuple(files)
@@ -485,36 +485,6 @@ class FlaubertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "An XLM sequence pair mask"]
 
     # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
@@ -293,35 +293,6 @@ class FNetTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "An FNet sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
@@ -138,36 +138,6 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        [removed: same docstring and body as the AlbertTokenizer method above, describing "An FNet sequence pair mask"]
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
@ -430,39 +430,6 @@ class FSMTTokenizer(PreTrainedTokenizer):
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
-        Transformer sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An
-        FAIRSEQ_TRANSFORMER sequence pair mask has the following format:
-        """
-        sep = [self.sep_token_id]
-
-        # no bos used in fairseq
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0]
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
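FSMT is the odd one out here: FAIRSEQ-style tokenizers add no CLS token, so only the trailing SEP contributes to each segment. A sketch with the same kind of hypothetical ids:

```python
# Sketch only: no cls id, mirroring the FAIRSEQ-style layout removed above.
sep = [2]
token_ids_0 = [7, 8, 9]
token_ids_1 = [4, 5]

print(len(token_ids_0 + sep) * [0])                                 # [0, 0, 0, 0]
print(len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1])  # [0, 0, 0, 0, 1, 1, 1]
```

This is the case the new `cls_len` guard in the base class (final hunk below) now covers.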
@ -564,36 +564,6 @@ class HerbertTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
@ -125,34 +125,6 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. HerBERT, like
-        BERT sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -239,35 +239,6 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A LayoutLM sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -15,7 +15,7 @@
"""Tokenization class for model LayoutLM."""

import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

from tokenizers import normalizers
@ -139,35 +139,6 @@ class LayoutLMTokenizerFast(PreTrainedTokenizerFast):

        return output

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A LayoutLM sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -358,29 +358,6 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
-        pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
-        sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -781,29 +781,6 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):

        return output

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
-        pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
-        sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -238,35 +238,6 @@ class LxmertTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Lxmert sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -14,7 +14,7 @@
# limitations under the License.

import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

from tokenizers import normalizers
@ -135,35 +135,6 @@ class LxmertTokenizerFast(PreTrainedTokenizerFast):

        return output

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Lxmert sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -474,7 +474,6 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
        make use of token type ids, therefore a list of zeros is returned.
-
        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
@ -240,35 +240,6 @@ class MobileBertTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A MobileBERT sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -16,7 +16,7 @@
"""Tokenization classes for MobileBERT."""

import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

from tokenizers import normalizers
@ -140,35 +140,6 @@ class MobileBertTokenizerFast(PreTrainedTokenizerFast):

        return output

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A MobileBERT sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -432,34 +432,6 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0]
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -216,36 +216,6 @@ class RemBertTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
@ -181,36 +181,6 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        if token_ids_1 is None, only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of ids.
-            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
@ -824,36 +824,6 @@ class RoCBertTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, str, str]:
        index = 0
        if os.path.isdir(save_directory):
@ -487,35 +487,6 @@ class RoFormerTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -15,7 +15,7 @@
"""Tokenization classes for RoFormer."""

import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

from tokenizers import normalizers
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer
@ -132,35 +132,6 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):

        return output

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -239,35 +239,6 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SqueezeBERT sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        if os.path.isdir(save_directory):
@ -15,7 +15,7 @@
"""Tokenization classes for SqueezeBERT."""

import json
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple

from tokenizers import normalizers
@ -139,35 +139,6 @@ class SqueezeBertTokenizerFast(PreTrainedTokenizerFast):

        return output

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A SqueezeBERT sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)
@ -527,35 +527,6 @@ class XLMTokenizer(PreTrainedTokenizer):
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
-        pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
@ -3389,9 +3389,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        Returns:
            `List[int]`: The token type ids.
        """
+        cls_len = int(getattr(self, "cls_token_id", None) is not None)
+        sep_len = int(getattr(self, "sep_token_id", None) is not None)
+
        if token_ids_1 is None:
-            return len(token_ids_0) * [0]
-        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
+            return [0] * (cls_len + len(token_ids_0) + sep_len)
+
+        return [0] * (cls_len + len(token_ids_0) + sep_len) + [1] * (len(token_ids_1) + sep_len)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
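With the per-model overrides gone, tokenizers that keep the default `[CLS] A [SEP] B [SEP]` layout fall back to this single base implementation. A rough sketch on stub objects (hypothetical ids, not real tokenizers) shows that it reproduces the BERT-style mask and degrades gracefully when `cls_token_id` is `None`:

```python
def token_type_ids(tok, token_ids_0, token_ids_1=None):
    # Mirrors the refactored base-class logic shown in the hunk above.
    cls_len = int(getattr(tok, "cls_token_id", None) is not None)
    sep_len = int(getattr(tok, "sep_token_id", None) is not None)
    if token_ids_1 is None:
        return [0] * (cls_len + len(token_ids_0) + sep_len)
    return [0] * (cls_len + len(token_ids_0) + sep_len) + [1] * (len(token_ids_1) + sep_len)


class BertLike:        # has both cls and sep, e.g. BERT/ALBERT-style tokenizers
    cls_token_id = 101
    sep_token_id = 102


class FairseqLike:     # no cls token, e.g. FSMT/ProphetNet-style tokenizers
    cls_token_id = None
    sep_token_id = 2


print(token_type_ids(BertLike(), [7, 8, 9], [4, 5]))     # [0, 0, 0, 0, 0, 1, 1, 1]
print(token_type_ids(FairseqLike(), [7, 8, 9], [4, 5]))  # [0, 0, 0, 0, 1, 1, 1]
```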