Sequence IDs
This commit is contained in:
parent 7c789c337d
commit 2f259b228e
@@ -292,3 +292,33 @@ class CommonTestCases:

        assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
        assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input

    def test_sequence_ids(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        sequence_ids = encoded_sequence_dict["sequence_ids"]
        assert len(sequence_ids) == len(encoded_sequence_w_special)

        # Dropping every position marked 0 (special token) must recover the bare encoding
        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        assert encoded_sequence == filtered_sequence

        # Testing input pairs
        encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        sequence_ids = encoded_sequence_dict["sequence_ids"]
        assert len(sequence_ids) == len(encoded_sequence_w_special)

        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        assert encoded_sequence == filtered_sequence
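As a usage sketch (not part of the diff), the same mask can strip special tokens out of any ``encode_plus`` output; the helper name here is hypothetical:

def strip_special_tokens(encoded):
    # Keep only positions whose sequence_ids entry is 1 (real sequence tokens),
    # dropping the special tokens that encode_plus added around them.
    return [tok for tok, sid in zip(encoded["input_ids"], encoded["sequence_ids"]) if sid == 1]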
@@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer):

        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            # Pair layout: [CLS] A [SEP] B [SEP]
            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
        else:
            # Single-sequence layout: [CLS] A [SEP]
            return [0] + ([1] * len(token_ids_0)) + [0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
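For illustration (hypothetical ids, not part of the diff), the BERT mask lines up with the special-token layout shown above:

# BertTokenizer.get_sequence_ids([10, 11], [20]) returns [0, 1, 1, 0, 1, 0],
# matching the layout: [CLS] 10 11 [SEP] 20 [SEP]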
@@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer):

        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            # Pair layout uses a doubled separator: <s> A </s></s> B </s>
            return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
        else:
            # Single-sequence layout: <s> A </s>
            return [0] + ([1] * len(token_ids_0)) + [0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
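Again for illustration (hypothetical ids): RoBERTa's doubled separator shows up as two adjacent zeros:

# RobertaTokenizer.get_sequence_ids([10, 11], [20]) returns [0, 1, 1, 0, 0, 1, 0],
# matching the layout: <s> 10 11 </s></s> 20 </s>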
@@ -826,7 +826,21 @@ class PreTrainedTokenizer(object):

                or PyTorch torch.Tensor instead of a list of python integers.

        Return:
            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
            A Dictionary of shape::

                {
                    input_ids: list[int],
                    overflowing_tokens: list[int] if a ``max_length`` is specified, else None,
                    sequence_ids: list[int] if ``add_special_tokens`` is set to ``True``
                }

            With the fields:
                ``input_ids``: list of tokens to be fed to a model

                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.

                ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
                tokens and 1 specifying sequence tokens.
        """
        pair = bool(pair_ids is not None)
        len_ids = len(ids)
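A sketch of the dictionary described in the docstring above, with hypothetical values (assuming ``add_special_tokens=True`` and a BERT-style tokenizer):

# {
#     "input_ids": [cls_id, 10, 11, sep_id],   # tokens to feed to the model
#     "sequence_ids": [0, 1, 1, 0],            # 0 marks the added [CLS]/[SEP]
# }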
@@ -859,6 +873,7 @@ class PreTrainedTokenizer(object):

        if add_special_tokens:
            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
            # sequence_ids is only populated when special tokens are added
            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
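For a BERT-style pair (hypothetical ids), note how the two masks built here differ:

# input_ids      = [cls, 10, 11, sep, 20, sep]
# token_type_ids = [0,   0,  0,  0,   1,  1]    # which segment a position belongs to
# sequence_ids   = [0,   1,  1,  0,   1,  0]    # special token (0) vs. sequence token (1)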
@@ -893,6 +908,9 @@ class PreTrainedTokenizer(object):

        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
        return token_ids_0 + token_ids_1

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        # Base fallback: no special tokens are added, so every position is a sequence token
        return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) in a token
            (resp. a sequence of tokens (str/unicode)), using the vocabulary and added tokens.
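Sketch (hypothetical ids): with no special tokens, the base-class mask is trivially all ones:

# PreTrainedTokenizer.get_sequence_ids([10, 11], [20]) returns [1, 1, 1]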
@@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer):

        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            # Pair layout: cls + A + sep + B + sep
            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
        else:
            # Single-sequence layout: cls + A + sep
            return [0] + ([1] * len(token_ids_0)) + [0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
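Illustration (hypothetical ids): XLM wraps sequences the same way as BERT, so the mask matches:

# XLMTokenizer.get_sequence_ids([10, 11], [20]) returns [0, 1, 1, 0, 1, 0],
# matching the layout: cls 10 11 sep 20 sep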
@@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer):

        cls = [self.cls_token_id]
        return token_ids_0 + sep + token_ids_1 + sep + cls

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            # XLNet puts its special tokens at the end: A + sep + B + sep + cls
            return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
        else:
            # Single-sequence layout: A + sep + cls
            return ([1] * len(token_ids_0)) + [0, 0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
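Illustration (hypothetical ids): since XLNet appends its specials, the zeros land at the end:

# XLNetTokenizer.get_sequence_ids([10, 11], [20]) returns [1, 1, 0, 1, 0, 0],
# matching the layout: 10 11 sep 20 sep cls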