Mask computing in standalone method. Tests.

LysandreJik 2019-09-19 10:13:10 +02:00
parent bf503158c5
commit c10c7d59e7
7 changed files with 81 additions and 23 deletions

View File

@@ -187,18 +187,18 @@ class CommonTestCases:
         for weights_list_2 in weights_lists_2:
             self.assertListEqual(weights_list, weights_list_2)

-    # def test_mask_output(self):
-    #     if sys.version_info <= (3, 0):
-    #         return
-    #
-    #     tokenizer = self.get_tokenizer()
-    #
-    #     if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
-    #         seq_0 = "Test this method."
-    #         seq_1 = "With these inputs."
-    #         information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
-    #         sequences, mask = information["sequence"], information["mask"]
-    #         assert len(sequences) == len(mask)
+    def test_mask_output(self):
+        if sys.version_info <= (3, 0):
+            return
+
+        tokenizer = self.get_tokenizer()
+
+        if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+            seq_0 = "Test this method."
+            seq_1 = "With these inputs."
+            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
+            sequences, mask = information["sequence"], information["mask"]
+            assert len(sequences) == len(mask)

     def test_number_of_added_tokens(self):
         tokenizer = self.get_tokenizer()

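For orientation, the behaviour the re-enabled test checks looks roughly like this in isolation. This is a sketch against the API at this commit, not part of the diff: the import path (the package was still named pytorch_transformers at this date) and the bert-base-uncased checkpoint are assumptions.

# Sketch only: import path and checkpoint name are assumptions, not taken from the diff.
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
seq_0 = "Test this method."
seq_1 = "With these inputs."

information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
sequence, mask = information["sequence"], information["mask"]

# One 0/1 mask entry per token id in the encoded pair, special tokens included.
assert len(sequence) == len(mask)
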
View File

@@ -204,6 +204,18 @@ class BertTokenizer(PreTrainedTokenizer):
         return cls + token_ids_0 + sep + token_ids_1 + sep

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0

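Concretely, the 0-region of the BERT mask covers [CLS] + first sequence + [SEP] and the 1-region covers second sequence + [SEP], so the mask lines up one-to-one with the ids produced by add_special_tokens_sequence_pair. A small sketch of that invariant (tokenizer stands for any BertTokenizer instance; it assumes encode() adds no special tokens by default, which the method above relies on as well):

ids_0 = tokenizer.encode("Test this method.")
ids_1 = tokenizer.encode("With these inputs.")

pair = tokenizer.add_special_tokens_sequence_pair(ids_0, ids_1)  # [CLS] x [SEP] y [SEP]
mask = tokenizer.create_mask_from_sequences("Test this method.", "With these inputs.")

# [CLS] + x + [SEP] are 0s, y + [SEP] are 1s
assert mask == [0] * (len(ids_0) + 2) + [1] * (len(ids_1) + 1)
assert len(mask) == len(pair)
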
View File

@@ -64,12 +64,18 @@ class DistilBertTokenizer(BertTokenizer):
     def add_special_tokens_single_sequence(self, token_ids):
         return token_ids

-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1, output_mask=False):
+    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
         sep = [self.sep_token_id]
-        if output_mask:
-            return (
-                token_ids_0 + sep + token_ids_1,
-                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1)
-            )
-        else:
-            return token_ids_0 + sep + token_ids_1
+        return token_ids_0 + sep + token_ids_1
+
+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A DistilBERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1)) * [1]

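The net effect of this hunk is that DistilBertTokenizer.add_special_tokens_sequence_pair no longer takes an output_mask flag; callers that want the mask go through the new standalone method instead. A before/after sketch (tokenizer is any DistilBertTokenizer instance; illustrative only, not code from the diff):

seq_0, seq_1 = "Test this method.", "With these inputs."
ids_0, ids_1 = tokenizer.encode(seq_0), tokenizer.encode(seq_1)

# Before this commit (removed form):
#   pair, mask = tokenizer.add_special_tokens_sequence_pair(ids_0, ids_1, output_mask=True)

# After this commit the two concerns are separate; note there is no [CLS] in either result:
pair = tokenizer.add_special_tokens_sequence_pair(ids_0, ids_1)  # x [SEP] y
mask = tokenizer.create_mask_from_sequences(seq_0, seq_1)        # 0s over x [SEP], 1s over y
assert len(mask) == len(pair)
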
View File

@@ -96,3 +96,15 @@ class RobertaTokenizer(GPT2Tokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A RoBERTa sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]

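RoBERTa differs from BERT only in the double separator between the two sequences, which the 0-region absorbs. A compact sketch (tokenizer is any RobertaTokenizer instance; same no-special-tokens-by-default assumption for encode() as in the BERT sketch above):

ids_0 = tokenizer.encode("Test this method.")
ids_1 = tokenizer.encode("With these inputs.")

# <s> + x + </s></s> are 0s, y + </s> are 1s
mask = tokenizer.create_mask_from_sequences("Test this method.", "With these inputs.")
assert mask == [0] * (len(ids_0) + 3) + [1] * (len(ids_1) + 1)
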
View File

@@ -779,8 +779,8 @@ class PreTrainedTokenizer(object):
                 second_sentence_tokens
             )

-            # if output_mask:
-            #     sequence, information["mask"] = encoded_sequence
+            if output_mask:
+                information["mask"] = self.create_mask_from_sequences(text, text_pair)

             information["sequence"] = sequence
         else:
@@ -797,6 +797,10 @@ class PreTrainedTokenizer(object):
         return information

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        logger.warning("This tokenizer does not make use of special tokens.")
+        return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
+
     def add_special_tokens_single_sequence(self, token_ids):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
         return token_ids

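In the base class the new method is only a fallback: encode_plus delegates to whichever create_mask_from_sequences the concrete tokenizer overrides, and a tokenizer that defines no special tokens just gets a plain 0/1 split of the two encoded sequences. A sketch of that fallback (a GPT-2-style tokenizer without special-token handling is assumed here purely for illustration):

# Fallback: no special tokens, so the mask is zeros over the first encoded
# sequence followed by ones over the second (a warning is logged).
mask = tokenizer.create_mask_from_sequences("first part", "second part")
assert mask == [0] * len(tokenizer.encode("first part")) + [1] * len(tokenizer.encode("second part"))
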
View File

@@ -770,6 +770,18 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLM sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(save_directory):

View File

@@ -198,9 +198,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         cls_segment_ids = [2]
         return token_ids_0 + sep + token_ids_1 + sep + cls

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLNet sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
+        | first sequence    | second sequence      | CLS segment ID
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        cls_segment_id = [2]
+        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.
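
XLNet is the one tokenizer here whose mask is not purely binary: its classification token sits at the end of the pair, so the mask ends with the dedicated CLS segment id 2. A sketch (tokenizer is any XLNetTokenizer instance; same no-special-tokens-by-default assumption for encode() as in the sketches above):

ids_0 = tokenizer.encode("Test this method.")
ids_1 = tokenizer.encode("With these inputs.")

pair = tokenizer.add_special_tokens_sequence_pair(ids_0, ids_1)  # x <sep> y <sep> <cls>
mask = tokenizer.create_mask_from_sequences("Test this method.", "With these inputs.")

# 0s over x <sep>, 1s over y <sep>, then the CLS segment id
assert mask == [0] * (len(ids_0) + 1) + [1] * (len(ids_1) + 1) + [2]
assert len(mask) == len(pair)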