GPT2 tokenizer should not output token type IDs (#5546)
* GPT2 tokenizer should not output token type IDs
* Same for OpenAIGPT
parent 7833b21a5a
commit d6b0b9d451
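Setting model_input_names to ["attention_mask"] changes what these tokenizers return by default: token_type_ids are no longer part of the encoded output. A minimal sketch of the new default behavior, assuming a recent transformers release and that the "gpt2" checkpoint can be downloaded:

# Sketch of the behavior after this change (assumes transformers is
# installed and the "gpt2" checkpoint is reachable).
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
encoded = tokenizer("Hello world")

# With model_input_names = ["attention_mask"], the default output
# contains only input_ids and attention_mask, no token_type_ids.
print(sorted(encoded.keys()))  # ['attention_mask', 'input_ids']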
@@ -137,6 +137,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

     def __init__(
         self,
@@ -330,6 +331,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

     def __init__(
         self,
@@ -96,6 +96,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super().__init__(unk_token=unk_token, **kwargs)
@@ -261,6 +262,7 @@ class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         kwargs.setdefault("unk_token", unk_token)
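If a downstream model still expects token type IDs, they can be requested explicitly by overriding the new default. A sketch, assuming the "openai-gpt" checkpoint from the Hugging Face hub:

# Sketch: opt back in to token_type_ids after this change
# (hypothetical usage; assumes the "openai-gpt" checkpoint is reachable).
from transformers import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
encoded = tokenizer("Hello world", return_token_type_ids=True)

# Explicitly requesting token_type_ids restores the key
# (all zeros for a single sequence).
print(sorted(encoded.keys()))
# ['attention_mask', 'input_ids', 'token_type_ids']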