[UdopTokenizer] Fix post merge imports (#29451)
* update
* ...
* nits
* arf
* 🧼
* beat the last guy
* style everyone
parent fa7f3cf336
commit 132852203a
src/transformers/models/udop/tokenization_udop.py
@@ -157,12 +157,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }
 
 
-# TODO(PVP) - this should be removed in Transformers v5
-PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    "microsoft/udop-large": 512,
-}
-
-
 class UdopTokenizer(PreTrainedTokenizer):
     """
     Adapted from [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
@@ -256,7 +250,6 @@ class UdopTokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
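With max_model_input_sizes and the hard-coded PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES table removed, the maximum input length should now come from the checkpoint's own tokenizer config rather than from this mapping. A minimal sketch of how to inspect it (assumes sentencepiece is installed and the microsoft/udop-large checkpoint is reachable; not part of this commit):

from transformers import UdopTokenizer

# Load the slow tokenizer; model_max_length is now resolved from the checkpoint
# rather than from the removed PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES mapping.
tok = UdopTokenizer.from_pretrained("microsoft/udop-large")
print(tok.model_max_length)  # the removed mapping used to pin this to 512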
src/transformers/models/udop/tokenization_udop_fast.py
@@ -29,11 +29,6 @@ from ...tokenization_utils_base import (
 )
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
-from ..udop.tokenization_udop import (
-    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
-    PRETRAINED_VOCAB_FILES_MAP,
-    VOCAB_FILES_NAMES,
-)
 
 
 if is_sentencepiece_available():
@@ -42,6 +37,17 @@ else:
     UdopTokenizer = None
 
 
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/spiece.model",
+    },
+    "tokenizer_file": {
+        "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/tokenizer.json",
+    },
+}
+
 logger = logging.get_logger(__name__)
 
 UDOP_ENCODE_KWARGS_DOCSTRING = r"""
@@ -197,7 +203,6 @@ class UdopTokenizerFast(PreTrainedTokenizerFast):
 
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
     slow_tokenizer_class = UdopTokenizer
 
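The fast-tokenizer module now defines VOCAB_FILES_NAMES and PRETRAINED_VOCAB_FILES_MAP itself instead of importing them from the slow-tokenizer module. A quick import check, assuming the module path transformers.models.udop.tokenization_udop_fast and an installed tokenizers backend (illustration only, not part of the commit):

from transformers.models.udop.tokenization_udop_fast import (
    PRETRAINED_VOCAB_FILES_MAP,
    VOCAB_FILES_NAMES,
)

# Both constants are defined locally in the fast module after this change.
print(VOCAB_FILES_NAMES)
print(PRETRAINED_VOCAB_FILES_MAP["tokenizer_file"]["microsoft/udop-large"])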
tests/models/udop/test_tokenization_udop.py
@@ -22,12 +22,12 @@ from typing import List
 from transformers import (
     AddedToken,
     SpecialTokensMixin,
+    UdopTokenizer,
     UdopTokenizerFast,
     is_tf_available,
     is_torch_available,
     logging,
 )
-from transformers.models.udop.tokenization_udop import UdopTokenizer
 from transformers.testing_utils import (
     get_tests_dir,
     is_pt_tf_cross_test,
@@ -1717,6 +1717,10 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_alignement_methods(self):
         pass
 
+    @unittest.skip("#TODO will be removed in main")
+    def test_pretrained_model_lists(self):
+        pass
+
     @unittest.skip("UDOP tokenizer requires boxes besides sequences.")
     def test_maximum_encoding_length_pair_input(self):
         pass
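The test file now pulls UdopTokenizer from the top-level transformers namespace alongside UdopTokenizerFast. A short sanity check in the same spirit (assumes sentencepiece and tokenizers are installed and the microsoft/udop-large checkpoint is downloadable; not taken from the test suite):

from transformers import UdopTokenizer, UdopTokenizerFast

# Both classes are importable from the package root after the import fix.
slow = UdopTokenizer.from_pretrained("microsoft/udop-large")
fast = UdopTokenizerFast.from_pretrained("microsoft/udop-large")
print(type(slow).__name__, len(slow))
print(type(fast).__name__, len(fast))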