Mirror of https://github.com/huggingface/transformers.git
Fix tests + first example of doc

parent: c999a3e505
commit: 31e5b5ff22
@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):

        pretrained_model_name_or_path: either:

            - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
            - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
            - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
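The last bullet (a single saved vocabulary file) is the one case the examples in the next hunk do not cover; a minimal sketch, assuming a vocab.txt previously written by save_pretrained (the path is illustrative, not a file the library ships):

    from transformers import BertTokenizer

    # Works only for tokenizers that need a single vocabulary file (e.g. Bert, XLNet);
    # './my_model_directory/vocab.txt' is a hypothetical path used for illustration.
    tokenizer = BertTokenizer.from_pretrained('./my_model_directory/vocab.txt')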
@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):

        # Download vocabulary from S3 and cache.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Download vocabulary from S3 (user-uploaded) and cache.
        tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

        # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
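The directory example above presumes an earlier save; a minimal round-trip sketch (the './test/saved_model/' directory is the one named in the comment and is assumed to be writable):

    from transformers import BertTokenizer

    # Download once, save the vocabulary locally, then reload from the directory.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.save_pretrained('./test/saved_model/')
    tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')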
@@ -327,6 +331,9 @@ class PreTrainedTokenizer(object):

        if os.path.isdir(pretrained_model_name_or_path):
            # If a directory is provided we look for the standard filenames
            full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
            if not os.path.exists(full_file_name):
                logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                full_file_name = None
        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            # If a path to a file is provided we use it (will only work for non-BPE tokenizers using a single vocabulary file)
            full_file_name = pretrained_model_name_or_path
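Outside the library, the same resolution order can be sketched as a self-contained helper; resolve_vocab_file and the urlparse-based remote check are illustrative stand-ins for this hunk's logic, not transformers API (the real code uses its own is_remote_url):

    import logging
    import os
    from urllib.parse import urlparse

    logger = logging.getLogger(__name__)

    def resolve_vocab_file(pretrained_model_name_or_path, file_name):
        # Hypothetical helper mirroring the branches above; not part of transformers.
        if os.path.isdir(pretrained_model_name_or_path):
            # A directory: look for the standard filename inside it.
            full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
            if not os.path.exists(full_file_name):
                logger.info("Didn't find file %s. We won't load it.", full_file_name)
                full_file_name = None
        elif os.path.isfile(pretrained_model_name_or_path) or urlparse(pretrained_model_name_or_path).scheme in ('http', 'https'):
            # A file path or URL: use it directly (single-vocabulary-file tokenizers only).
            full_file_name = pretrained_model_name_or_path
        else:
            full_file_name = None
        return full_file_name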