Merge pull request #2270 from aaugustin/remove-python-2

Remove support for Python 2
Commit ce50305e5b by Aymeric Augustin, 2019-12-22 23:04:37 +01:00, committed by GitHub (GPG Key ID: 4AEE18F83AFDEB23)
155 changed files with 217 additions and 603 deletions

View File

@ -1,6 +1,6 @@
version: 2
jobs:
run_tests_py3_torch_and_tf:
run_tests_torch_and_tf:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
@ -17,7 +17,7 @@ jobs:
- run: sudo pip install tensorboardX scikit-learn
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
- run: codecov
run_tests_py3_torch:
run_tests_torch:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
@ -33,7 +33,7 @@ jobs:
- run: sudo pip install tensorboardX scikit-learn
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
- run: codecov
run_tests_py3_tf:
run_tests_tf:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
@ -49,7 +49,7 @@ jobs:
- run: sudo pip install tensorboardX scikit-learn
- run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
- run: codecov
run_tests_py3_custom_tokenizers:
run_tests_custom_tokenizers:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
@ -59,7 +59,7 @@ jobs:
- run: sudo pip install pytest pytest-xdist
- run: sudo pip install mecab-python3
- run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
run_examples_py3_torch:
run_examples_torch:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
@ -121,9 +121,9 @@ workflows:
jobs:
- check_code_quality
- check_repository_consistency
- run_examples_py3_torch
- run_tests_py3_custom_tokenizers
- run_tests_py3_torch_and_tf
- run_tests_py3_torch
- run_tests_py3_tf
- run_examples_torch
- run_tests_custom_tokenizers
- run_tests_torch_and_tf
- run_tests_torch
- run_tests_tf
- deploy_doc: *workflow_filters

View File

@ -64,7 +64,7 @@ Choose the right framework for every part of a model's lifetime
## Installation
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
This repo is tested on Python 3.5+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
### With pip

View File

@ -1,6 +1,6 @@
# Installation
Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
Transformers is tested on Python 3.5+ and PyTorch 1.1.0
## With pip
@ -44,7 +44,7 @@ By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `
## OpenAI GPT original tokenization workflow
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:
``` bash
pip install spacy ftfy==4.4.3
```

View File

@ -16,7 +16,7 @@
"""BERT finetuning runner.
Finetuning the library models for multiple choice on SWAG (Bert).
"""
from __future__ import absolute_import, division, print_function
import argparse
import csv
@ -24,7 +24,6 @@ import glob
import logging
import os
import random
import sys
import numpy as np
import torch
@ -104,12 +103,7 @@ class InputFeatures(object):
def read_swag_examples(input_file, is_training=True):
with open(input_file, "r", encoding="utf-8") as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, "utf-8") for cell in line) # noqa: F821
lines.append(line)
lines = list(csv.reader(f))
if is_training and lines[0][-1] != "label":
raise ValueError("For training, the input file must contain a label column.")
@ -347,7 +341,7 @@ def train(args, train_dataset, model, tokenizer):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
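The `read_swag_examples` change above collapses the old Python 2 cell-decoding loop into a single expression. A minimal, self-contained sketch of the Python 3 idiom (the file name and column layout are made up for illustration):

```python
import csv

def read_rows(path):
    # Under Python 3 the csv module yields rows of str directly, so the old
    # per-cell unicode(cell, "utf-8") branch is unnecessary.
    with open(path, "r", encoding="utf-8") as f:
        return list(csv.reader(f))

rows = read_rows("swag_train.csv")  # hypothetical file
header, examples = rows[0], rows[1:]
```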

View File

@ -19,7 +19,7 @@
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" This is the exact same script as `examples/run_squad.py` (as of 2019, October 4th) with an additional and optional step of distillation."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -160,7 +159,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -165,7 +164,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
best_f1, n_no_improve = 0, 0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -16,7 +16,7 @@
# limitations under the License.
""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -186,7 +185,7 @@ def train(args, train_dataset, model, tokenizer):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -19,7 +19,6 @@ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while B
using a masked language modeling (MLM) loss.
"""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -282,7 +281,7 @@ def train(args, train_dataset, model, tokenizer):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
set_seed(args) # Added here for reproducibility (even between python 2 and 3)
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -146,7 +145,7 @@ def train(args, train_dataset, model, tokenizer):
best_steps = 0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -170,7 +169,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -186,7 +185,7 @@ def train(args, train_dataset, model, tokenizer):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
# Added here for reproductibility (even between python 2 and 3)
# Added here for reproductibility
set_seed(args)
for _ in train_iterator:

View File

@ -16,7 +16,6 @@
""" Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
Adapted from `examples/run_glue.py`"""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -165,7 +164,7 @@ def train(args, train_dataset, model, tokenizer):
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

View File

@ -94,7 +94,7 @@ def process_story(raw_story):
def _add_missing_period(line):
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
END_TOKENS = [".", "!", "?", "...", "'", "`", '"', "\u2019", "\u2019", ")"]
if line.startswith("@highlight"):
return line
if line[-1] in END_TOKENS:

View File

@ -12,25 +12,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import argparse
import logging
import sys
import unittest
from unittest.mock import patch
import run_generation
import run_glue
import run_squad
try:
# python 3.4+ can use builtin unittest.mock instead of mock package
from unittest.mock import patch
except ImportError:
from mock import patch
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
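With Python 2 gone, the conditional import of the external `mock` package disappears and `unittest.mock` is used directly. A small usage sketch (the patched target is illustrative, not taken from the test file):

```python
import sys
from unittest.mock import patch

# patch.object temporarily replaces sys.argv, the same mechanism the example
# tests use to feed CLI arguments to the training scripts.
with patch.object(sys, "argv", ["run_glue.py", "--model_type", "bert"]):
    print(sys.argv)
```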

View File

@ -15,15 +15,12 @@
# limitations under the License.
""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
from __future__ import absolute_import, division, print_function
import csv
import glob
import json
import logging
import os
import sys
from io import open
from typing import List
import tqdm
@ -180,13 +177,7 @@ class SwagProcessor(DataProcessor):
def _read_csv(self, input_file):
with open(input_file, "r", encoding="utf-8") as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, "utf-8") for cell in line) # noqa: F821
lines.append(line)
return lines
return list(csv.reader(f))
def _create_examples(self, lines: List[List[str]], type: str):
"""Creates examples for the training and dev sets."""

View File

@ -15,11 +15,9 @@
# limitations under the License.
""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
from __future__ import absolute_import, division, print_function
import logging
import os
from io import open
logger = logging.getLogger(__name__)

View File

@ -14,7 +14,7 @@ To create the package for pypi.
creating the wheel and the source distribution (obviously).
For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
(this will build a wheel for the python version you use to build it - make sure you use python 3.x).
(this will build a wheel for the python version you use to build it).
For the sources, run: "python setup.py sdist"
You should now have a /dist directory with both .whl and .tar.gz source versions.
@ -33,7 +33,6 @@ To create the package for pypi.
7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
"""
from io import open
from setuptools import find_packages, setup

View File

@ -34,8 +34,8 @@ class ANSI:
Helper for en.wikipedia.org/wiki/ANSI_escape_code
"""
_bold = u"\u001b[1m"
_reset = u"\u001b[0m"
_bold = "\u001b[1m"
_reset = "\u001b[0m"
@classmethod
def bold(cls, s):

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" BERT model configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" CamemBERT configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Salesforce CTRL configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" DistilBERT model configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" OpenAI GPT-2 configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" MMBT configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" OpenAI GPT configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" RoBERTa configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" T5 model configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" Transformer XL configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,13 +15,11 @@
# limitations under the License.
""" Configuration base class and utilities."""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import json
import logging
import os
from io import open
from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" XLM configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" XLM-RoBERTa configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" XLNet configuration """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""Convert ALBERT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""Convert BERT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging

View File

@ -14,11 +14,9 @@
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging
from io import open
import torch

View File

@ -14,11 +14,9 @@
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging
from io import open
import torch

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Convert pytorch checkpoints to TensorFlow """
from __future__ import absolute_import, division, print_function
import argparse
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""Convert RoBERTa checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""Convert T5 checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging

View File

@ -14,13 +14,12 @@
# limitations under the License.
"""Convert Transformer XL checkpoint and datasets."""
from __future__ import absolute_import, division, print_function
import argparse
import logging
import os
import pickle
import sys
from io import open
import torch
@ -35,12 +34,6 @@ from transformers import (
from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logging.basicConfig(level=logging.INFO)
# We do this to be able to load python 2 datasets pickles
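The `cPickle` fallback is dropped, but the comment kept above hints that Python 2 pickles still need to be readable. A hedged sketch of how that is typically done in Python 3 (the file name is hypothetical and the `encoding` argument is an assumption about the loading code, which is not shown in this hunk):

```python
import pickle

# encoding="latin1" lets Python 3's pickle decode byte strings written by
# Python 2 instead of raising a UnicodeDecodeError.
with open("transfo_xl_corpus.pkl", "rb") as fp:  # hypothetical path
    corpus = pickle.load(fp, encoding="latin1")
```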

View File

@ -14,12 +14,10 @@
# limitations under the License.
"""Convert OpenAI GPT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import json
import logging
from io import open
import numpy
import torch

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""Convert BERT checkpoint."""
from __future__ import absolute_import, division, print_function
import argparse
import logging

View File

@ -14,7 +14,6 @@ import logging
import math
import re
import string
from io import open
from transformers.tokenization_bert import BasicTokenizer

View File

@ -18,7 +18,6 @@ import copy
import csv
import json
import logging
import sys
from ...file_utils import is_tf_available, is_torch_available
@ -98,13 +97,7 @@ class DataProcessor(object):
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8-sig") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, "utf-8") for cell in line) # noqa: F821
lines.append(line)
return lines
return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
class SingleSentenceClassificationProcessor(DataProcessor):

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" XNLI utils (dataset loading and evaluation) """
from __future__ import absolute_import, division, print_function
import logging
import os

View File

@ -3,7 +3,7 @@ Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import fnmatch
import json
@ -14,11 +14,10 @@ import tempfile
from contextlib import contextmanager
from functools import partial, wraps
from hashlib import sha256
from io import open
from urllib.parse import urlparse
import boto3
import requests
import six
from botocore.config import Config
from botocore.exceptions import ClientError
from filelock import FileLock
@ -66,10 +65,6 @@ except ImportError:
)
default_cache_path = os.path.join(torch_cache_home, "transformers")
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
try:
from pathlib import Path
@ -107,36 +102,20 @@ def is_tf_available():
return _tf_available
if not six.PY2:
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + fn.__doc__
return fn
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + fn.__doc__
return fn
return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + "".join(docstr)
return fn
return docstring_decorator
return docstring_decorator
else:
# Not possible to update class docstrings on python2
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
return fn
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = fn.__doc__ + "".join(docstr)
return fn
return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
return fn
return docstring_decorator
return docstring_decorator
def is_remote_url(url_or_filename):
@ -183,7 +162,7 @@ def filename_to_url(filename, cache_dir=None):
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
cache_path = os.path.join(cache_dir, filename)
@ -218,9 +197,9 @@ def cached_path(
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
if isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if is_remote_url(url_or_filename):
@ -297,7 +276,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
if isinstance(user_agent, dict):
ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
elif isinstance(user_agent, six.string_types):
elif isinstance(user_agent, str):
ua += "; " + user_agent
headers = {"user-agent": ua}
if resume_size > 0:
@ -331,9 +310,7 @@ def get_from_cache(
"""
if cache_dir is None:
cache_dir = TRANSFORMERS_CACHE
if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if not os.path.exists(cache_dir):
@ -352,8 +329,6 @@ def get_from_cache(
except (EnvironmentError, requests.exceptions.Timeout):
etag = None
if sys.version_info[0] == 2 and etag is not None:
etag = etag.decode("utf-8")
filename = url_to_filename(url, etag)
# get cache path to put the file
@ -417,9 +392,6 @@ def get_from_cache(
meta = {"url": url, "etag": etag}
meta_path = cache_path + ".json"
with open(meta_path, "w") as meta_file:
output_string = json.dumps(meta)
if sys.version_info[0] == 2 and isinstance(output_string, str):
output_string = unicode(output_string, "utf-8") # noqa: F821
meta_file.write(output_string)
json.dump(meta, meta_file)
return cache_path
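Several hunks in this file remove `six` and `sys.version_info` branching; the docstring decorators are the clearest case, since Python 2 could not rewrite class docstrings at all. A minimal sketch of the now-unconditional decorator and its effect (the names mirror the diff, but this is an illustration, not the library code):

```python
def add_start_docstrings(*docstr):
    def docstring_decorator(fn):
        # Safe on Python 3 for functions and classes alike.
        fn.__doc__ = "".join(docstr) + (fn.__doc__ or "")
        return fn
    return docstring_decorator

@add_start_docstrings("Shared model intro. ")
def forward(x):
    """Model-specific details."""
    return x

print(forward.__doc__)  # -> "Shared model intro. Model-specific details."
```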

View File

@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import io
import os
@ -20,7 +20,6 @@ from os.path import expanduser
from typing import List
import requests
import six
from tqdm import tqdm
@ -28,14 +27,7 @@ ENDPOINT = "https://huggingface.co"
class S3Obj:
def __init__(
self,
filename, # type: str
LastModified, # type: str
ETag, # type: str
Size, # type: int
**kwargs
):
def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs):
self.filename = filename
self.LastModified = LastModified
self.ETag = ETag
@ -43,13 +35,7 @@ class S3Obj:
class PresignedUrl:
def __init__(
self,
write, # type: str
access, # type: str
type, # type: str
**kwargs
):
def __init__(self, write: str, access: str, type: str, **kwargs):
self.write = write
self.access = access
self.type = type # mime-type to send to S3.
@ -59,12 +45,7 @@ class HfApi:
def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else ENDPOINT
def login(
self,
username, # type: str
password, # type: str
):
# type: (...) -> str
def login(self, username: str, password: str) -> str:
"""
Call HF API to sign in a user and get a token if credentials are valid.
@ -80,10 +61,7 @@ class HfApi:
d = r.json()
return d["token"]
def whoami(
self, token, # type: str
):
# type: (...) -> str
def whoami(self, token: str) -> str:
"""
Call HF API to know "whoami"
"""
@ -93,8 +71,7 @@ class HfApi:
d = r.json()
return d["user"]
def logout(self, token):
# type: (...) -> None
def logout(self, token: str) -> None:
"""
Call HF API to log out.
"""
@ -102,19 +79,17 @@ class HfApi:
r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
r.raise_for_status()
def presign(self, token, filename):
# type: (...) -> PresignedUrl
def presign(self, token: str, filename) -> PresignedUrl:
"""
Call HF API to get a presigned url to upload `filename` to S3.
"""
path = "{}/api/presign".format(self.endpoint)
r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename},)
r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename})
r.raise_for_status()
d = r.json()
return PresignedUrl(**d)
def presign_and_upload(self, token, filename, filepath):
# type: (...) -> str
def presign_and_upload(self, token: str, filename, filepath) -> str:
"""
Get a presigned url, then upload file to S3.
@ -158,13 +133,10 @@ class TqdmProgressFileReader:
def __init__(self, f: io.BufferedReader):
self.f = f
self.total_size = os.fstat(f.fileno()).st_size # type: int
self.total_size = os.fstat(f.fileno()).st_size
self.pbar = tqdm(total=self.total_size, leave=False)
if six.PY3:
# does not work unless PY3
# no big deal as the CLI does not currently support PY2 anyways.
self.read = f.read
f.read = self._read
self.read = f.read
f.read = self._read
def _read(self, n=-1):
self.pbar.update(n)
@ -182,16 +154,7 @@ class HfFolder:
"""
Save token, creating folder as needed.
"""
if six.PY3:
os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
else:
# Python 2
try:
os.makedirs(os.path.dirname(cls.path_token))
except OSError as e:
if e.errno != os.errno.EEXIST:
raise e
pass
os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
with open(cls.path_token, "w+") as f:
f.write(token)
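The `hf_api` hunks replace `# type:` comments with real annotations and drop the `six.PY3` guard around `os.makedirs(..., exist_ok=True)`. A small sketch of the annotation style, which tools can now read back at runtime (the class and return value are illustrative only):

```python
import typing

class ApiSketch:
    def login(self, username: str, password: str) -> str:
        # Real annotations replace the old "# type: (...) -> str" comments.
        return "dummy-token"

print(typing.get_type_hints(ApiSketch.login))
# {'username': <class 'str'>, 'password': <class 'str'>, 'return': <class 'str'>}
```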

View File

@ -14,13 +14,11 @@
# limitations under the License.
""" Configuration base class and utilities."""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import json
import logging
import os
from io import open
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
from .file_utils import (

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,12 +15,10 @@
# limitations under the License.
"""PyTorch BERT model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import os
import sys
import torch
from torch import nn
@ -339,9 +337,7 @@ class BertIntermediate(nn.Module):
def __init__(self, config):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
@ -461,9 +457,7 @@ class BertPredictionHeadTransform(nn.Module):
def __init__(self, config):
super(BertPredictionHeadTransform, self).__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
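With `unicode` gone, the activation lookup needs only one `isinstance(..., str)` check. A toy version of that dispatch (the table below is a stand-in for the library's `ACT2FN`, not the real one):

```python
import math

ACT2FN = {"tanh": math.tanh, "relu": lambda x: max(0.0, x)}  # illustrative table

def resolve_activation(hidden_act):
    # A single text type means one isinstance check distinguishes
    # "name of an activation" from "callable activation".
    if isinstance(hidden_act, str):
        return ACT2FN[hidden_act]
    return hidden_act

print(resolve_activation("tanh")(1.0))  # looked up by name
print(resolve_activation(abs)(-2.0))    # passed as a callable
```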

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""PyTorch CamemBERT model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" PyTorch CTRL model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -16,7 +16,7 @@
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Classes to support Encoder-Decoder architectures """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""PyTorch OpenAI GPT-2 model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""PyTorch MMBT model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,13 +15,11 @@
# limitations under the License.
"""PyTorch OpenAI GPT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
from io import open
import torch
import torch.nn as nn

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""PyTorch RoBERTa model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" PyTorch T5 model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import itertools

View File

@ -14,10 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 ALBERT model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import sys
import tensorflow as tf
@ -311,9 +310,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act
@ -454,9 +451,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
if isinstance(config.hidden_act, str):
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = config.hidden_act

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,10 +15,8 @@
# limitations under the License.
""" TF 2.0 BERT model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import sys
import numpy as np
import tensorflow as tf
@ -311,9 +309,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
@ -418,9 +414,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
self.dense = tf.keras.layers.Dense(
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
)
if isinstance(config.hidden_act, str) or (
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
):
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" TF 2.0 CTRL model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,7 +14,7 @@
# limitations under the License.
""" TF 2.0 DistilBERT model
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" TF 2.0 OpenAI GPT-2 model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" TF 2.0 OpenAI GPT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" PyTorch - TF 2.0 general utilities."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" TF 2.0 RoBERTa model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
""" TF 2.0 T5 model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import itertools

View File

@ -16,7 +16,6 @@
""" TF 2.0 Transformer XL model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""TF general model utils."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os

View File

@ -14,7 +14,7 @@
# limitations under the License.
""" TF 2.0 XLM model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import itertools
import logging

View File

@ -15,10 +15,9 @@
# limitations under the License.
""" TF 2.0 XLNet model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import sys
import numpy as np
import tensorflow as tf
@ -290,9 +289,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
)
self.dropout = tf.keras.layers.Dropout(config.dropout)
if isinstance(config.ff_activation, str) or (
sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) # noqa: F821
):
if isinstance(config.ff_activation, str):
self.activation_function = ACT2FN[config.ff_activation]
else:
self.activation_function = config.ff_activation

View File

@ -18,7 +18,6 @@
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""PyTorch BERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os

View File

@ -14,7 +14,7 @@
# limitations under the License.
""" PyTorch XLM model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import itertools
import logging

View File

@ -15,7 +15,6 @@
# limitations under the License.
"""PyTorch XLM-RoBERTa model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -15,11 +15,10 @@
# limitations under the License.
""" PyTorch XLNet model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import sys
import torch
from torch import nn
@ -420,9 +419,7 @@ class XLNetFeedForward(nn.Module):
self.layer_1 = nn.Linear(config.d_model, config.d_inner)
self.layer_2 = nn.Linear(config.d_inner, config.d_model)
self.dropout = nn.Dropout(config.dropout)
if isinstance(config.ff_activation, str) or (
sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) # noqa: F821
):
if isinstance(config.ff_activation, str):
self.activation_function = ACT2FN[config.ff_activation]
else:
self.activation_function = config.ff_activation

View File

@ -14,7 +14,6 @@
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import, division, print_function
import re

View File

@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import csv
import json
@ -26,7 +26,6 @@ from os.path import abspath, exists
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
import six
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
from .configuration_utils import PretrainedConfig
@ -939,7 +938,7 @@ def pipeline(
modelcard = config
# Instantiate tokenizer if needed
if isinstance(tokenizer, six.string_types):
if isinstance(tokenizer, str):
tokenizer = AutoTokenizer.from_pretrained(tokenizer)
# Instantiate config if needed

View File

@ -13,15 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import unicodedata
from shutil import copyfile
import six
from .tokenization_utils import PreTrainedTokenizer
@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if six.PY2 and isinstance(outputs, str):
outputs = outputs.decode("utf-8")
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
return outputs
def _tokenize(self, text, return_unicode=True, sample=False):
""" Tokenize a string.
return_unicode is used only for py2
"""
def _tokenize(self, text, sample=False):
""" Tokenize a string. """
text = self.preprocess_text(text)
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
if six.PY2 and isinstance(text, unicode): # noqa: F821
text = text.encode("utf-8")
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
else:
new_pieces.append(piece)
# note(zhiliny): convert back to unicode for py2
if six.PY2 and return_unicode:
ret_pieces = []
for piece in new_pieces:
if isinstance(piece, str):
piece = piece.decode("utf-8")
ret_pieces.append(piece)
new_pieces = ret_pieces
return new_pieces
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index, return_unicode=True):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
token = self.sp_model.IdToPiece(index)
if six.PY2 and return_unicode and isinstance(token, str):
token = token.decode("utf-8")
return token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -14,13 +14,11 @@
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
from io import open
from .tokenization_utils import PreTrainedTokenizer
@ -203,11 +201,11 @@ class BertTokenizer(PreTrainedTokenizer):
return split_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):

View File

@ -14,15 +14,12 @@
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
import six
from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
@ -195,10 +192,7 @@ class MecabTokenizer(object):
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
if six.PY2:
mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
else:
mecab_output = self.mecab.parse(text)
mecab_output = self.mecab.parse(text)
cursor = 0
for line in mecab_output.split("\n"):

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
return self.sp_model.EncodeAsPieces(text)
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
elif self.sp_model.PieceToId(token) == 0:
@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
return self.fairseq_offset + self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)

View File

@ -13,12 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Salesforce CTRL."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import os
from io import open
import regex as re
@ -204,11 +203,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
return split_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):

View File

@ -14,7 +14,6 @@
# limitations under the License.
"""Tokenization classes for DistilBERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging

View File

@ -13,28 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import os
import sys
from io import open
from functools import lru_cache
import regex as re
from .tokenization_utils import PreTrainedTokenizer
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
@ -80,7 +70,6 @@ def bytes_to_unicode():
This is a signficant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
"""
_chr = unichr if sys.version_info[0] == 2 else chr # noqa: F821
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
@ -91,7 +80,7 @@ def bytes_to_unicode():
bs.append(b)
cs.append(2 ** 8 + n)
n += 1
cs = [_chr(n) for n in cs]
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
@ -212,23 +201,18 @@ class GPT2Tokenizer(PreTrainedTokenizer):
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = "".join(
self.byte_encoder[ord(b)] for b in token
) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
else:
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def convert_tokens_to_string(self, tokens):
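The GPT-2 tokenizer hunks drop the `unichr` shim and the dummy `lru_cache`, leaving the Python 3 builtins. A condensed sketch of the byte-to-character table those changes simplify (named `bytes_to_chars` here to mark it as an illustration of the technique, not the library function itself):

```python
def bytes_to_chars():
    # Map every possible UTF-8 byte to a printable unicode character so the
    # byte-level BPE never has to handle raw control bytes.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))  # chr() is the Python 3 builtin

byte_encoder = bytes_to_chars()
print("".join(byte_encoder[b] for b in "héllo".encode("utf-8")))
```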

View File

@ -13,13 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import os
import re
from io import open
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer
@ -177,7 +176,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
return split_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):

View File

@ -13,22 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from .tokenization_gpt2 import GPT2Tokenizer
try:
from functools import lru_cache
except ImportError:
# Just a dummy decorator to get the checks to run on python2
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
def lru_cache():
return lambda func: func
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {

View File

@ -14,15 +14,12 @@
# limitations under the License.
""" Tokenization class for model T5."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import re
from shutil import copyfile
import six
from .tokenization_utils import PreTrainedTokenizer
@ -138,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, return_unicode=True, sample=False):
def _tokenize(self, text, sample=False):
""" Take as input a string and return a list of strings (tokens) for words/sub-words
"""
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
# convert back to unicode for py2
if six.PY2 and return_unicode:
ret_pieces = []
for piece in pieces:
if isinstance(piece, str):
piece = piece.decode("utf-8")
ret_pieces.append(piece)
pieces = ret_pieces
return pieces
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
if token.startswith("<extra_id_"):
match = re.match(r"<extra_id_(\d+)>", token)
num = int(match.group(1))
return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index, return_unicode=True):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
if index < self.sp_model.get_piece_size():
token = self.sp_model.IdToPiece(index)
else:
token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
if six.PY2 and return_unicode and isinstance(token, str):
token = token.decode("utf-8")
return token
def convert_tokens_to_string(self, tokens):

View File

@ -16,14 +16,13 @@
""" Tokenization classes for Transformer XL model.
Adapted from https://github.com/kimiyoung/transformer-xl.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import glob
import logging
import os
import sys
import pickle
from collections import Counter, OrderedDict
from io import open
import numpy as np
@ -36,11 +35,6 @@ try:
except ImportError:
pass
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logger = logging.getLogger(__name__)
@ -238,7 +232,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return self.idx2sym[idx]
def _convert_token_to_id(self, sym):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
if sym in self.sym2idx:
return self.sym2idx[sym]
else:

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import copy
import itertools
@ -21,9 +21,6 @@ import json
import logging
import os
import re
from io import open
import six
from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
@ -251,11 +248,9 @@ class PreTrainedTokenizer(object):
for key, value in kwargs.items():
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)) and all(
isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821
)
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
else:
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821
assert isinstance(value, str)
setattr(self, key, value)
@classmethod
@ -567,7 +562,7 @@ class PreTrainedTokenizer(object):
to_add_tokens = []
for token in new_tokens:
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) # noqa: F821
assert isinstance(token, str)
if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
token = token.lower()
if (
@ -649,12 +644,10 @@ class PreTrainedTokenizer(object):
for key, value in special_tokens_dict.items():
assert key in self.SPECIAL_TOKENS_ATTRIBUTES
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)) and all(
isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821
)
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
added_tokens += self.add_tokens(value)
else:
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821
assert isinstance(value, str)
added_tokens += self.add_tokens([value])
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
setattr(self, key, value)
@ -740,13 +733,13 @@ class PreTrainedTokenizer(object):
raise NotImplementedError
def convert_tokens_to_ids(self, tokens):
""" Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
""" Converts a single token, or a sequence of tokens, (str) in a single integer id
(resp. a sequence of ids), using the vocabulary.
"""
if tokens is None:
return None
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): # noqa: F821
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
ids = []
@ -901,9 +894,9 @@ class PreTrainedTokenizer(object):
"""
def get_input_ids(text):
if isinstance(text, six.string_types):
if isinstance(text, str):
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
return self.convert_tokens_to_ids(text)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
@ -1297,7 +1290,7 @@ class PreTrainedTokenizer(object):
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
""" Converts a single index or a sequence of indices (integers) in a token "
(resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
(resp.) a sequence of tokens (str), using the vocabulary and added tokens.
Args:
skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
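After the switch from `six.string_types` to plain `str`, the input dispatch in `get_input_ids` reads as straightforward isinstance checks. A self-contained toy version of that dispatch (the tokenizer and vocabulary here are stand-ins, not the library's):

```python
def to_ids(text, tokenize, convert):
    if isinstance(text, str):
        return convert(tokenize(text))  # raw string
    if isinstance(text, (list, tuple)) and text and isinstance(text[0], str):
        return convert(list(text))      # pre-tokenized
    if isinstance(text, (list, tuple)) and text and isinstance(text[0], int):
        return list(text)               # already ids
    raise ValueError("Input must be a string, a list of tokens, or a list of ids.")

vocab = {"hello": 7, "world": 8}
print(to_ids("hello world", str.split, lambda toks: [vocab[t] for t in toks]))  # [7, 8]
```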

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for XLM."""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
@ -21,7 +21,6 @@ import os
import re
import sys
import unicodedata
from io import open
import sacremoses as sm
@ -798,11 +797,11 @@ class XLMTokenizer(PreTrainedTokenizer):
return split_tokens
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for XLM-RoBERTa model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return self.sp_model.EncodeAsPieces(text)
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
if token in self.fairseq_tokens_to_ids:
return self.fairseq_tokens_to_ids[token]
return self.sp_model.PieceToId(token) + self.fairseq_offset
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
"""Converts an index (integer) in a token (str) using the vocab."""
if index in self.fairseq_ids_to_tokens:
return self.fairseq_ids_to_tokens[index]
return self.sp_model.IdToPiece(index - self.fairseq_offset)

View File

@ -13,15 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for XLNet model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import unicodedata
from shutil import copyfile
import six
from .tokenization_utils import PreTrainedTokenizer
@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if six.PY2 and isinstance(outputs, str):
outputs = outputs.decode("utf-8")
if not self.keep_accents:
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
return outputs
def _tokenize(self, text, return_unicode=True, sample=False):
""" Tokenize a string.
return_unicode is used only for py2
"""
def _tokenize(self, text, sample=False):
""" Tokenize a string. """
text = self.preprocess_text(text)
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
if six.PY2 and isinstance(text, unicode): # noqa: F821
text = text.encode("utf-8")
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
else:
new_pieces.append(piece)
# note(zhiliny): convert back to unicode for py2
if six.PY2 and return_unicode:
ret_pieces = []
for piece in new_pieces:
if isinstance(piece, str):
piece = piece.decode("utf-8")
ret_pieces.append(piece)
new_pieces = ret_pieces
return new_pieces
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
""" Converts a token (str) in an id using the vocab. """
return self.sp_model.PieceToId(token)
def _convert_id_to_token(self, index, return_unicode=True):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
token = self.sp_model.IdToPiece(index)
if six.PY2 and return_unicode and isinstance(token, str):
token = token.decode("utf-8")
return token
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" Finetuning the library models for task XXX."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
@ -156,7 +155,7 @@ def train(args, train_dataset, model, tokenizer):
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):

Some files were not shown because too many files have changed in this diff.