Mirror of https://github.com/huggingface/transformers.git (synced 2025-08-02 19:21:31 +06:00)

Merge pull request #2270 from aaugustin/remove-python-2
Remove support for Python 2

Commit ce50305e5b
@@ -1,6 +1,6 @@
 version: 2
 jobs:
-    run_tests_py3_torch_and_tf:
+    run_tests_torch_and_tf:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
@@ -17,7 +17,7 @@ jobs:
             - run: sudo pip install tensorboardX scikit-learn
             - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
             - run: codecov
-    run_tests_py3_torch:
+    run_tests_torch:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
@@ -33,7 +33,7 @@ jobs:
             - run: sudo pip install tensorboardX scikit-learn
             - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
             - run: codecov
-    run_tests_py3_tf:
+    run_tests_tf:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
@@ -49,7 +49,7 @@ jobs:
             - run: sudo pip install tensorboardX scikit-learn
             - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
             - run: codecov
-    run_tests_py3_custom_tokenizers:
+    run_tests_custom_tokenizers:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
@@ -59,7 +59,7 @@ jobs:
             - run: sudo pip install pytest pytest-xdist
             - run: sudo pip install mecab-python3
             - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
-    run_examples_py3_torch:
+    run_examples_torch:
         working_directory: ~/transformers
         docker:
             - image: circleci/python:3.5
@@ -121,9 +121,9 @@ workflows:
        jobs:
            - check_code_quality
            - check_repository_consistency
-            - run_examples_py3_torch
-            - run_tests_py3_custom_tokenizers
-            - run_tests_py3_torch_and_tf
-            - run_tests_py3_torch
-            - run_tests_py3_tf
+            - run_examples_torch
+            - run_tests_custom_tokenizers
+            - run_tests_torch_and_tf
+            - run_tests_torch
+            - run_tests_tf
            - deploy_doc: *workflow_filters
@@ -64,7 +64,7 @@ Choose the right framework for every part of a model's lifetime

 ## Installation

-This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+), PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1
+This repo is tested on Python 3.5+, PyTorch 1.0.0+ and TensorFlow 2.0.0-rc1

 ### With pip
@@ -1,6 +1,6 @@
 # Installation

-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
+Transformers is tested on Python 3.5+ and PyTorch 1.1.0

 ## With pip
@@ -44,7 +44,7 @@ By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to `

 ## OpenAI GPT original tokenization workflow

-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:

 ``` bash
 pip install spacy ftfy==4.4.3
 ```
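For readers who want to see what that preprocessing looks like in practice, here is a minimal, hedged sketch (not part of the diff; the blank English pipeline and example string are illustrative only):

```python
# Rough illustration of combining ftfy + spaCy before BPE, as the note above
# describes. spacy.blank("en") is an assumption; any English pipeline would do.
import ftfy
import spacy

nlp = spacy.blank("en")  # tokenizer-only pipeline, no model download needed

def clean_and_tokenize(text):
    text = ftfy.fix_text(text)              # repair mojibake / odd unicode
    return [tok.text for tok in nlp(text)]  # whitespace/punctuation tokenization

print(clean_and_tokenize("Donâ€™t worry, itâ€™s fine."))
```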
@@ -16,7 +16,7 @@
 """BERT finetuning runner.
 Finetuning the library models for multiple choice on SWAG (Bert).
 """
-from __future__ import absolute_import, division, print_function

 import argparse
 import csv
@@ -24,7 +24,6 @@ import glob
 import logging
 import os
 import random
-import sys

 import numpy as np
 import torch
@@ -104,12 +103,7 @@ class InputFeatures(object):

 def read_swag_examples(input_file, is_training=True):
     with open(input_file, "r", encoding="utf-8") as f:
-        reader = csv.reader(f)
-        lines = []
-        for line in reader:
-            if sys.version_info[0] == 2:
-                line = list(unicode(cell, "utf-8") for cell in line)  # noqa: F821
-            lines.append(line)
+        lines = list(csv.reader(f))

     if is_training and lines[0][-1] != "label":
         raise ValueError("For training, the input file must contain a label column.")
@@ -347,7 +341,7 @@ def train(args, train_dataset, model, tokenizer):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
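The `read_swag_examples` change above follows a pattern repeated throughout the PR. As a minimal sketch (the file path is hypothetical), the Python 3-only reading boils down to:

```python
# On Python 3, open(..., encoding="utf-8") already yields str rows, so the old
# sys.version_info / unicode() branch is no longer needed.
import csv

def read_rows(path="train.csv"):  # illustrative path
    with open(path, "r", encoding="utf-8") as f:
        return list(csv.reader(f))
```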
@@ -19,7 +19,7 @@

 This script with default values evaluates a pretrained Transformer-XL on WikiText 103
 """
-from __future__ import absolute_import, division, print_function, unicode_literals

 import argparse
 import logging
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ This is the exact same script as `examples/run_squad.py` (as of 2019, October 4th) with an additional and optional step of distillation."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -160,7 +159,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
     tr_loss, logging_loss = 0.0, 0.0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -165,7 +164,7 @@ def train(args, train_dataset, model, tokenizer, criterion):
     best_f1, n_no_improve = 0, 0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -16,7 +16,7 @@
 # limitations under the License.
 """ Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
 """
-from __future__ import absolute_import, division, print_function, unicode_literals

 import argparse
 import logging
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -186,7 +185,7 @@ def train(args, train_dataset, model, tokenizer):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -19,7 +19,6 @@ GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while B
 using a masked language modeling (MLM) loss.
 """

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -282,7 +281,7 @@ def train(args, train_dataset, model, tokenizer):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproducibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -146,7 +145,7 @@ def train(args, train_dataset, model, tokenizer):
     best_steps = 0
     model.zero_grad()
     train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -170,7 +169,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -186,7 +185,7 @@ def train(args, train_dataset, model, tokenizer):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    # Added here for reproductibility (even between python 2 and 3)
+    # Added here for reproductibility
     set_seed(args)

     for _ in train_iterator:
@@ -16,7 +16,6 @@
 """ Finetuning multi-lingual models on XNLI (Bert, DistilBERT, XLM).
     Adapted from `examples/run_glue.py`"""

-from __future__ import absolute_import, division, print_function

 import argparse
 import glob
@@ -165,7 +164,7 @@ def train(args, train_dataset, model, tokenizer):
     train_iterator = trange(
         epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
     )
-    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    set_seed(args)  # Added here for reproductibility
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
@@ -94,7 +94,7 @@ def process_story(raw_story):


 def _add_missing_period(line):
-    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
+    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', "\u2019", "\u2019", ")"]
     if line.startswith("@highlight"):
         return line
     if line[-1] in END_TOKENS:
@@ -12,25 +12,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import, division, print_function

 import argparse
 import logging
 import sys
 import unittest
+from unittest.mock import patch

 import run_generation
 import run_glue
 import run_squad


-try:
-    # python 3.4+ can use builtin unittest.mock instead of mock package
-    from unittest.mock import patch
-except ImportError:
-    from mock import patch
-

 logging.basicConfig(level=logging.DEBUG)

 logger = logging.getLogger()
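Since Python 3.3, `unittest.mock` ships with the standard library, which is why the fallback to the external `mock` package can be dropped. A short, hedged usage sketch (the patched command line is illustrative):

```python
import sys
from unittest.mock import patch  # always available on Python 3

# Temporarily replace sys.argv, the same trick the example tests rely on.
with patch.object(sys, "argv", ["run_glue.py", "--help"]):
    print(sys.argv)
```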
@@ -15,15 +15,12 @@
 # limitations under the License.
 """ Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """

-from __future__ import absolute_import, division, print_function

 import csv
 import glob
 import json
 import logging
 import os
-import sys
-from io import open
 from typing import List

 import tqdm
@@ -180,13 +177,7 @@ class SwagProcessor(DataProcessor):

     def _read_csv(self, input_file):
         with open(input_file, "r", encoding="utf-8") as f:
-            reader = csv.reader(f)
-            lines = []
-            for line in reader:
-                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, "utf-8") for cell in line)  # noqa: F821
-                lines.append(line)
-            return lines
+            return list(csv.reader(f))

     def _create_examples(self, lines: List[List[str]], type: str):
         """Creates examples for the training and dev sets."""
@@ -15,11 +15,9 @@
 # limitations under the License.
 """ Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """

-from __future__ import absolute_import, division, print_function

 import logging
 import os
-from io import open


 logger = logging.getLogger(__name__)
setup.py
@@ -14,7 +14,7 @@ To create the package for pypi.
    creating the wheel and the source distribution (obviously).

    For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
-   (this will build a wheel for the python version you use to build it - make sure you use python 3.x).
+   (this will build a wheel for the python version you use to build it).

    For the sources, run: "python setup.py sdist"
    You should now have a /dist directory with both .whl and .tar.gz source versions.
@@ -33,7 +33,6 @@ To create the package for pypi.
 7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.

 """
-from io import open

 from setuptools import find_packages, setup
@@ -34,8 +34,8 @@ class ANSI:
     Helper for en.wikipedia.org/wiki/ANSI_escape_code
     """

-    _bold = u"\u001b[1m"
-    _reset = u"\u001b[0m"
+    _bold = "\u001b[1m"
+    _reset = "\u001b[0m"

     @classmethod
     def bold(cls, s):
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Auto Model class. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" BERT model configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" CamemBERT configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Salesforce CTRL configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" DistilBERT model configuration """
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" OpenAI GPT-2 configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" MMBT configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" OpenAI GPT configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" RoBERTa configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" T5 model configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" Transformer XL configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,13 +15,11 @@
|
||||
# limitations under the License.
|
||||
""" Configuration base class and utilities."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
|
||||
from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" XLM configuration """
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" XLM-RoBERTa configuration """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" XLNet configuration """
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Convert ALBERT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Convert BERT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@ -14,11 +14,9 @@
|
||||
# limitations under the License.
|
||||
"""Convert OpenAI GPT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
|
@ -14,11 +14,9 @@
|
||||
# limitations under the License.
|
||||
"""Convert OpenAI GPT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Convert pytorch checkpoints to TensorFlow """
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Convert RoBERTa checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Convert T5 checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@ -14,13 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""Convert Transformer XL checkpoint and datasets."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
|
||||
@ -35,12 +34,6 @@ from transformers import (
|
||||
from transformers.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
|
||||
|
||||
|
||||
if sys.version_info[0] == 2:
|
||||
import cPickle as pickle
|
||||
else:
|
||||
import pickle
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
# We do this to be able to load python 2 datasets pickles
|
||||
|
@ -14,12 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""Convert OpenAI GPT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from io import open
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Convert BERT checkpoint."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@@ -14,7 +14,6 @@ import logging
 import math
 import re
 import string
-from io import open

 from transformers.tokenization_bert import BasicTokenizer

@@ -18,7 +18,6 @@ import copy
 import csv
 import json
 import logging
-import sys

 from ...file_utils import is_tf_available, is_torch_available

@@ -98,13 +97,7 @@ class DataProcessor(object):
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""
         with open(input_file, "r", encoding="utf-8-sig") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                if sys.version_info[0] == 2:
-                    line = list(unicode(cell, "utf-8") for cell in line)  # noqa: F821
-                lines.append(line)
-            return lines
+            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))


 class SingleSentenceClassificationProcessor(DataProcessor):
@@ -15,7 +15,6 @@
 # limitations under the License.
 """ XNLI utils (dataset loading and evaluation) """

-from __future__ import absolute_import, division, print_function

 import logging
 import os
@@ -3,7 +3,7 @@ Utilities for working with the local dataset cache.
 This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
 Copyright by the AllenNLP authors.
 """
-from __future__ import absolute_import, division, print_function, unicode_literals

 import fnmatch
 import json
@@ -14,11 +14,10 @@ import tempfile
 from contextlib import contextmanager
 from functools import partial, wraps
 from hashlib import sha256
-from io import open
+from urllib.parse import urlparse

 import boto3
 import requests
-import six
 from botocore.config import Config
 from botocore.exceptions import ClientError
 from filelock import FileLock
@@ -66,10 +65,6 @@ except ImportError:
     )
 default_cache_path = os.path.join(torch_cache_home, "transformers")

-try:
-    from urllib.parse import urlparse
-except ImportError:
-    from urlparse import urlparse

 try:
     from pathlib import Path
@@ -107,36 +102,20 @@ def is_tf_available():
     return _tf_available


-if not six.PY2:
-
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = "".join(docstr) + fn.__doc__
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            fn.__doc__ = fn.__doc__ + "".join(docstr)
-            return fn
-
-        return docstring_decorator
-
-
-else:
-    # Not possible to update class docstrings on python2
-    def add_start_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
-
-    def add_end_docstrings(*docstr):
-        def docstring_decorator(fn):
-            return fn
-
-        return docstring_decorator
+def add_start_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = "".join(docstr) + fn.__doc__
+        return fn
+
+    return docstring_decorator
+
+
+def add_end_docstrings(*docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + "".join(docstr)
+        return fn
+
+    return docstring_decorator


 def is_remote_url(url_or_filename):
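To make the simplified decorator concrete, here is a hedged usage sketch (the decorated function is hypothetical); with Python 2 gone, `fn.__doc__` is always writable, so a single implementation covers every case:

```python
def add_start_docstrings(*docstr):
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + fn.__doc__  # prepend the shared text
        return fn

    return docstring_decorator


@add_start_docstrings("Shared introduction. ")
def forward(x):
    """Model-specific details."""
    return x

print(forward.__doc__)  # -> "Shared introduction. Model-specific details."
```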
@@ -183,7 +162,7 @@ def filename_to_url(filename, cache_dir=None):
     """
     if cache_dir is None:
         cache_dir = TRANSFORMERS_CACHE
-    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+    if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)

     cache_path = os.path.join(cache_dir, filename)
@@ -218,9 +197,9 @@ def cached_path(
     """
     if cache_dir is None:
         cache_dir = TRANSFORMERS_CACHE
-    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
+    if isinstance(url_or_filename, Path):
         url_or_filename = str(url_or_filename)
-    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
+    if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)

     if is_remote_url(url_or_filename):
@@ -297,7 +276,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None):
     ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
     if isinstance(user_agent, dict):
         ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
-    elif isinstance(user_agent, six.string_types):
+    elif isinstance(user_agent, str):
         ua += "; " + user_agent
     headers = {"user-agent": ua}
     if resume_size > 0:
@@ -331,9 +310,7 @@ def get_from_cache(
     """
     if cache_dir is None:
         cache_dir = TRANSFORMERS_CACHE
-    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
-        cache_dir = str(cache_dir)
-    if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
+    if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)

     if not os.path.exists(cache_dir):
@@ -352,8 +329,6 @@ def get_from_cache(
     except (EnvironmentError, requests.exceptions.Timeout):
         etag = None

-    if sys.version_info[0] == 2 and etag is not None:
-        etag = etag.decode("utf-8")
     filename = url_to_filename(url, etag)

     # get cache path to put the file
@@ -417,9 +392,6 @@ def get_from_cache(
             meta = {"url": url, "etag": etag}
             meta_path = cache_path + ".json"
             with open(meta_path, "w") as meta_file:
-                output_string = json.dumps(meta)
-                if sys.version_info[0] == 2 and isinstance(output_string, str):
-                    output_string = unicode(output_string, "utf-8")  # noqa: F821
-                meta_file.write(output_string)
+                json.dump(meta, meta_file)

     return cache_path
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import, division, print_function

 import io
 import os
@@ -20,7 +20,6 @@ from os.path import expanduser
 from typing import List

 import requests
-import six
 from tqdm import tqdm

@@ -28,14 +27,7 @@ ENDPOINT = "https://huggingface.co"


 class S3Obj:
-    def __init__(
-        self,
-        filename,  # type: str
-        LastModified,  # type: str
-        ETag,  # type: str
-        Size,  # type: int
-        **kwargs
-    ):
+    def __init__(self, filename: str, LastModified: str, ETag: str, Size: int, **kwargs):
         self.filename = filename
         self.LastModified = LastModified
         self.ETag = ETag
@@ -43,13 +35,7 @@ class S3Obj:


 class PresignedUrl:
-    def __init__(
-        self,
-        write,  # type: str
-        access,  # type: str
-        type,  # type: str
-        **kwargs
-    ):
+    def __init__(self, write: str, access: str, type: str, **kwargs):
         self.write = write
         self.access = access
         self.type = type  # mime-type to send to S3.
@@ -59,12 +45,7 @@ class HfApi:
     def __init__(self, endpoint=None):
         self.endpoint = endpoint if endpoint is not None else ENDPOINT

-    def login(
-        self,
-        username,  # type: str
-        password,  # type: str
-    ):
-        # type: (...) -> str
+    def login(self, username: str, password: str) -> str:
         """
         Call HF API to sign in a user and get a token if credentials are valid.
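The signature rewrites in this file are purely syntactic. A side-by-side sketch (illustrative function names, not from the diff) of the two equivalent declarations:

```python
# Python 2-compatible type comment (old style)
def login_old(username, password):
    # type: (str, str) -> str
    return "token"

# Inline annotations (new style, Python 3 only)
def login_new(username: str, password: str) -> str:
    return "token"
```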
@@ -80,10 +61,7 @@ class HfApi:
         d = r.json()
         return d["token"]

-    def whoami(
-        self, token,  # type: str
-    ):
-        # type: (...) -> str
+    def whoami(self, token: str) -> str:
         """
         Call HF API to know "whoami"
         """
@@ -93,8 +71,7 @@ class HfApi:
         d = r.json()
         return d["user"]

-    def logout(self, token):
-        # type: (...) -> None
+    def logout(self, token: str) -> None:
         """
         Call HF API to log out.
         """
@@ -102,19 +79,17 @@ class HfApi:
         r = requests.post(path, headers={"authorization": "Bearer {}".format(token)})
         r.raise_for_status()

-    def presign(self, token, filename):
-        # type: (...) -> PresignedUrl
+    def presign(self, token: str, filename) -> PresignedUrl:
         """
         Call HF API to get a presigned url to upload `filename` to S3.
         """
         path = "{}/api/presign".format(self.endpoint)
-        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename},)
+        r = requests.post(path, headers={"authorization": "Bearer {}".format(token)}, json={"filename": filename})
         r.raise_for_status()
         d = r.json()
         return PresignedUrl(**d)

-    def presign_and_upload(self, token, filename, filepath):
-        # type: (...) -> str
+    def presign_and_upload(self, token: str, filename, filepath) -> str:
         """
         Get a presigned url, then upload file to S3.
@@ -158,13 +133,10 @@ class TqdmProgressFileReader:

     def __init__(self, f: io.BufferedReader):
         self.f = f
-        self.total_size = os.fstat(f.fileno()).st_size  # type: int
+        self.total_size = os.fstat(f.fileno()).st_size
         self.pbar = tqdm(total=self.total_size, leave=False)
-        if six.PY3:
-            # does not work unless PY3
-            # no big deal as the CLI does not currently support PY2 anyways.
-            self.read = f.read
-            f.read = self._read
+        self.read = f.read
+        f.read = self._read

     def _read(self, n=-1):
         self.pbar.update(n)
@@ -182,16 +154,7 @@ class HfFolder:
         """
         Save token, creating folder as needed.
         """
-        if six.PY3:
-            os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
-        else:
-            # Python 2
-            try:
-                os.makedirs(os.path.dirname(cls.path_token))
-            except OSError as e:
-                if e.errno != os.errno.EEXIST:
-                    raise e
-                pass
+        os.makedirs(os.path.dirname(cls.path_token), exist_ok=True)
         with open(cls.path_token, "w+") as f:
             f.write(token)
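The `save_token` simplification relies on `exist_ok=True`, available since Python 3.2. A minimal sketch with an illustrative path and token value:

```python
import os

path_token = os.path.expanduser("~/.huggingface/token")  # illustrative location
os.makedirs(os.path.dirname(path_token), exist_ok=True)  # no manual EEXIST handling needed
with open(path_token, "w+") as f:
    f.write("dummy-token")
```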
@ -14,13 +14,11 @@
|
||||
# limitations under the License.
|
||||
""" Configuration base class and utilities."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import copy
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
|
||||
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
from .file_utils import (
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Auto Model class. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,12 +15,10 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch BERT model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -339,9 +337,7 @@ class BertIntermediate(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertIntermediate, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
||||
if isinstance(config.hidden_act, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.intermediate_act_fn = config.hidden_act
|
||||
@ -461,9 +457,7 @@ class BertPredictionHeadTransform(nn.Module):
|
||||
def __init__(self, config):
|
||||
super(BertPredictionHeadTransform, self).__init__()
|
||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
||||
if isinstance(config.hidden_act, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch CamemBERT model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" PyTorch CTRL model."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
|
||||
and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import copy
|
||||
import logging
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Classes to support Encoder-Decoder architectures """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch OpenAI GPT-2 model."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import math
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch MMBT model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,13 +15,11 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch OpenAI GPT model."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from io import open
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch RoBERTa model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" PyTorch T5 model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
|
@ -14,10 +14,9 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" TF 2.0 ALBERT model. """
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
@ -311,9 +310,7 @@ class TFAlbertLayer(tf.keras.layers.Layer):
|
||||
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn"
|
||||
)
|
||||
|
||||
if isinstance(config.hidden_act, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.activation = config.hidden_act
|
||||
@ -454,9 +451,7 @@ class TFAlbertMLMHead(tf.keras.layers.Layer):
|
||||
self.dense = tf.keras.layers.Dense(
|
||||
config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||
)
|
||||
if isinstance(config.hidden_act, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.activation = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.activation = config.hidden_act
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Auto Model class. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,10 +15,8 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 BERT model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
@ -311,9 +309,7 @@ class TFBertIntermediate(tf.keras.layers.Layer):
|
||||
self.dense = tf.keras.layers.Dense(
|
||||
config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||
)
|
||||
if isinstance(config.hidden_act, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.intermediate_act_fn = config.hidden_act
|
||||
@ -418,9 +414,7 @@ class TFBertPredictionHeadTransform(tf.keras.layers.Layer):
|
||||
self.dense = tf.keras.layers.Dense(
|
||||
config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
|
||||
)
|
||||
if isinstance(config.hidden_act, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.hidden_act, str):
|
||||
self.transform_act_fn = ACT2FN[config.hidden_act]
|
||||
else:
|
||||
self.transform_act_fn = config.hidden_act
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 CTRL model."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 DistilBERT model
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
import math
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 OpenAI GPT-2 model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 OpenAI GPT model."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" PyTorch - TF 2.0 general utilities."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 RoBERTa model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 T5 model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
|
@ -16,7 +16,6 @@
|
||||
""" TF 2.0 Transformer XL model.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""TF general model utils."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 XLM model.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
|
@ -15,10 +15,9 @@
|
||||
# limitations under the License.
|
||||
""" TF 2.0 XLNet model.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
@ -290,9 +289,7 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
|
||||
config.d_model, kernel_initializer=get_initializer(config.initializer_range), name="layer_2"
|
||||
)
|
||||
self.dropout = tf.keras.layers.Dropout(config.dropout)
|
||||
if isinstance(config.ff_activation, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.ff_activation, str):
|
||||
self.activation_function = ACT2FN[config.ff_activation]
|
||||
else:
|
||||
self.activation_function = config.ff_activation
|
||||
|
@ -18,7 +18,6 @@
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch BERT model."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
@ -14,7 +14,7 @@
|
||||
# limitations under the License.
|
||||
""" PyTorch XLM model.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
|
@ -15,7 +15,6 @@
|
||||
# limitations under the License.
|
||||
"""PyTorch XLM-RoBERTa model. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -15,11 +15,10 @@
|
||||
# limitations under the License.
|
||||
""" PyTorch XLNet model.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@ -420,9 +419,7 @@ class XLNetFeedForward(nn.Module):
|
||||
self.layer_1 = nn.Linear(config.d_model, config.d_inner)
|
||||
self.layer_2 = nn.Linear(config.d_inner, config.d_model)
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
if isinstance(config.ff_activation, str) or (
|
||||
sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode) # noqa: F821
|
||||
):
|
||||
if isinstance(config.ff_activation, str):
|
||||
self.activation_function = ACT2FN[config.ff_activation]
|
||||
else:
|
||||
self.activation_function = config.ff_activation
|
||||
|
@ -14,7 +14,6 @@
|
||||
# ==============================================================================
|
||||
"""Functions and classes related to optimization (weight updates)."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import re
|
||||
|
||||
|
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import absolute_import, division, print_function, unicode_literals

 import csv
 import json
@@ -26,7 +26,6 @@ from os.path import abspath, exists
 from typing import Dict, List, Optional, Tuple, Union

 import numpy as np
-import six

 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
 from .configuration_utils import PretrainedConfig
@@ -939,7 +938,7 @@ def pipeline(
         modelcard = config

     # Instantiate tokenizer if needed
-    if isinstance(tokenizer, six.string_types):
+    if isinstance(tokenizer, str):
         tokenizer = AutoTokenizer.from_pretrained(tokenizer)

     # Instantiate config if needed
@ -13,15 +13,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Tokenization classes for ALBERT model."""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from shutil import copyfile
|
||||
|
||||
import six
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
|
||||
@ -139,9 +137,6 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
outputs = inputs
|
||||
outputs = outputs.replace("``", '"').replace("''", '"')
|
||||
|
||||
if six.PY2 and isinstance(outputs, str):
|
||||
outputs = outputs.decode("utf-8")
|
||||
|
||||
if not self.keep_accents:
|
||||
outputs = unicodedata.normalize("NFKD", outputs)
|
||||
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
|
||||
@ -150,14 +145,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
|
||||
return outputs
|
||||
|
||||
def _tokenize(self, text, return_unicode=True, sample=False):
|
||||
""" Tokenize a string.
|
||||
return_unicode is used only for py2
|
||||
"""
|
||||
def _tokenize(self, text, sample=False):
|
||||
""" Tokenize a string. """
|
||||
text = self.preprocess_text(text)
|
||||
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
|
||||
if six.PY2 and isinstance(text, unicode): # noqa: F821
|
||||
text = text.encode("utf-8")
|
||||
|
||||
if not sample:
|
||||
pieces = self.sp_model.EncodeAsPieces(text)
|
||||
@ -177,27 +167,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
else:
|
||||
new_pieces.append(piece)
|
||||
|
||||
# note(zhiliny): convert back to unicode for py2
|
||||
if six.PY2 and return_unicode:
|
||||
ret_pieces = []
|
||||
for piece in new_pieces:
|
||||
if isinstance(piece, str):
|
||||
piece = piece.decode("utf-8")
|
||||
ret_pieces.append(piece)
|
||||
new_pieces = ret_pieces
|
||||
|
||||
return new_pieces
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
""" Converts a token (str) in an id using the vocab. """
|
||||
return self.sp_model.PieceToId(token)
|
||||
|
||||
def _convert_id_to_token(self, index, return_unicode=True):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
token = self.sp_model.IdToPiece(index)
|
||||
if six.PY2 and return_unicode and isinstance(token, str):
|
||||
token = token.decode("utf-8")
|
||||
return token
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
return self.sp_model.IdToPiece(index)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
""" Auto Model class. """
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -14,13 +14,11 @@
|
||||
# limitations under the License.
|
||||
"""Tokenization classes."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import collections
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from io import open
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
@ -203,11 +201,11 @@ class BertTokenizer(PreTrainedTokenizer):
|
||||
return split_tokens
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
""" Converts a token (str) in an id using the vocab. """
|
||||
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
return self.ids_to_tokens.get(index, self.unk_token)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
|
@@ -14,15 +14,12 @@
 # limitations under the License.
 """Tokenization classes."""

-from __future__ import absolute_import, division, print_function, unicode_literals

 import collections
 import logging
 import os
 import unicodedata

-import six
-
 from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab

@@ -195,10 +192,7 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []

-        if six.PY2:
-            mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
-        else:
-            mecab_output = self.mecab.parse(text)
+        mecab_output = self.mecab.parse(text)

         cursor = 0
         for line in mecab_output.split("\n"):
@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License
|
||||
""" Tokenization classes for Camembert model."""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
@ -155,7 +155,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
return self.sp_model.EncodeAsPieces(text)
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
""" Converts a token (str) in an id using the vocab. """
|
||||
if token in self.fairseq_tokens_to_ids:
|
||||
return self.fairseq_tokens_to_ids[token]
|
||||
elif self.sp_model.PieceToId(token) == 0:
|
||||
@ -164,7 +164,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
return self.fairseq_offset + self.sp_model.PieceToId(token)
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
if index in self.fairseq_ids_to_tokens:
|
||||
return self.fairseq_ids_to_tokens[index]
|
||||
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
||||
|
@ -13,12 +13,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for Salesforce CTRL."""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
|
||||
import regex as re
|
||||
|
||||
@ -204,11 +203,11 @@ class CTRLTokenizer(PreTrainedTokenizer):
|
||||
return split_tokens
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
""" Converts a token (str/unicode) in an id using the vocab. """
|
||||
""" Converts a token (str) in an id using the vocab. """
|
||||
return self.encoder.get(token, self.encoder.get(self.unk_token))
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
return self.decoder.get(index, self.unk_token)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
|
||||
|
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for DistilBERT."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
|
@ -13,28 +13,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for OpenAI GPT."""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from io import open
|
||||
from functools import lru_cache
|
||||
|
||||
import regex as re
|
||||
|
||||
from .tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
|
||||
try:
|
||||
from functools import lru_cache
|
||||
except ImportError:
|
||||
# Just a dummy decorator to get the checks to run on python2
|
||||
# because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
|
||||
def lru_cache():
|
||||
return lambda func: func
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
VOCAB_FILES_NAMES = {
|
||||
@ -80,7 +70,6 @@ def bytes_to_unicode():
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr  # noqa: F821
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
@ -91,7 +80,7 @@ def bytes_to_unicode():
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


@ -212,23 +201,18 @@ class GPT2Tokenizer(PreTrainedTokenizer):

        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = "".join(
                    self.byte_encoder[ord(b)] for b in token
                )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            else:
                token = "".join(
                    self.byte_encoder[b] for b in token.encode("utf-8")
                )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
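Both hunks above revolve around the byte-level BPE trick that `bytes_to_unicode` enables: every UTF-8 byte of the input is remapped to a printable unicode character before the BPE merges run, so no byte can ever fall out of vocabulary. A standalone, illustrative re-creation of that remapping (written for this note, mirroring the logic shown in the diff rather than copied from the library):

```python
# Standalone sketch of the byte -> unicode remapping; names are chosen for illustration.
def bytes_to_unicode_sketch():
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:
            bs.append(b)
            cs.append(2 ** 8 + n)
            n += 1
    return dict(zip(bs, [chr(c) for c in cs]))


byte_encoder = bytes_to_unicode_sketch()
mapped = "".join(byte_encoder[b] for b in "hello world".encode("utf-8"))
print(mapped)  # "helloĠworld" -- the space byte is remapped to a visible character
```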
@ -13,13 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import absolute_import, division, print_function, unicode_literals


import json
import logging
import os
import re
from io import open

from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer
@ -177,7 +176,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
@ -13,22 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
from __future__ import absolute_import, division, print_function, unicode_literals


import logging

from .tokenization_gpt2 import GPT2Tokenizer


try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
@ -14,15 +14,12 @@
# limitations under the License.
""" Tokenization class for model T5."""

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os
import re
from shutil import copyfile

import six

from .tokenization_utils import PreTrainedTokenizer


@ -138,41 +135,29 @@ class T5Tokenizer(PreTrainedTokenizer):
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text, return_unicode=True, sample=False):
    def _tokenize(self, text, sample=False):
        """ Take as input a string and return a list of strings (tokens) for words/sub-words
        """
        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)

        # convert back to unicode for py2
        if six.PY2 and return_unicode:
            ret_pieces = []
            for piece in pieces:
                if isinstance(piece, str):
                    piece = piece.decode("utf-8")
                ret_pieces.append(piece)
            pieces = ret_pieces

        return pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        if token.startswith("<extra_id_"):
            match = re.match(r"<extra_id_(\d+)>", token)
            num = int(match.group(1))
            return self.vocab_size - num - 1
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index < self.sp_model.get_piece_size():
            token = self.sp_model.IdToPiece(index)
        else:
            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
        if six.PY2 and return_unicode and isinstance(token, str):
            token = token.decode("utf-8")
        return token

    def convert_tokens_to_string(self, tokens):
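The `<extra_id_N>` branch kept in the hunk above places T5's sentinel tokens at the top of the vocabulary: `<extra_id_0>` maps to `vocab_size - 1`, `<extra_id_1>` to `vocab_size - 2`, and so on, with `_convert_id_to_token` inverting the mapping. A toy illustration of that index arithmetic, under an assumed vocabulary size (not taken from the diff):

```python
import re

vocab_size = 32100  # assumed value, for illustration only


def sentinel_token_to_id(token):
    # mirrors the <extra_id_N> branch of T5Tokenizer._convert_token_to_id
    num = int(re.match(r"<extra_id_(\d+)>", token).group(1))
    return vocab_size - num - 1


def sentinel_id_to_token(index):
    # mirrors the else-branch of T5Tokenizer._convert_id_to_token
    return "<extra_id_{}>".format(vocab_size - 1 - index)


assert sentinel_token_to_id("<extra_id_0>") == vocab_size - 1
assert sentinel_id_to_token(vocab_size - 3) == "<extra_id_2>"
```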
@ -16,14 +16,13 @@
""" Tokenization classes for Transformer XL model.
    Adapted from https://github.com/kimiyoung/transformer-xl.
"""
from __future__ import absolute_import, division, print_function, unicode_literals


import glob
import logging
import os
import sys
import pickle
from collections import Counter, OrderedDict
from io import open

import numpy as np

@ -36,11 +35,6 @@ try:
except ImportError:
    pass

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle


logger = logging.getLogger(__name__)

@ -238,7 +232,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
        return self.idx2sym[idx]

    def _convert_token_to_id(self, sym):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        if sym in self.sym2idx:
            return self.sym2idx[sym]
        else:
@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import absolute_import, division, print_function, unicode_literals


import copy
import itertools
@ -21,9 +21,6 @@ import json
import logging
import os
import re
from io import open

import six

from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available

@ -251,11 +248,9 @@ class PreTrainedTokenizer(object):
        for key, value in kwargs.items():
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)) and all(
                        isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
                    )
                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                else:
                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
                    assert isinstance(value, str)
                setattr(self, key, value)

    @classmethod
@ -567,7 +562,7 @@ class PreTrainedTokenizer(object):

        to_add_tokens = []
        for token in new_tokens:
            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))  # noqa: F821
            assert isinstance(token, str)
            if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
                token = token.lower()
            if (
@ -649,12 +644,10 @@ class PreTrainedTokenizer(object):
        for key, value in special_tokens_dict.items():
            assert key in self.SPECIAL_TOKENS_ATTRIBUTES
            if key == "additional_special_tokens":
                assert isinstance(value, (list, tuple)) and all(
                    isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value  # noqa: F821
                )
                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value)
                added_tokens += self.add_tokens(value)
            else:
                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))  # noqa: F821
                assert isinstance(value, str)
                added_tokens += self.add_tokens([value])
                logger.info("Assigning %s to the %s key of the tokenizer", value, key)
                setattr(self, key, value)
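With the Python 2 `unicode` escape hatches gone, both assertion sites above reduce to plain `isinstance(..., str)` checks. A hedged sketch of the calls they guard (the model name and token strings are placeholders picked for illustration):

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# plain str values satisfy the simplified assertions; bytes no longer get a special path
tokenizer.add_special_tokens({"additional_special_tokens": ["<speaker1>", "<speaker2>"]})
tokenizer.add_tokens(["new_tok"])
```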
@ -740,13 +733,13 @@ class PreTrainedTokenizer(object):
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):
        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
        """ Converts a single token, or a sequence of tokens, (str) in a single integer id
            (resp. a sequence of ids), using the vocabulary.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):  # noqa: F821
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
@ -901,9 +894,9 @@ class PreTrainedTokenizer(object):
        """

        def get_input_ids(text):
            if isinstance(text, six.string_types):
            if isinstance(text, str):
                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
@ -1297,7 +1290,7 @@ class PreTrainedTokenizer(object):

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) in a token "
            (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens.
            (resp.) a sequence of tokens (str), using the vocabulary and added tokens.

            Args:
                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
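After this change, `get_input_ids` distinguishes exactly three Python 3 input shapes: a raw `str` (tokenized then converted), a list or tuple of `str` tokens (converted directly), and a list or tuple of `int` ids (passed through). A short sketch of the corresponding public calls (the model name is a placeholder chosen for illustration):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokens = tokenizer.tokenize("Hello world")          # raw str -> list of str tokens
ids = tokenizer.convert_tokens_to_ids(tokens)       # list of str -> list of int
one_id = tokenizer.convert_tokens_to_ids("hello")   # single str -> single int
back = tokenizer.convert_ids_to_tokens(ids)         # list of int -> list of str

assert back == tokens
```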
@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for XLM."""
from __future__ import absolute_import, division, print_function, unicode_literals


import json
import logging
@ -21,7 +21,6 @@ import os
import re
import sys
import unicodedata
from io import open

import sacremoses as sm

@ -798,11 +797,11 @@ class XLMTokenizer(PreTrainedTokenizer):
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for XLM-RoBERTa model."""
from __future__ import absolute_import, division, print_function, unicode_literals


import logging
import os
@ -171,13 +171,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        return self.sp_model.PieceToId(token) + self.fairseq_offset

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)
@ -13,15 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for XLNet model."""
from __future__ import absolute_import, division, print_function, unicode_literals


import logging
import os
import unicodedata
from shutil import copyfile

import six

from .tokenization_utils import PreTrainedTokenizer


@ -139,9 +137,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if six.PY2 and isinstance(outputs, str):
            outputs = outputs.decode("utf-8")

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
@ -150,14 +145,9 @@ class XLNetTokenizer(PreTrainedTokenizer):

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string.
            return_unicode is used only for py2
        """
    def _tokenize(self, text, sample=False):
        """ Tokenize a string. """
        text = self.preprocess_text(text)
        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
        if six.PY2 and isinstance(text, unicode):  # noqa: F821
            text = text.encode("utf-8")

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
@ -177,27 +167,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
            else:
                new_pieces.append(piece)

        # note(zhiliny): convert back to unicode for py2
        if six.PY2 and return_unicode:
            ret_pieces = []
            for piece in new_pieces:
                if isinstance(piece, str):
                    piece = piece.decode("utf-8")
                ret_pieces.append(piece)
            new_pieces = ret_pieces

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        """ Converts a token (str) in an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        if six.PY2 and return_unicode and isinstance(token, str):
            token = token.decode("utf-8")
        return token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
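Every removal in this file relies on the same fact: under Python 3 the `sentencepiece` bindings accept and return `str` directly, so the `return_unicode` flag and the encode/decode round-trips become dead code. A minimal, hedged sketch (the model file path is a placeholder, not taken from the diff):

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spiece.model")  # placeholder path to an XLNet-style SentencePiece model

pieces = sp.EncodeAsPieces("Hello world")
assert all(isinstance(p, str) for p in pieces)  # already str on Python 3; no .decode needed
```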
@ -14,7 +14,6 @@
# limitations under the License.
""" Finetuning the library models for task XXX."""

from __future__ import absolute_import, division, print_function

import argparse
import glob
@ -156,7 +155,7 @@ def train(args, train_dataset, model, tokenizer):
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
Some files were not shown because too many files have changed in this diff.