[cleanup] remove old scripts in /scripts 🧹 🧹 (#37676)

* rm old files

* not this one
This commit is contained in:
Joao Gante 2025-04-22 16:59:03 +01:00 committed by GitHub
parent 6673081b21
commit 0f8c34b0a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 1 additions and 1919 deletions

View File

@ -1,448 +0,0 @@
#!/usr/bin/env python
# HF Trainer benchmarking tool
#
# This tool can be used to run and compare multiple dimensions of the HF Trainers args.
#
# It then prints a report once in github format with all the information that needs to be shared
# with others and second time in a console-friendly format, so it's easier to use for tuning things up.
#
# The main idea is:
#
# ./trainer-benchmark.py --base-cmd '<cmd args that don't change>' \
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \
# --target-metric-key train_samples_per_second
#
# The variations can be any command line argument that you want to compare and not just dtype as in
# the example.
#
# --variations allows you to compare variations in multiple dimensions.
#
# as the first dimension has 2 options and the second 3 in our example, this will run the trainer 6
# times adding one of:
#
# 1. --tf32 0 --fp16 0
# 2. --tf32 0 --fp16 1
# 3. --tf32 0 --bf16 1
# 4. --tf32 1 --fp16 0
# 5. --tf32 1 --fp16 1
# 6. --tf32 1 --bf16 1
#
# and print the results. This is just a cartesian product - and more than 2 dimensions can be used.
#
# If you want to rely on defaults, this:
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1'
# is identical to this:
# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16'
#
# the leading empty variation in the 2nd dimension is a valid variation.
#
# So here we get the following 6 variations:
#
# 1. --tf32 0
# 2. --tf32 0 --fp16
# 3. --tf32 0 --bf16
# 4. --tf32 1
# 5. --tf32 1 --fp16
# 6. --tf32 1 --bf16
#
# In this particular case we don't know what the default tf32 setting is as it's normally
# pytorch-version dependent). That's why it's best to do an explicit setting of each variation:
# `--tf32 0|--tf32 1`
#
# Here is a full example of a train:
#
# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
# --base-cmd \
# ' examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small \
# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \
# --source_prefix "translate English to Romanian: " --warmup_steps 50 \
# --max_train_samples 20000 --dataloader_num_workers 2 ' \
# --target-metric-key train_samples_per_second --repeat-times 1 --variations \
# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \
# --repeat-times 1 --base-variation '--tf32 0'
#
# and here is a possible output:
#
#
# | Variation | Train | Diff | Train |
# | | samples | % | loss |
# | | per | | |
# | | second | | |
# |:----------------|----------:|-------:|--------:|
# | --tf32 0 | 285.11 | 0 | 2.51 |
# | --tf32 1 | 342.09 | 20 | 2.51 |
# | --fp16 --tf32 0 | 423.49 | 49 | 2.51 |
# | --fp16 --tf32 1 | 423.13 | 48 | 2.51 |
# | --bf16 --tf32 0 | 416.80 | 46 | 2.52 |
# | --bf16 --tf32 1 | 415.87 | 46 | 2.52 |
#
#
# So you can quickly compare the different outcomes.
#
# Typically running each experiment once is enough, but if the environment is unstable you can
# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results.
#
# By default it'll use the lowest result as the base line to use as 100% and then compare the rest to
# it as can be seen from the table above, but you can also specify which combination is the one to use as
# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0'
#
# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are
# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use:
# --target-metric-key eval_samples_per_second
# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as
# well (as currently it doesn't)
#
import argparse
import datetime
import io
import itertools
import json
import math
import os
import platform
import re
import shlex
import subprocess
import sys
from pathlib import Path
from statistics import fmean
import pandas as pd
import torch
from tqdm import tqdm
import transformers
nan = float("nan")
class Tee:
"""
A helper class to tee print's output into a file.
Usage:
sys.stdout = Tee(filename)
"""
def __init__(self, filename):
self.stdout = sys.stdout
self.file = open(filename, "a")
def __getattr__(self, attr):
return getattr(self.stdout, attr)
def write(self, msg):
self.stdout.write(msg)
# strip tqdm codes
self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M))
def get_original_command(max_width=80, full_python_path=False):
"""
Return the original command line string that can be replayed nicely and wrapped for 80 char width.
Args:
max_width (`int`, *optional*, defaults to 80):
The width to wrap for.
full_python_path (`bool`, `optional`, defaults to `False`):
Whether to replicate the full path or just the last segment (i.e. `python`).
"""
cmd = []
# deal with critical env vars
env_keys = ["CUDA_VISIBLE_DEVICES"]
for key in env_keys:
val = os.environ.get(key, None)
if val is not None:
cmd.append(f"{key}={val}")
# python executable (not always needed if the script is executable)
python = sys.executable if full_python_path else sys.executable.split("/")[-1]
cmd.append(python)
# now the normal args
cmd += list(map(shlex.quote, sys.argv))
# split up into up to MAX_WIDTH lines with shell multi-line escapes
lines = []
current_line = ""
while len(cmd) > 0:
current_line += f"{cmd.pop(0)} "
if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1:
lines.append(current_line)
current_line = ""
return "\\\n".join(lines)
def get_base_command(args, output_dir):
# unwrap multi-line input
args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)
# remove --output_dir if any and set our own
args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd)
args.base_cmd += f" --output_dir {output_dir}"
# ensure we have --overwrite_output_dir
args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd)
args.base_cmd += " --overwrite_output_dir"
return [sys.executable] + shlex.split(args.base_cmd)
def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):
# Enable to debug everything but the run itself, to do it fast and see the progress.
# This is useful for debugging the output formatting quickly - we can remove it later once
# everybody is happy with the output
if 0:
import random
from time import sleep
sleep(0)
return dict(
{k: random.uniform(0, 100) for k in metric_keys},
**{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])},
)
result = subprocess.run(cmd, capture_output=True, text=True)
if verbose:
print("STDOUT", result.stdout)
print("STDERR", result.stderr)
# save the streams
prefix = variation.replace(" ", "-")
with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f:
f.write(result.stdout)
with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f:
f.write(result.stderr)
if result.returncode != 0:
if verbose:
print("failed")
return {target_metric_key: nan}
with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f:
metrics = json.load(f)
# filter out just the keys we want
return {k: v for k, v in metrics.items() if k in metric_keys}
def process_run(
id,
cmd,
variation_key,
variation,
longest_variation_len,
target_metric_key,
report_metric_keys,
repeat_times,
output_dir,
verbose,
):
results = []
metrics = []
preamble = f"{id}: {variation:<{longest_variation_len}}"
outcome = f"{preamble}: "
metric_keys = set(report_metric_keys + [target_metric_key])
for i in tqdm(range(repeat_times), desc=preamble, leave=False):
single_run_metrics = process_run_single(
id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose
)
result = single_run_metrics[target_metric_key]
if not math.isnan(result):
metrics.append(single_run_metrics)
results.append(result)
outcome += ""
else:
outcome += ""
outcome = f"\33[2K\r{outcome}"
if len(metrics) > 0:
mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()}
mean_target = round(mean_metrics[target_metric_key], 2)
results_str = f"{outcome} {mean_target}"
if len(metrics) > 1:
results_str += f" {tuple(round(x, 2) for x in results)}"
print(results_str)
mean_metrics[variation_key] = variation
return mean_metrics
else:
print(outcome)
return {variation_key: variation, target_metric_key: nan}
def get_versions():
properties = torch.cuda.get_device_properties(torch.device("cuda"))
return f"""
Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Software:
transformers: {transformers.__version__}
torch : {torch.__version__}
cuda : {torch.version.cuda}
python : {platform.python_version()}
Hardware:
{torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB
"""
def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):
df = pd.DataFrame(results)
variation_key = "variation"
diff_key = "diff_%"
sentinel_value = nan
if base_variation is not None and len(df[df[variation_key] == base_variation]):
# this may still return nan
sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item()
if math.isnan(sentinel_value):
# as a fallback, use the minimal value as the sentinel
sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min()
# create diff column if possible
if not math.isnan(sentinel_value):
df[diff_key] = df.apply(
lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value)
if not math.isnan(r[target_metric_key])
else 0,
axis="columns",
)
# re-order columns
cols = [variation_key, target_metric_key, diff_key, *report_metric_keys]
df = df.reindex(cols, axis="columns") # reorder cols
# capitalize
df = df.rename(str.capitalize, axis="columns")
# make the cols as narrow as possible
df_github = df.rename(lambda c: c.replace("_", "<br>"), axis="columns")
df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns")
report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"]
report += ["----------8<-----------------8<--------"]
report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")]
report += ["```"]
report += ["*** Setup:", get_versions()]
report += ["*** The benchmark command line was:", get_original_command()]
report += ["```"]
report += ["----------8<-----------------8<--------"]
report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")]
print("\n\n".join(report))
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--base-cmd",
default=None,
type=str,
required=True,
help="Base cmd",
)
parser.add_argument(
"--variations",
default=None,
type=str,
nargs="+",
required=True,
help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'",
)
parser.add_argument(
"--base-variation",
default=None,
type=str,
help="Baseline variation to compare to. if None the minimal target value will be used to compare against",
)
parser.add_argument(
"--target-metric-key",
default=None,
type=str,
required=True,
help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second",
)
parser.add_argument(
"--report-metric-keys",
default="",
type=str,
help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples",
)
parser.add_argument(
"--repeat-times",
default=1,
type=int,
help="How many times to re-run each variation - an average will be reported",
)
parser.add_argument(
"--output_dir",
default="output_benchmark",
type=str,
help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked",
)
parser.add_argument(
"--verbose",
default=False,
action="store_true",
help="Whether to show the outputs of each run or just the benchmark progress",
)
args = parser.parse_args()
output_dir = args.output_dir
Path(output_dir).mkdir(exist_ok=True)
base_cmd = get_base_command(args, output_dir)
# split each dimension into its --foo variations
dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations]
# build a cartesian product of dimensions and convert those back into cmd-line arg strings,
# while stripping white space for inputs that were empty
variations = list(map(str.strip, map(" ".join, itertools.product(*dims))))
longest_variation_len = max(len(x) for x in variations)
# split wanted keys
report_metric_keys = args.report_metric_keys.split()
# capture prints into a log file for convenience
report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt")
print(f"and this script's output is also piped into {report_fn}")
sys.stdout = Tee(report_fn)
print(f"\n*** Running {len(variations)} benchmarks:")
print(f"Base command: {' '.join(base_cmd)}")
variation_key = "variation"
results = []
for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
cmd = base_cmd + variation.split()
results.append(
process_run(
id + 1,
cmd,
variation_key,
variation,
longest_variation_len,
args.target_metric_key,
report_metric_keys,
args.repeat_times,
output_dir,
args.verbose,
)
)
process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir)
if __name__ == "__main__":
main()

View File

@ -1,85 +0,0 @@
import time
import torch
from transformers import AutoModel, AutoTokenizer, pipeline
test_sentence = 'Do you [MASK] the muffin man?'
# for comparison
bert = pipeline('fill-mask', model = 'bert-base-uncased')
print('\n'.join([d['sequence'] for d in bert(test_sentence)]))
deberta = pipeline('fill-mask', model = 'microsoft/deberta-v3-base', model_kwargs={"legacy": False})
print('\n'.join([d['sequence'] for d in deberta(test_sentence)]))
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
tokenized_dict = tokenizer(
["Is this working",], ["Not yet",],
return_tensors="pt"
)
deberta.model.forward = torch.compile(deberta.model.forward)
start=time.time()
deberta.model(**tokenized_dict)
end=time.time()
print(end-start)
start=time.time()
deberta.model(**tokenized_dict)
end=time.time()
print(end-start)
start=time.time()
deberta.model(**tokenized_dict)
end=time.time()
print(end-start)
model = AutoModel.from_pretrained('microsoft/deberta-base')
model.config.return_dict = False
model.config.output_hidden_states=False
input_tuple = (tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
start=time.time()
traced_model = torch.jit.trace(model, input_tuple)
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
torch.jit.save(traced_model, "compiled_deberta.pt")
# my_script_module = torch.jit.script(model)

View File

@ -1,71 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script acquires data and converts it to fsmt model
# it covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
mkdir data
# get data (run once)
cd data
gdown 'https://drive.google.com/uc?id=1x_G2cjvM1nW5hjAB8-vWxRqtQTlmIaQU'
gdown 'https://drive.google.com/uc?id=1oA2aqZlVNj5FarxBlNXEHpBS4lRetTzU'
gdown 'https://drive.google.com/uc?id=1Wup2D318QYBFPW_NKI1mfP_hXOfmUI9r'
tar -xvzf trans_ende_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_6-1_0.2.tar.gz
gdown 'https://drive.google.com/uc?id=1mNufoynJ9-Zy1kJh2TA_lHm2squji0i9'
gdown 'https://drive.google.com/uc?id=1iO7um-HWoNoRKDtw27YUSgyeubn9uXqj'
tar -xvzf wmt16.en-de.deep-shallow.dist.tar.gz
tar -xvzf wmt16.en-de.deep-shallow.tar.gz
cp wmt16.en-de.deep-shallow/data-bin/dict.*.txt trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_6-1_0.2
cp wmt16.en-de.deep-shallow/bpecodes trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_6-1_0.2
cd -
# run conversions and uploads
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-12-1
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_6-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-6-1
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-12-1
# upload
cd data
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1
cd -
# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

View File

@ -1,59 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script acquires data and converts it to fsmt model
# it covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
mkdir data
# get data (run once)
cd data
gdown 'https://drive.google.com/uc?id=1j6z9fYdlUyOYsh7KJoumRlr1yHczxR5T'
gdown 'https://drive.google.com/uc?id=1yT7ZjqfvUYOBXvMjeY8uGRHQFWoSo8Q5'
gdown 'https://drive.google.com/uc?id=15gAzHeRUCs-QV8vHeTReMPEh1j8excNE'
tar -xvzf wmt19.de-en.tar.gz
tar -xvzf wmt19_deen_base_dr0.1_1.tar.gz
tar -xvzf wmt19_deen_big_dr0.1_2.tar.gz
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_base_dr0.1_1
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_big_dr0.1_2
cd -
# run conversions and uploads
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_base_dr0.1_1/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-base
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_big_dr0.1_2/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-big
# upload
cd data
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
cd -
# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

View File

@ -1,70 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script acquires data and converts it to fsmt model
# it covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
mkdir data
# get data (run once)
cd data
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz
tar -xvzf wmt19.en-de.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.de-en.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.en-ru.ensemble.tar.gz
tar -xvzf wmt19.ru-en.ensemble.tar.gz
cd -
# run conversions and uploads
export PAIR=ru-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
export PAIR=en-ru
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
export PAIR=de-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
export PAIR=en-de
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
# upload
cd data
transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
cd -
# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

View File

@ -1,79 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script evals the following fsmt models
# it covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
### Normal eval ###
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=64
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt16-en-de-dist-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
MODEL_PATH=allenai/wmt16-en-de-dist-6-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
MODEL_PATH=allenai/wmt16-en-de-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
### Searching hparams eval ###
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=32
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt16-en-de-dist-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
MODEL_PATH=allenai/wmt16-en-de-dist-6-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
MODEL_PATH=allenai/wmt16-en-de-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

View File

@ -1,67 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script evals the following fsmt models
# it covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
### Normal eval ###
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=64
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt19-de-en-6-6-base
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
MODEL_PATH=allenai/wmt19-de-en-6-6-big
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
### Searching hparams eval ###
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt19-de-en-6-6-base
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
MODEL_PATH=allenai/wmt19-de-en-6-6-big
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

View File

@ -1,161 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script evals the following fsmt models
# it covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
### a short estimate version for quick testing ###
export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src | head -10 > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref | head -10 > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
### Normal eval ###
# ru-en
export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)
# en-ru
export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)
# en-de
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)
# de-en
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)
### Searching hparams eval ###
# en-ru
export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=32
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
# en-ru
export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
# en-de
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
# de-en
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"

View File

@ -1,88 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to the check the quality of the outcomes.
#
# This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
# all files ~60KB. As compared to taking a full-size model, reducing to the minimum its layers and
# emb dimensions, but keeping the full vocab + merges files, leading to ~3MB in total for all files.
# The latter is done by `fsmt-make-super-tiny-model.py`.
#
# It will be used then as "stas/tiny-wmt19-en-ru"
import json
import tempfile
from pathlib import Path
from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
mname_tiny = "tiny-wmt19-en-ru"
# Build
# borrowed from a test
vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
with tempfile.TemporaryDirectory() as tmpdirname:
build_dir = Path(tmpdirname)
src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
with open(src_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
with open(tgt_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
with open(merges_file, "w") as fp : fp.write("\n".join(merges))
tokenizer = FSMTTokenizer(
langs=["en", "ru"],
src_vocab_size = len(vocab),
tgt_vocab_size = len(vocab),
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file,
merges_file=merges_file,
)
config = FSMTConfig(
langs=['ru', 'en'],
src_vocab_size=1000, tgt_vocab_size=1000,
d_model=4,
encoder_layers=1, decoder_layers=1,
encoder_ffn_dim=4, decoder_ffn_dim=4,
encoder_attention_heads=1, decoder_attention_heads=1,
)
tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")
# Test
batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)
print("test output:", len(outputs.logits[0]))
# Save
tiny_model.half() # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)
print(f"Generated {mname_tiny}")
# Upload
# transformers-cli upload tiny-wmt19-en-ru

View File

@ -1,61 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to the check the quality of the outcomes.
#
# This version creates a tiny model through reduction of a normal pre-trained model, but keeping the
# full vocab, merges file, and thus also resulting in a larger model due to a large vocab size.
# This gives ~3MB in total for all files.
#
# If you want a 50 times smaller than this see `fsmt-make-super-tiny-model.py`, which is slightly more complicated
#
#
# It will be used then as "stas/tiny-wmt19-en-de"
# Build
from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
mname = "facebook/wmt19-en-de"
tokenizer = FSMTTokenizer.from_pretrained(mname)
# get the correct vocab sizes, etc. from the master model
config = FSMTConfig.from_pretrained(mname)
config.update({
"d_model": 4,
"encoder_layers": 1, "decoder_layers": 1,
"encoder_ffn_dim": 4, "decoder_ffn_dim": 4,
"encoder_attention_heads": 1, "decoder_attention_heads": 1})
tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")
# Test
batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)
print("test output:", len(outputs.logits[0]))
# Save
mname_tiny = "tiny-wmt19-en-de"
tiny_model.half() # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)
print(f"Generated {mname_tiny}")
# Upload
# transformers-cli upload tiny-wmt19-en-de

View File

@ -1,156 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage:
# ./gen-card-allenai-wmt16.py
import os
from pathlib import Path
def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
texts = {
"en": "Machine learning is great, isn't it?",
"ru": "Машинное обучение - это здорово, не так ли?",
"de": "Maschinelles Lernen ist großartig, nicht wahr?",
}
# BLUE scores as follows:
# "pair": [fairseq, transformers]
scores = {
"wmt16-en-de-dist-12-1": [28.3, 27.52],
"wmt16-en-de-dist-6-1": [27.4, 27.11],
"wmt16-en-de-12-1": [26.9, 25.75],
}
pair = f"{src_lang}-{tgt_lang}"
readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt16
- allenai
license: apache-2.0
datasets:
- wmt16
metrics:
- bleu
---
# FSMT
## Model description
This is a ported version of fairseq-based [wmt16 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.
For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).
All 3 models are available:
* [wmt16-en-de-dist-12-1](https://huggingface.co/allenai/wmt16-en-de-dist-12-1)
* [wmt16-en-de-dist-6-1](https://huggingface.co/allenai/wmt16-en-de-dist-6-1)
* [wmt16-en-de-12-1](https://huggingface.co/allenai/wmt16-en-de-12-1)
## Intended uses & limitations
#### How to use
```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```
#### Limitations and bias
## Training data
Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).
## Eval results
Here are the BLEU scores:
model | fairseq | transformers
-------|---------|----------
{model_name} | {scores[model_name][0]} | {scores[model_name][1]}
The score is slightly below the score reported in the paper, as the researchers don't use `sacrebleu` and measure the score on tokenized outputs. `transformers` score was measured using `sacrebleu` on detokenized outputs.
The score was calculated using this code:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt16 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt16 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
## Data Sources
- [training, etc.](http://www.statmt.org/wmt16/)
- [test set](http://matrix.statmt.org/test_sets/newstest2016.tgz?1504722372)
### BibTeX entry and citation info
```
@misc{{kasai2020deep,
title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
year={{2020}},
eprint={{2006.10369}},
archivePrefix={{arXiv}},
primaryClass={{cs.CL}}
}}
```
"""
model_card_dir.mkdir(parents=True, exist_ok=True)
path = os.path.join(model_card_dir, "README.md")
print(f"Generating {path}")
with open(path, "w", encoding="utf-8") as f:
f.write(readme)
# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"
for model_name in ["wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1"]:
model_card_dir = model_cards_dir / "allenai" / model_name
write_model_card(model_card_dir, src_lang="en", tgt_lang="de", model_name=model_name)

View File

@ -1,153 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage:
# ./gen-card-allenai-wmt19.py
import os
from pathlib import Path
def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
texts = {
"en": "Machine learning is great, isn't it?",
"ru": "Машинное обучение - это здорово, не так ли?",
"de": "Maschinelles Lernen ist großartig, nicht wahr?",
}
# BLUE scores as follows:
# "pair": [fairseq, transformers]
scores = {
"wmt19-de-en-6-6-base": [0, 38.37],
"wmt19-de-en-6-6-big": [0, 39.90],
}
pair = f"{src_lang}-{tgt_lang}"
readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- allenai
license: apache-2.0
datasets:
- wmt19
metrics:
- bleu
---
# FSMT
## Model description
This is a ported version of fairseq-based [wmt19 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.
For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).
2 models are available:
* [wmt19-de-en-6-6-big](https://huggingface.co/allenai/wmt19-de-en-6-6-big)
* [wmt19-de-en-6-6-base](https://huggingface.co/allenai/wmt19-de-en-6-6-base)
## Intended uses & limitations
#### How to use
```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```
#### Limitations and bias
## Training data
Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).
## Eval results
Here are the BLEU scores:
model | transformers
-------|---------
{model_name} | {scores[model_name][1]}
The score was calculated using this code:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
## Data Sources
- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
### BibTeX entry and citation info
```
@misc{{kasai2020deep,
title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
year={{2020}},
eprint={{2006.10369}},
archivePrefix={{arXiv}},
primaryClass={{cs.CL}}
}}
```
"""
model_card_dir.mkdir(parents=True, exist_ok=True)
path = os.path.join(model_card_dir, "README.md")
print(f"Generating {path}")
with open(path, "w", encoding="utf-8") as f:
f.write(readme)
# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"
for model_name in ["wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"]:
model_card_dir = model_cards_dir / "allenai" / model_name
write_model_card(model_card_dir, src_lang="de", tgt_lang="en", model_name=model_name)

View File

@ -1,165 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage:
# ./gen-card-facebook-wmt19.py
import os
from pathlib import Path
def write_model_card(model_card_dir, src_lang, tgt_lang):
texts = {
"en": "Machine learning is great, isn't it?",
"ru": "Машинное обучение - это здорово, не так ли?",
"de": "Maschinelles Lernen ist großartig, oder?",
}
# BLUE scores as follows:
# "pair": [fairseq, transformers]
scores = {
"ru-en": ["[41.3](http://matrix.statmt.org/matrix/output/1907?run_id=6937)", "39.20"],
"en-ru": ["[36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724)", "33.47"],
"en-de": ["[43.1](http://matrix.statmt.org/matrix/output/1909?run_id=6862)", "42.83"],
"de-en": ["[42.3](http://matrix.statmt.org/matrix/output/1902?run_id=6750)", "41.35"],
}
pair = f"{src_lang}-{tgt_lang}"
readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- facebook
license: apache-2.0
datasets:
- wmt19
metrics:
- bleu
---
# FSMT
## Model description
This is a ported version of [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for {src_lang}-{tgt_lang}.
For more details, please see, [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616).
The abbreviation FSMT stands for FairSeqMachineTranslation
All four models are available:
* [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru)
* [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en)
* [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de)
* [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en)
## Intended uses & limitations
#### How to use
```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```
#### Limitations and bias
- The original (and this ported model) doesn't seem to handle well inputs with repeated sub-phrases, [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981)
## Training data
Pretrained weights were left identical to the original model released by fairseq. For more details, please, see the [paper](https://arxiv.org/abs/1907.06616).
## Eval results
pair | fairseq | transformers
-------|---------|----------
{pair} | {scores[pair][0]} | {scores[pair][1]}
The score is slightly below the score reported by `fairseq`, since `transformers`` currently doesn't support:
- model ensemble, therefore the best performing checkpoint was ported (``model4.pt``).
- re-ranking
The score was calculated using this code:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=15
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
note: fairseq reports using a beam of 50, so you should get a slightly higher score if re-run with `--num_beams 50`.
## Data Sources
- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
### BibTeX entry and citation info
```bibtex
@inproceedings{{...,
year={{2020}},
title={{Facebook FAIR's WMT19 News Translation Task Submission}},
author={{Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey}},
booktitle={{Proc. of WMT}},
}}
```
## TODO
- port model ensemble (fairseq uses 4 model checkpoints)
"""
os.makedirs(model_card_dir, exist_ok=True)
path = os.path.join(model_card_dir, "README.md")
print(f"Generating {path}")
with open(path, "w", encoding="utf-8") as f:
f.write(readme)
# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"
for model_name in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
base, src_lang, tgt_lang = model_name.split("-")
model_card_dir = model_cards_dir / "facebook" / model_name
write_model_card(model_card_dir, src_lang=src_lang, tgt_lang=tgt_lang)

View File

@ -1,116 +0,0 @@
# this is the process of uploading the updated models to s3. As I can't upload them directly to the correct orgs, this script shows how this is done
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
1. upload updated models to my account
transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1
2. ask someone to move them to:
* to facebook: "wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"
* to allenai: "wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1", "wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"
export b="s3://models.huggingface.co/bert"
stas_to_fb () {
src=$1
shift
aws s3 sync $b/stas/$src $b/facebook/$src $@
}
stas_to_allenai () {
src=$1
shift
aws s3 sync $b/stas/$src $b/allenai/$src $@
}
stas_to_fb wmt19-en-ru
stas_to_fb wmt19-ru-en
stas_to_fb wmt19-en-de
stas_to_fb wmt19-de-en
stas_to_allenai wmt16-en-de-dist-12-1
stas_to_allenai wmt16-en-de-dist-6-1
stas_to_allenai wmt16-en-de-6-1
stas_to_allenai wmt16-en-de-12-1
stas_to_allenai wmt19-de-en-6-6-base
stas_to_allenai wmt19-de-en-6-6-big
3. and then remove all these model files from my account
transformers-cli s3 rm wmt16-en-de-12-1/config.json
transformers-cli s3 rm wmt16-en-de-12-1/merges.txt
transformers-cli s3 rm wmt16-en-de-12-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-12-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-12-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-12-1/vocab-tgt.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/config.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/merges.txt
transformers-cli s3 rm wmt16-en-de-dist-12-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-dist-12-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-tgt.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/config.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/merges.txt
transformers-cli s3 rm wmt16-en-de-dist-6-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-dist-6-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en-6-6-base/config.json
transformers-cli s3 rm wmt19-de-en-6-6-base/merges.txt
transformers-cli s3 rm wmt19-de-en-6-6-base/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en-6-6-base/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-src.json
transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en-6-6-big/config.json
transformers-cli s3 rm wmt19-de-en-6-6-big/merges.txt
transformers-cli s3 rm wmt19-de-en-6-6-big/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en-6-6-big/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-src.json
transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en/config.json
transformers-cli s3 rm wmt19-de-en/merges.txt
transformers-cli s3 rm wmt19-de-en/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en/vocab-src.json
transformers-cli s3 rm wmt19-de-en/vocab-tgt.json
transformers-cli s3 rm wmt19-en-de/config.json
transformers-cli s3 rm wmt19-en-de/merges.txt
transformers-cli s3 rm wmt19-en-de/pytorch_model.bin
transformers-cli s3 rm wmt19-en-de/tokenizer_config.json
transformers-cli s3 rm wmt19-en-de/vocab-src.json
transformers-cli s3 rm wmt19-en-de/vocab-tgt.json
transformers-cli s3 rm wmt19-en-ru/config.json
transformers-cli s3 rm wmt19-en-ru/merges.txt
transformers-cli s3 rm wmt19-en-ru/pytorch_model.bin
transformers-cli s3 rm wmt19-en-ru/tokenizer_config.json
transformers-cli s3 rm wmt19-en-ru/vocab-src.json
transformers-cli s3 rm wmt19-en-ru/vocab-tgt.json
transformers-cli s3 rm wmt19-ru-en/config.json
transformers-cli s3 rm wmt19-ru-en/merges.txt
transformers-cli s3 rm wmt19-ru-en/pytorch_model.bin
transformers-cli s3 rm wmt19-ru-en/tokenizer_config.json
transformers-cli s3 rm wmt19-ru-en/vocab-src.json
transformers-cli s3 rm wmt19-ru-en/vocab-tgt.json

View File

@ -1,19 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# these scripts need to be run before any changes to FSMT-related code - it should cover all bases
CUDA_VISIBLE_DEVICES="" RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py
RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py

View File

@ -1,34 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus
# 1. pip install sentencepiece
#
# 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
# 3. build
import sentencepiece as spm
# pegasus:
# 1. no bos
# 2. eos_id is 1
# 3. unk_id is 2
# build a sample spm file accordingly
spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000')
# 4. now update the fixture
# mv test_sentencepiece_no_bos.model ../../tests/fixtures/

View File

@ -1,72 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
Setup transformers following instructions in README.md, (I would fork first).
```bash
git clone git@github.com:huggingface/transformers.git
cd transformers
pip install -e .
pip install pandas GitPython wget
```
Get required metadata
```bash
curl https://cdn-datasets.huggingface.co/language_codes/language-codes-3b2.csv > language-codes-3b2.csv
curl https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv > iso-639-3.csv
```
Install Tatoeba-Challenge repo inside transformers
```bash
git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git
```
To convert a few models, call the conversion script from command line:
```bash
python src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py --models heb-eng eng-heb --save_dir converted
```
To convert lots of models you can pass your list of Tatoeba model names to `resolver.convert_models` in a python client or script.
```python
from transformers.convert_marian_tatoeba_to_pytorch import TatoebaConverter
resolver = TatoebaConverter(save_dir='converted')
resolver.convert_models(['heb-eng', 'eng-heb'])
```
### Upload converted models
Since version v3.5.0, the model sharing workflow is switched to git-based system . Refer to [model sharing doc](https://huggingface.co/transformers/main/model_sharing.html#model-sharing-and-uploading) for more details.
To upload all converted models,
1. Install [git-lfs](https://git-lfs.github.com/).
2. Login to `huggingface-cli`
```bash
huggingface-cli login
```
3. Run the `upload_models` script
```bash
./scripts/tatoeba/upload_models.sh
```
### Modifications
- To change naming logic, change the code near `os.rename`. The model card creation code may also need to change.
- To change model card content, you must modify `TatoebaCodeResolver.write_model_card`

View File

@ -1,12 +0,0 @@
#!/bin/bash
for FILE in converted/*; do
model_name=`basename $FILE`
huggingface-cli repo create $model_name -y
git clone https://huggingface.co/Helsinki-NLP/$model_name
mv $FILE/* $model_name/
cd $model_name
git add . && git commit -m "initial commit"
git push
cd ..
done

View File

@ -163,7 +163,6 @@ _deps = [
"rjieba",
"rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
"ruff==0.11.2",
"sacrebleu>=1.4.12,<2.0.0",
"sacremoses",
"safetensors>=0.4.3",
"sagemaker>=2.31.0",
@ -344,7 +343,6 @@ extras["testing"] = (
"evaluate",
"pytest-timeout",
"ruff",
"sacrebleu",
"rouge-score",
"nltk",
"GitPython",

View File

@ -69,7 +69,6 @@ deps = {
"rjieba": "rjieba",
"rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
"ruff": "ruff==0.11.2",
"sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
"sacremoses": "sacremoses",
"safetensors": "safetensors>=0.4.3",
"sagemaker": "sagemaker>=2.31.0",

View File

@ -112,6 +112,7 @@ _CONFIG_FOR_DOC = "FSMTConfig"
"""
Here is how to compare BLEU scores against fairseq implementation:
(don't forget to install sacrebleu: `pip install sacrebleu`)
# en-ru