[cleanup] remove old scripts in /scripts 🧹 🧹 (#37676)

* rm old files
* not this one

This commit is contained in:
parent 6673081b21
commit 0f8c34b0a0
@@ -1,448 +0,0 @@
#!/usr/bin/env python

# HF Trainer benchmarking tool
#
# This tool can be used to run and compare multiple dimensions of the HF Trainer args.
#
# It then prints a report once in github format with all the information that needs to be shared
# with others, and a second time in a console-friendly format, so it's easier to use for tuning things up.
#
# The main idea is:
#
# ./trainer-benchmark.py --base-cmd '<cmd args that don't change>' \
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \
# --target-metric-key train_samples_per_second
#
# The variations can be any command line argument that you want to compare and not just dtype as in
# the example.
#
# --variations allows you to compare variations in multiple dimensions.
#
# As the first dimension has 2 options and the second has 3 in our example, this will run the trainer 6
# times, adding one of:
#
# 1. --tf32 0 --fp16 0
# 2. --tf32 0 --fp16 1
# 3. --tf32 0 --bf16 1
# 4. --tf32 1 --fp16 0
# 5. --tf32 1 --fp16 1
# 6. --tf32 1 --bf16 1
#
# and print the results. This is just a cartesian product - and more than 2 dimensions can be used.
#
# If you want to rely on defaults, this:
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1'
# is identical to this:
# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16'
#
# the leading empty variation in the 2nd dimension is a valid variation.
#
# So here we get the following 6 variations:
#
# 1. --tf32 0
# 2. --tf32 0 --fp16
# 3. --tf32 0 --bf16
# 4. --tf32 1
# 5. --tf32 1 --fp16
# 6. --tf32 1 --bf16
#
# In this particular case we don't know what the default tf32 setting is, as it's normally
# pytorch-version dependent. That's why it's best to do an explicit setting of each variation:
# `--tf32 0|--tf32 1`
#
# Here is a full example of a train:
#
# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
# --base-cmd \
# ' examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small \
# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \
# --source_prefix "translate English to Romanian: " --warmup_steps 50 \
# --max_train_samples 20000 --dataloader_num_workers 2 ' \
# --target-metric-key train_samples_per_second --repeat-times 1 --variations \
# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \
# --repeat-times 1 --base-variation '--tf32 0'
#
# and here is a possible output:
#
#
# | Variation       |     Train |   Diff |   Train |
# |                 |   samples |      % |    loss |
# |                 |       per |        |         |
# |                 |    second |        |         |
# |:----------------|----------:|-------:|--------:|
# | --tf32 0        |    285.11 |      0 |    2.51 |
# | --tf32 1        |    342.09 |     20 |    2.51 |
# | --fp16 --tf32 0 |    423.49 |     49 |    2.51 |
# | --fp16 --tf32 1 |    423.13 |     48 |    2.51 |
# | --bf16 --tf32 0 |    416.80 |     46 |    2.52 |
# | --bf16 --tf32 1 |    415.87 |     46 |    2.52 |
#
#
# So you can quickly compare the different outcomes.
#
# Typically running each experiment once is enough, but if the environment is unstable you can
# re-run each multiple times, e.g., 3 times using --repeat-times 3, and it will report the averaged results.
#
# By default it'll use the lowest result as the baseline to use as 100% and then compare the rest to
# it, as can be seen from the table above, but you can also specify which combination is the one to use as
# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0'
#
# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are
# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use:
# --target-metric-key eval_samples_per_second
# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as
# well (as currently it doesn't)
#
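The `--variations` expansion described above is just a cartesian product over the `|`-separated dimensions. A minimal standalone sketch of that logic (it mirrors what `main()` below does, and is runnable on its own):

```python
import itertools
import re

variations_args = ["--tf32 0|--tf32 1", "|--fp16|--bf16"]

# split each dimension into its individual options
dims = [[opt.strip() for opt in re.split(r"\|", dim)] for dim in variations_args]

# cartesian product across dimensions, joined back into cmd-line fragments
variations = [" ".join(combo).strip() for combo in itertools.product(*dims)]

print(variations)
# ['--tf32 0', '--tf32 0 --fp16', '--tf32 0 --bf16',
#  '--tf32 1', '--tf32 1 --fp16', '--tf32 1 --bf16']
```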
import argparse
import datetime
import io
import itertools
import json
import math
import os
import platform
import re
import shlex
import subprocess
import sys
from pathlib import Path
from statistics import fmean

import pandas as pd
import torch
from tqdm import tqdm

import transformers


nan = float("nan")


class Tee:
    """
    A helper class to tee print's output into a file.

    Usage:

    sys.stdout = Tee(filename)
    """

    def __init__(self, filename):
        self.stdout = sys.stdout
        self.file = open(filename, "a")

    def __getattr__(self, attr):
        return getattr(self.stdout, attr)

    def write(self, msg):
        self.stdout.write(msg)
        # strip tqdm codes
        self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M))


def get_original_command(max_width=80, full_python_path=False):
    """
    Return the original command line string that can be replayed nicely and wrapped for 80 char width.

    Args:
        max_width (`int`, *optional*, defaults to 80):
            The width to wrap for.
        full_python_path (`bool`, *optional*, defaults to `False`):
            Whether to replicate the full path or just the last segment (i.e. `python`).
    """

    cmd = []

    # deal with critical env vars
    env_keys = ["CUDA_VISIBLE_DEVICES"]
    for key in env_keys:
        val = os.environ.get(key, None)
        if val is not None:
            cmd.append(f"{key}={val}")

    # python executable (not always needed if the script is executable)
    python = sys.executable if full_python_path else sys.executable.split("/")[-1]
    cmd.append(python)

    # now the normal args
    cmd += list(map(shlex.quote, sys.argv))

    # split up into up to MAX_WIDTH lines with shell multi-line escapes
    lines = []
    current_line = ""
    while len(cmd) > 0:
        current_line += f"{cmd.pop(0)} "
        if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1:
            lines.append(current_line)
            current_line = ""
    return "\\\n".join(lines)


def get_base_command(args, output_dir):
    # unwrap multi-line input
    args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)

    # remove --output_dir if any and set our own
    args.base_cmd = re.sub(r"--output_dir\s+[^\s]+", "", args.base_cmd)
    args.base_cmd += f" --output_dir {output_dir}"

    # ensure we have --overwrite_output_dir
    args.base_cmd = re.sub(r"--overwrite_output_dir\s+", "", args.base_cmd)
    args.base_cmd += " --overwrite_output_dir"

    return [sys.executable] + shlex.split(args.base_cmd)


def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):
    # Enable to debug everything but the run itself, to do it fast and see the progress.
    # This is useful for debugging the output formatting quickly - we can remove it later once
    # everybody is happy with the output
    if 0:
        import random
        from time import sleep

        sleep(0)
        return dict(
            {k: random.uniform(0, 100) for k in metric_keys},
            **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])},
        )

    result = subprocess.run(cmd, capture_output=True, text=True)

    if verbose:
        print("STDOUT", result.stdout)
        print("STDERR", result.stderr)

    # save the streams
    prefix = variation.replace(" ", "-")
    with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f:
        f.write(result.stdout)
    with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f:
        f.write(result.stderr)

    if result.returncode != 0:
        if verbose:
            print("failed")
        return {target_metric_key: nan}

    with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f:
        metrics = json.load(f)

    # filter out just the keys we want
    return {k: v for k, v in metrics.items() if k in metric_keys}


def process_run(
    id,
    cmd,
    variation_key,
    variation,
    longest_variation_len,
    target_metric_key,
    report_metric_keys,
    repeat_times,
    output_dir,
    verbose,
):
    results = []
    metrics = []
    preamble = f"{id}: {variation:<{longest_variation_len}}"
    outcome = f"{preamble}: "
    metric_keys = set(report_metric_keys + [target_metric_key])
    for i in tqdm(range(repeat_times), desc=preamble, leave=False):
        single_run_metrics = process_run_single(
            id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose
        )
        result = single_run_metrics[target_metric_key]
        if not math.isnan(result):
            metrics.append(single_run_metrics)
            results.append(result)
            outcome += "✓"
        else:
            outcome += "✘"
    outcome = f"\33[2K\r{outcome}"
    if len(metrics) > 0:
        mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()}
        mean_target = round(mean_metrics[target_metric_key], 2)
        results_str = f"{outcome} {mean_target}"
        if len(metrics) > 1:
            results_str += f" {tuple(round(x, 2) for x in results)}"
        print(results_str)
        mean_metrics[variation_key] = variation
        return mean_metrics
    else:
        print(outcome)
        return {variation_key: variation, target_metric_key: nan}


def get_versions():
    properties = torch.cuda.get_device_properties(torch.device("cuda"))
    return f"""
Datetime    : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Software:
transformers: {transformers.__version__}
torch       : {torch.__version__}
cuda        : {torch.version.cuda}
python      : {platform.python_version()}

Hardware:
{torch.cuda.device_count()} GPUs      : {properties.name}, {properties.total_memory/2**30:0.2f}GB
"""


def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):
    df = pd.DataFrame(results)
    variation_key = "variation"
    diff_key = "diff_%"

    sentinel_value = nan
    if base_variation is not None and len(df[df[variation_key] == base_variation]):
        # this may still return nan
        sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item()
    if math.isnan(sentinel_value):
        # as a fallback, use the minimal value as the sentinel
        sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min()

    # create diff column if possible
    if not math.isnan(sentinel_value):
        df[diff_key] = df.apply(
            lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value)
            if not math.isnan(r[target_metric_key])
            else 0,
            axis="columns",
        )

    # re-order columns
    cols = [variation_key, target_metric_key, diff_key, *report_metric_keys]
    df = df.reindex(cols, axis="columns")  # reorder cols

    # capitalize
    df = df.rename(str.capitalize, axis="columns")

    # make the cols as narrow as possible
    df_github = df.rename(lambda c: c.replace("_", "<br>"), axis="columns")
    df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns")

    report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"]
    report += ["----------8<-----------------8<--------"]
    report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")]
    report += ["```"]
    report += ["*** Setup:", get_versions()]
    report += ["*** The benchmark command line was:", get_original_command()]
    report += ["```"]
    report += ["----------8<-----------------8<--------"]
    report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")]

    print("\n\n".join(report))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-cmd",
        default=None,
        type=str,
        required=True,
        help="Base cmd",
    )
    parser.add_argument(
        "--variations",
        default=None,
        type=str,
        nargs="+",
        required=True,
        help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'",
    )
    parser.add_argument(
        "--base-variation",
        default=None,
        type=str,
        help="Baseline variation to compare to. If None, the minimal target value will be used to compare against",
    )
    parser.add_argument(
        "--target-metric-key",
        default=None,
        type=str,
        required=True,
        help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second",
    )
    parser.add_argument(
        "--report-metric-keys",
        default="",
        type=str,
        help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument, e.g., 'train_loss train_samples'",
    )
    parser.add_argument(
        "--repeat-times",
        default=1,
        type=int,
        help="How many times to re-run each variation - an average will be reported",
    )
    parser.add_argument(
        "--output_dir",
        default="output_benchmark",
        type=str,
        help="The output directory where all the benchmark reports will go to, and additionally this directory will be used to override --output_dir in the script that is being benchmarked",
    )
    parser.add_argument(
        "--verbose",
        default=False,
        action="store_true",
        help="Whether to show the outputs of each run or just the benchmark progress",
    )
    args = parser.parse_args()

    output_dir = args.output_dir
    Path(output_dir).mkdir(exist_ok=True)
    base_cmd = get_base_command(args, output_dir)

    # split each dimension into its --foo variations
    dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations]
    # build a cartesian product of dimensions and convert those back into cmd-line arg strings,
    # while stripping white space for inputs that were empty
    variations = list(map(str.strip, map(" ".join, itertools.product(*dims))))
    longest_variation_len = max(len(x) for x in variations)

    # split wanted keys
    report_metric_keys = args.report_metric_keys.split()

    # capture prints into a log file for convenience
    report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
    print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt")
    print(f"and this script's output is also piped into {report_fn}")

    sys.stdout = Tee(report_fn)

    print(f"\n*** Running {len(variations)} benchmarks:")
    print(f"Base command: {' '.join(base_cmd)}")

    variation_key = "variation"
    results = []
    for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
        cmd = base_cmd + variation.split()
        results.append(
            process_run(
                id + 1,
                cmd,
                variation_key,
                variation,
                longest_variation_len,
                args.target_metric_key,
                report_metric_keys,
                args.repeat_times,
                output_dir,
                args.verbose,
            )
        )

    process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir)


if __name__ == "__main__":
    main()
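As a quick illustration of how the report's `Diff %` column is derived in `process_results()` above, here is a minimal sketch with made-up numbers (it assumes pandas and tabulate are installed, just as the script itself does):

```python
import pandas as pd

# hypothetical per-variation results, as returned by process_run()
results = [
    {"variation": "--tf32 0", "train_samples_per_second": 285.11},
    {"variation": "--tf32 1", "train_samples_per_second": 342.09},
]
df = pd.DataFrame(results)

# with no --base-variation given, the lowest result becomes the 100% baseline
baseline = df["train_samples_per_second"].min()
df["diff_%"] = round(100 * (df["train_samples_per_second"] - baseline) / baseline)

print(df.to_markdown(index=False, floatfmt=".2f"))
```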
@@ -1,85 +0,0 @@
import time

import torch

from transformers import AutoModel, AutoTokenizer, pipeline


test_sentence = 'Do you [MASK] the muffin man?'

# for comparison
bert = pipeline('fill-mask', model='bert-base-uncased')
print('\n'.join([d['sequence'] for d in bert(test_sentence)]))


deberta = pipeline('fill-mask', model='microsoft/deberta-v3-base', model_kwargs={"legacy": False})
print('\n'.join([d['sequence'] for d in deberta(test_sentence)]))


tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

tokenized_dict = tokenizer(
    ["Is this working"], ["Not yet"],
    return_tensors="pt"
)

deberta.model.forward = torch.compile(deberta.model.forward)
start = time.time()
deberta.model(**tokenized_dict)
end = time.time()
print(end - start)


start = time.time()
deberta.model(**tokenized_dict)
end = time.time()
print(end - start)


start = time.time()
deberta.model(**tokenized_dict)
end = time.time()
print(end - start)


model = AutoModel.from_pretrained('microsoft/deberta-base')
model.config.return_dict = False
model.config.output_hidden_states = False
input_tuple = (tokenized_dict['input_ids'], tokenized_dict['attention_mask'])


start = time.time()
traced_model = torch.jit.trace(model, input_tuple)
end = time.time()
print(end - start)


start = time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end = time.time()
print(end - start)


start = time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end = time.time()
print(end - start)


start = time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end = time.time()
print(end - start)


start = time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end = time.time()
print(end - start)


torch.jit.save(traced_model, "compiled_deberta.pt")


# my_script_module = torch.jit.script(model)
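The timings above are taken call-by-call, so the first measurement folds in `torch.compile`'s one-time compilation cost. A minimal sketch of a warmup-aware timing helper; the helper itself is our illustration, not part of the removed script:

```python
import time

import torch


def time_forward(fn, inputs, warmup=3, iters=10):
    """Time fn(**inputs), discarding `warmup` runs that may include compilation."""
    for _ in range(warmup):
        fn(**inputs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # don't let queued GPU work leak into the timed region
    start = time.perf_counter()
    for _ in range(iters):
        fn(**inputs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters
```

For example, `time_forward(deberta.model, tokenized_dict)` would report steady-state latency rather than the compile-inclusive first call.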
@@ -1,71 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script acquires data and converts it to fsmt models
# it covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1

# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
    echo "Error: This script needs to be run from the top of the transformers repo"
    exit 1
fi

mkdir data

# get data (run once)

cd data
gdown 'https://drive.google.com/uc?id=1x_G2cjvM1nW5hjAB8-vWxRqtQTlmIaQU'
gdown 'https://drive.google.com/uc?id=1oA2aqZlVNj5FarxBlNXEHpBS4lRetTzU'
gdown 'https://drive.google.com/uc?id=1Wup2D318QYBFPW_NKI1mfP_hXOfmUI9r'
tar -xvzf trans_ende_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_6-1_0.2.tar.gz
gdown 'https://drive.google.com/uc?id=1mNufoynJ9-Zy1kJh2TA_lHm2squji0i9'
gdown 'https://drive.google.com/uc?id=1iO7um-HWoNoRKDtw27YUSgyeubn9uXqj'
tar -xvzf wmt16.en-de.deep-shallow.dist.tar.gz
tar -xvzf wmt16.en-de.deep-shallow.tar.gz
cp wmt16.en-de.deep-shallow/data-bin/dict.*.txt trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_6-1_0.2
cp wmt16.en-de.deep-shallow/bpecodes trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_6-1_0.2
cd -

# run conversions and uploads

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-12-1

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_6-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-6-1

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-12-1


# upload
cd data
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1
cd -


# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed
@@ -1,59 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script acquires data and converts it to fsmt models
# it covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big

# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
    echo "Error: This script needs to be run from the top of the transformers repo"
    exit 1
fi

mkdir data

# get data (run once)

cd data
gdown 'https://drive.google.com/uc?id=1j6z9fYdlUyOYsh7KJoumRlr1yHczxR5T'
gdown 'https://drive.google.com/uc?id=1yT7ZjqfvUYOBXvMjeY8uGRHQFWoSo8Q5'
gdown 'https://drive.google.com/uc?id=15gAzHeRUCs-QV8vHeTReMPEh1j8excNE'
tar -xvzf wmt19.de-en.tar.gz
tar -xvzf wmt19_deen_base_dr0.1_1.tar.gz
tar -xvzf wmt19_deen_big_dr0.1_2.tar.gz
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_base_dr0.1_1
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_big_dr0.1_2
cd -

# run conversions and uploads

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_base_dr0.1_1/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-base

PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_big_dr0.1_2/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-big


# upload
cd data
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
cd -


# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed
@@ -1,70 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script acquires data and converts it to fsmt models
# it covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de

# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
    echo "Error: This script needs to be run from the top of the transformers repo"
    exit 1
fi

mkdir data

# get data (run once)

cd data
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz
tar -xvzf wmt19.en-de.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.de-en.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.en-ru.ensemble.tar.gz
tar -xvzf wmt19.ru-en.ensemble.tar.gz
cd -

# run conversions and uploads

export PAIR=ru-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR

export PAIR=en-ru
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR

export PAIR=de-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR

export PAIR=en-de
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR


# upload
cd data
transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
cd -

# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed
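The conversion invocations in these three scripts all follow the same pattern. A minimal Python sketch that drives the same converter in a loop; the path layout follows the facebook/wmt19 script above, and the loop itself is our illustration:

```python
import os
import subprocess

# (checkpoint, dump folder) pairs, following the facebook/wmt19 script above
jobs = [(f"data/wmt19.{pair}.ensemble/model4.pt", f"data/wmt19-{pair}") for pair in ("ru-en", "en-ru")]
jobs += [(f"data/wmt19.{pair}.joined-dict.ensemble/model4.pt", f"data/wmt19-{pair}") for pair in ("de-en", "en-de")]

for checkpoint, dump_folder in jobs:
    subprocess.run(
        [
            "python",
            "src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py",
            "--fsmt_checkpoint_path", checkpoint,
            "--pytorch_dump_folder_path", dump_folder,
        ],
        env={**os.environ, "PYTHONPATH": "src"},  # same as PYTHONPATH="src" in the shell version
        check=True,
    )
```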
@@ -1,79 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script evals the following fsmt models
# it covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1

# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
    echo "Error: This script needs to be run from the top of the transformers repo"
    exit 1
fi

# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)

### Normal eval ###

export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=64
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target

MODEL_PATH=allenai/wmt16-en-de-dist-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

MODEL_PATH=allenai/wmt16-en-de-dist-6-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

MODEL_PATH=allenai/wmt16-en-de-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS



### Searching hparams eval ###

export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=32
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target

MODEL_PATH=allenai/wmt16-en-de-dist-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

MODEL_PATH=allenai/wmt16-en-de-dist-6-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

MODEL_PATH=allenai/wmt16-en-de-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
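Each `run_eval.py` invocation above boils down to: translate the source file with beam search, then score the output with sacrebleu. A minimal sketch of that round trip (the one-sentence sample is for illustration only):

```python
import sacrebleu
import torch
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

mname = "allenai/wmt16-en-de-dist-12-1"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

sources = ["Machine learning is great, isn't it?"]
references = ["Maschinelles Lernen ist großartig, nicht wahr?"]

batch = tokenizer(sources, return_tensors="pt", padding=True)
with torch.no_grad():
    generated = model.generate(**batch, num_beams=5)
translations = tokenizer.batch_decode(generated, skip_special_tokens=True)

# sacrebleu expects one list per reference set
bleu = sacrebleu.corpus_bleu(translations, [references])
print(bleu.score)
```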
@@ -1,67 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script evals the following fsmt models
# it covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big

# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
    echo "Error: This script needs to be run from the top of the transformers repo"
    exit 1
fi

# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)

### Normal eval ###

export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=64
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target

MODEL_PATH=allenai/wmt19-de-en-6-6-base
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

MODEL_PATH=allenai/wmt19-de-en-6-6-big
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS



### Searching hparams eval ###

export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target

MODEL_PATH=allenai/wmt19-de-en-6-6-base
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

MODEL_PATH=allenai/wmt19-de-en-6-6-big
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
@@ -1,161 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script evals the following fsmt models
# it covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de

# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
    echo "Error: This script needs to be run from the top of the transformers repo"
    exit 1
fi

# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)

### a short estimate version for quick testing ###

export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src | head -10 > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref | head -10 > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS



### Normal eval ###

# ru-en

export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

# (target BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)


# en-ru

export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

# (target BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)


# en-de

export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

# (target BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)


# de-en

export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS

# (target BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)


### Searching hparams eval ###

# ru-en

export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=32
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"


# en-ru

export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"

# en-de

export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"

# de-en

export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
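The `--search` argument above packs a whole hyperparameter grid into one string, e.g. `num_beams=5:8:11:15 length_penalty=0.6:...:1.1 early_stopping=true:false`. A minimal sketch of how such a spec expands into concrete runs (our illustration of the format, not the actual `run_eval_search.py` parser):

```python
import itertools

search = "num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8 early_stopping=true:false"

# each space-separated entry "name=v1:v2:..." is one axis of the grid
axes = {}
for entry in search.split():
    name, values = entry.split("=")
    axes[name] = values.split(":")

# full grid = cartesian product over all axes
grid = [dict(zip(axes, combo)) for combo in itertools.product(*axes.values())]
print(len(grid))   # 4 * 3 * 2 = 24 runs
print(grid[0])     # {'num_beams': '5', 'length_penalty': '0.6', 'early_stopping': 'true'}
```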
@@ -1,88 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
# all files ~60KB. As compared to taking a full-size model, reducing to the minimum its layers and
# emb dimensions, but keeping the full vocab + merges files, leading to ~3MB in total for all files.
# The latter is done by `fsmt-make-tiny-model.py`.
#
# It will then be used as "stas/tiny-wmt19-en-ru"

import json
import tempfile
from pathlib import Path

from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES


mname_tiny = "tiny-wmt19-en-ru"

# Build

# borrowed from a test
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

with tempfile.TemporaryDirectory() as tmpdirname:
    build_dir = Path(tmpdirname)
    src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
    tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
    merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
    with open(src_vocab_file, "w") as fp:
        fp.write(json.dumps(vocab_tokens))
    with open(tgt_vocab_file, "w") as fp:
        fp.write(json.dumps(vocab_tokens))
    with open(merges_file, "w") as fp:
        fp.write("\n".join(merges))

    tokenizer = FSMTTokenizer(
        langs=["en", "ru"],
        src_vocab_size=len(vocab),
        tgt_vocab_size=len(vocab),
        src_vocab_file=src_vocab_file,
        tgt_vocab_file=tgt_vocab_file,
        merges_file=merges_file,
    )

config = FSMTConfig(
    langs=["ru", "en"],
    src_vocab_size=1000, tgt_vocab_size=1000,
    d_model=4,
    encoder_layers=1, decoder_layers=1,
    encoder_ffn_dim=4, decoder_ffn_dim=4,
    encoder_attention_heads=1, decoder_attention_heads=1,
)

tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")

# Test
batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)

print("test output:", len(outputs.logits[0]))

# Save
tiny_model.half()  # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)

print(f"Generated {mname_tiny}")

# Upload
# transformers-cli upload tiny-wmt19-en-ru
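For context, a tiny checkpoint like this is typically consumed in a test roughly as follows (a sketch; the weights are random, so only shapes and plumbing are meaningful):

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

mname = "stas/tiny-wmt19-en-ru"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = model(**batch)

# only the shapes matter here - the output itself is meaningless
assert outputs.logits.shape[-1] == model.config.tgt_vocab_size
```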
@@ -1,61 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script creates a tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# This version creates a tiny model through reduction of a normal pre-trained model, but keeping the
# full vocab, merges file, and thus also resulting in a larger model due to a large vocab size.
# This gives ~3MB in total for all files.
#
# If you want one that is ~50 times smaller than this, see `fsmt-make-super-tiny-model.py`, which is slightly more complicated.
#
# It will then be used as "stas/tiny-wmt19-en-de"

# Build
from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer


mname = "facebook/wmt19-en-de"
tokenizer = FSMTTokenizer.from_pretrained(mname)
# get the correct vocab sizes, etc. from the master model
config = FSMTConfig.from_pretrained(mname)
config.update({
    "d_model": 4,
    "encoder_layers": 1, "decoder_layers": 1,
    "encoder_ffn_dim": 4, "decoder_ffn_dim": 4,
    "encoder_attention_heads": 1, "decoder_attention_heads": 1,
})

tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")

# Test
batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)

print("test output:", len(outputs.logits[0]))

# Save
mname_tiny = "tiny-wmt19-en-de"
tiny_model.half()  # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)

print(f"Generated {mname_tiny}")

# Upload
# transformers-cli upload tiny-wmt19-en-de
@@ -1,156 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Usage:
# ./gen-card-allenai-wmt16.py

import os
from pathlib import Path


def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
    texts = {
        "en": "Machine learning is great, isn't it?",
        "ru": "Машинное обучение - это здорово, не так ли?",
        "de": "Maschinelles Lernen ist großartig, nicht wahr?",
    }

    # BLEU scores as follows:
    # "pair": [fairseq, transformers]
    scores = {
        "wmt16-en-de-dist-12-1": [28.3, 27.52],
        "wmt16-en-de-dist-6-1": [27.4, 27.11],
        "wmt16-en-de-12-1": [26.9, 25.75],
    }
    pair = f"{src_lang}-{tgt_lang}"

    readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt16
- allenai
license: apache-2.0
datasets:
- wmt16
metrics:
- bleu
---

# FSMT

## Model description

This is a ported version of fairseq-based [wmt16 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.

For more details, please see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).

All 3 models are available:

* [wmt16-en-de-dist-12-1](https://huggingface.co/allenai/wmt16-en-de-dist-12-1)
* [wmt16-en-de-dist-6-1](https://huggingface.co/allenai/wmt16-en-de-dist-6-1)
* [wmt16-en-de-12-1](https://huggingface.co/allenai/wmt16-en-de-12-1)


## Intended uses & limitations

#### How to use

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)  # {texts[tgt_lang]}

```

#### Limitations and bias


## Training data

Pretrained weights were left identical to the original model released by allenai. For more details, please see the [paper](https://arxiv.org/abs/2006.10369).

## Eval results

Here are the BLEU scores:

model | fairseq | transformers
-------|---------|----------
{model_name} | {scores[model_name][0]} | {scores[model_name][1]}

The score is slightly below the score reported in the paper, as the researchers don't use `sacrebleu` and measure the score on tokenized outputs. The `transformers` score was measured using `sacrebleu` on detokenized outputs.

The score was calculated using this code:

```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt16 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt16 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```

## Data Sources

- [training, etc.](http://www.statmt.org/wmt16/)
- [test set](http://matrix.statmt.org/test_sets/newstest2016.tgz?1504722372)


### BibTeX entry and citation info

```
@misc{{kasai2020deep,
    title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
    author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
    year={{2020}},
    eprint={{2006.10369}},
    archivePrefix={{arXiv}},
    primaryClass={{cs.CL}}
}}
```

"""
    model_card_dir.mkdir(parents=True, exist_ok=True)
    path = os.path.join(model_card_dir, "README.md")
    print(f"Generating {path}")
    with open(path, "w", encoding="utf-8") as f:
        f.write(readme)


# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"

for model_name in ["wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1"]:
    model_card_dir = model_cards_dir / "allenai" / model_name
    write_model_card(model_card_dir, src_lang="en", tgt_lang="de", model_name=model_name)
@ -1,153 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Usage:
# ./gen-card-allenai-wmt19.py

import os
from pathlib import Path


def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
    texts = {
        "en": "Machine learning is great, isn't it?",
        "ru": "Машинное обучение - это здорово, не так ли?",
        "de": "Maschinelles Lernen ist großartig, nicht wahr?",
    }

    # BLEU scores as follows:
    # "pair": [fairseq, transformers]
    scores = {
        "wmt19-de-en-6-6-base": [0, 38.37],
        "wmt19-de-en-6-6-big": [0, 39.90],
    }
    pair = f"{src_lang}-{tgt_lang}"

    readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- allenai
license: apache-2.0
datasets:
- wmt19
metrics:
- bleu
---

# FSMT

## Model description

This is a ported version of fairseq-based [wmt19 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.

For more details, please see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).

Two models are available:

* [wmt19-de-en-6-6-big](https://huggingface.co/allenai/wmt19-de-en-6-6-big)
* [wmt19-de-en-6-6-base](https://huggingface.co/allenai/wmt19-de-en-6-6-base)
## Intended uses & limitations

#### How to use

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```

#### Limitations and bias

## Training data

Pretrained weights were left identical to the original model released by allenai. For more details, please see the [paper](https://arxiv.org/abs/2006.10369).
## Eval results

Here are the BLEU scores:

model | transformers
-------|---------
{model_name} | {scores[model_name][1]}

The score was calculated using this code:

```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
## Data Sources

- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)

### BibTeX entry and citation info

```bibtex
@misc{{kasai2020deep,
    title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
    author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
    year={{2020}},
    eprint={{2006.10369}},
    archivePrefix={{arXiv}},
    primaryClass={{cs.CL}}
}}
```

"""
    model_card_dir.mkdir(parents=True, exist_ok=True)
    path = os.path.join(model_card_dir, "README.md")
    print(f"Generating {path}")
    with open(path, "w", encoding="utf-8") as f:
        f.write(readme)


# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"

for model_name in ["wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"]:
    model_card_dir = model_cards_dir / "allenai" / model_name
    write_model_card(model_card_dir, src_lang="de", tgt_lang="en", model_name=model_name)
@ -1,165 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Usage:
# ./gen-card-facebook-wmt19.py

import os
from pathlib import Path


def write_model_card(model_card_dir, src_lang, tgt_lang):
    texts = {
        "en": "Machine learning is great, isn't it?",
        "ru": "Машинное обучение - это здорово, не так ли?",
        "de": "Maschinelles Lernen ist großartig, oder?",
    }

    # BLEU scores as follows:
    # "pair": [fairseq, transformers]
    scores = {
        "ru-en": ["[41.3](http://matrix.statmt.org/matrix/output/1907?run_id=6937)", "39.20"],
        "en-ru": ["[36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724)", "33.47"],
        "en-de": ["[43.1](http://matrix.statmt.org/matrix/output/1909?run_id=6862)", "42.83"],
        "de-en": ["[42.3](http://matrix.statmt.org/matrix/output/1902?run_id=6750)", "41.35"],
    }
    pair = f"{src_lang}-{tgt_lang}"

    readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- facebook
license: apache-2.0
datasets:
- wmt19
metrics:
- bleu
---

# FSMT

## Model description

This is a ported version of [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for {src_lang}-{tgt_lang}.

For more details, please see [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616).

The abbreviation FSMT stands for FairSeqMachineTranslation.
All four models are available:

* [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru)
* [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en)
* [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de)
* [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en)

## Intended uses & limitations

#### How to use

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```

#### Limitations and bias

- The original model (and this ported version) doesn't seem to handle inputs with repeated sub-phrases well: [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981) (see the probe below)
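A minimal probe of this reported failure mode, assuming the same model as above (whether and where the output degrades depends on the input):

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)

# an input with a repeated sub-phrase - the linked report describes truncated output
text = "I like bananas. I like bananas. I like bananas. I like bananas."
input_ids = tokenizer.encode(text, return_tensors="pt")
print(tokenizer.decode(model.generate(input_ids)[0], skip_special_tokens=True))
```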
## Training data

Pretrained weights were left identical to the original model released by fairseq. For more details, please see the [paper](https://arxiv.org/abs/1907.06616).

## Eval results

pair | fairseq | transformers
-------|---------|----------
{pair} | {scores[pair][0]} | {scores[pair][1]}

The score is slightly below the one reported by `fairseq`, since `transformers` currently doesn't support:
- model ensemble, therefore the best performing checkpoint was ported (`model4.pt`),
- re-ranking (a rough sketch of what this could look like follows below).
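For illustration only, here is a sketch of how checkpoint-ensemble re-ranking could be approximated on top of `transformers`, assuming several scorer checkpoints were available (only one was actually ported, so `scorers` below holds a single model):

```python
import torch
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
scorers = [model]  # with more ported checkpoints this list would hold all of them

input_ids = tokenizer.encode("{texts[src_lang]}", return_tensors="pt")
candidates = model.generate(input_ids, num_beams=15, num_return_sequences=15)

def avg_neg_loss(cand):
    # average per-token log-likelihood across scorers; pad tokens are masked out
    labels = cand.unsqueeze(0).clone()
    labels[labels == tokenizer.pad_token_id] = -100
    with torch.no_grad():
        losses = [m(input_ids=input_ids, labels=labels).loss.item() for m in scorers]
    return -sum(losses) / len(losses)

best = max(candidates, key=avg_neg_loss)
print(tokenizer.decode(best, skip_special_tokens=True))
```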
The score was calculated using this code:

```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=15
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```

Note: fairseq reports using a beam of 50, so you should get a slightly higher score if you re-run with `--num_beams 50`.
## Data Sources

- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)

### BibTeX entry and citation info

```bibtex
@inproceedings{{...,
    year={{2020}},
    title={{Facebook FAIR's WMT19 News Translation Task Submission}},
    author={{Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey}},
    booktitle={{Proc. of WMT}},
}}
```

## TODO

- port model ensemble (fairseq uses 4 model checkpoints)

"""
    os.makedirs(model_card_dir, exist_ok=True)
    path = os.path.join(model_card_dir, "README.md")
    print(f"Generating {path}")
    with open(path, "w", encoding="utf-8") as f:
        f.write(readme)


# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"

for model_name in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
    base, src_lang, tgt_lang = model_name.split("-")
    model_card_dir = model_cards_dir / "facebook" / model_name
    write_model_card(model_card_dir, src_lang=src_lang, tgt_lang=tgt_lang)
@ -1,116 +0,0 @@
# this is the process of uploading the updated models to s3. As I can't upload them directly to the correct orgs, this script shows how this is done
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

1. upload updated models to my account

transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1

2. ask someone to move them to:

* to facebook: "wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"
* to allenai: "wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1", "wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"
export b="s3://models.huggingface.co/bert"
stas_to_fb () {
    src=$1
    shift
    aws s3 sync $b/stas/$src $b/facebook/$src $@
}

stas_to_allenai () {
    src=$1
    shift
    aws s3 sync $b/stas/$src $b/allenai/$src $@
}

stas_to_fb wmt19-en-ru
stas_to_fb wmt19-ru-en
stas_to_fb wmt19-en-de
stas_to_fb wmt19-de-en

stas_to_allenai wmt16-en-de-dist-12-1
stas_to_allenai wmt16-en-de-dist-6-1
stas_to_allenai wmt16-en-de-12-1
stas_to_allenai wmt19-de-en-6-6-base
stas_to_allenai wmt19-de-en-6-6-big
3. and then remove all these model files from my account

transformers-cli s3 rm wmt16-en-de-12-1/config.json
transformers-cli s3 rm wmt16-en-de-12-1/merges.txt
transformers-cli s3 rm wmt16-en-de-12-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-12-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-12-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-12-1/vocab-tgt.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/config.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/merges.txt
transformers-cli s3 rm wmt16-en-de-dist-12-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-dist-12-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-tgt.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/config.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/merges.txt
transformers-cli s3 rm wmt16-en-de-dist-6-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-dist-6-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en-6-6-base/config.json
transformers-cli s3 rm wmt19-de-en-6-6-base/merges.txt
transformers-cli s3 rm wmt19-de-en-6-6-base/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en-6-6-base/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-src.json
transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en-6-6-big/config.json
transformers-cli s3 rm wmt19-de-en-6-6-big/merges.txt
transformers-cli s3 rm wmt19-de-en-6-6-big/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en-6-6-big/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-src.json
transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en/config.json
transformers-cli s3 rm wmt19-de-en/merges.txt
transformers-cli s3 rm wmt19-de-en/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en/vocab-src.json
transformers-cli s3 rm wmt19-de-en/vocab-tgt.json
transformers-cli s3 rm wmt19-en-de/config.json
transformers-cli s3 rm wmt19-en-de/merges.txt
transformers-cli s3 rm wmt19-en-de/pytorch_model.bin
transformers-cli s3 rm wmt19-en-de/tokenizer_config.json
transformers-cli s3 rm wmt19-en-de/vocab-src.json
transformers-cli s3 rm wmt19-en-de/vocab-tgt.json
transformers-cli s3 rm wmt19-en-ru/config.json
transformers-cli s3 rm wmt19-en-ru/merges.txt
transformers-cli s3 rm wmt19-en-ru/pytorch_model.bin
transformers-cli s3 rm wmt19-en-ru/tokenizer_config.json
transformers-cli s3 rm wmt19-en-ru/vocab-src.json
transformers-cli s3 rm wmt19-en-ru/vocab-tgt.json
transformers-cli s3 rm wmt19-ru-en/config.json
transformers-cli s3 rm wmt19-ru-en/merges.txt
transformers-cli s3 rm wmt19-ru-en/pytorch_model.bin
transformers-cli s3 rm wmt19-ru-en/tokenizer_config.json
transformers-cli s3 rm wmt19-ru-en/vocab-src.json
transformers-cli s3 rm wmt19-ru-en/vocab-tgt.json
@ -1,19 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# these tests need to be run before any changes to FSMT-related code - they should cover all bases

CUDA_VISIBLE_DEVICES="" RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py
RUN_SLOW=1 pytest --disable-warnings tests/test_tokenization_fsmt.py tests/test_configuration_auto.py tests/test_modeling_fsmt.py examples/seq2seq/test_fsmt_bleu_score.py
@ -1,34 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus

# 1. pip install sentencepiece
#
# 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

# 3. build
import sentencepiece as spm


# pegasus:
# 1. no bos
# 2. eos_id is 1
# 3. unk_id is 2
# build a sample spm file accordingly
spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000')
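
# (illustrative addition, not part of the original script) sanity-check the
# freshly built model against the pegasus expectations above before moving it:
sp = spm.SentencePieceProcessor()
sp.load("test_sentencepiece_no_bos.model")
assert sp.bos_id() == -1  # no bos
assert sp.eos_id() == 1
assert sp.unk_id() == 2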

# 4. now update the fixture
# mv test_sentencepiece_no_bos.model ../../tests/fixtures/
@ -1,72 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

Set up transformers following the instructions in README.md (I would fork first):

```bash
git clone git@github.com:huggingface/transformers.git
cd transformers
pip install -e .
pip install pandas GitPython wget
```

Get the required metadata:

```bash
curl https://cdn-datasets.huggingface.co/language_codes/language-codes-3b2.csv > language-codes-3b2.csv
curl https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv > iso-639-3.csv
```

Clone the Tatoeba-Challenge repo inside transformers:

```bash
git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git
```

To convert a few models, call the conversion script from the command line:

```bash
python src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py --models heb-eng eng-heb --save_dir converted
```

To convert many models, pass your list of Tatoeba model names to `resolver.convert_models` in a Python client or script:

```python
from transformers.models.marian.convert_marian_tatoeba_to_pytorch import TatoebaConverter

resolver = TatoebaConverter(save_dir='converted')
resolver.convert_models(['heb-eng', 'eng-heb'])
```


### Upload converted models

Since version v3.5.0, the model sharing workflow has switched to a git-based system. Refer to the [model sharing doc](https://huggingface.co/transformers/main/model_sharing.html#model-sharing-and-uploading) for more details.

To upload all converted models:

1. Install [git-lfs](https://git-lfs.github.com/).

2. Log in to `huggingface-cli`:

```bash
huggingface-cli login
```

3. Run the `upload_models` script:

```bash
./scripts/tatoeba/upload_models.sh
```
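
As an alternative to the shell loop in `upload_models.sh`, recent versions of `huggingface_hub` can do the same from Python; a sketch, assuming one converted model directory (the model name below is only an example):

```python
from huggingface_hub import HfApi

api = HfApi()
model_name = "opus-mt-heb-eng"  # example name; iterate over converted/* in practice
api.create_repo(repo_id=f"Helsinki-NLP/{model_name}", exist_ok=True)
api.upload_folder(
    folder_path=f"converted/{model_name}",
    repo_id=f"Helsinki-NLP/{model_name}",
    commit_message="initial commit",
)
```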

### Modifications

- To change the naming logic, change the code near `os.rename`. The model card creation code may also need to change.
- To change the model card content, you must modify `TatoebaCodeResolver.write_model_card`.
@ -1,12 +0,0 @@
#!/bin/bash

# for each converted model dir: create the hub repo, clone it, move the
# converted files in, and push the initial commit
for FILE in converted/*; do
    model_name=$(basename $FILE)
    huggingface-cli repo create $model_name -y
    git clone https://huggingface.co/Helsinki-NLP/$model_name
    mv $FILE/* $model_name/
    cd $model_name
    git add . && git commit -m "initial commit"
    git push
    cd ..
done
setup.py
@ -163,7 +163,6 @@ _deps = [
    "rjieba",
    "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
    "ruff==0.11.2",
    "sacrebleu>=1.4.12,<2.0.0",
    "sacremoses",
    "safetensors>=0.4.3",
    "sagemaker>=2.31.0",
@ -344,7 +343,6 @@ extras["testing"] = (
    "evaluate",
    "pytest-timeout",
    "ruff",
    "sacrebleu",
    "rouge-score",
    "nltk",
    "GitPython",
@ -69,7 +69,6 @@ deps = {
    "rjieba": "rjieba",
    "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
    "ruff": "ruff==0.11.2",
    "sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
    "sacremoses": "sacremoses",
    "safetensors": "safetensors>=0.4.3",
    "sagemaker": "sagemaker>=2.31.0",
@ -112,6 +112,7 @@ _CONFIG_FOR_DOC = "FSMTConfig"
"""

Here is how to compare BLEU scores against the fairseq implementation:
(don't forget to install sacrebleu: `pip install sacrebleu`)

# en-ru