Add line by line option to mlm/plm scripts (#8240)
* Make line by line optional in run_mlm
* Add option to disable dynamic padding
* Add option to plm too and update README
* Typos
* More typos
* Even more typos
* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
parent
ebec410c71
commit
e1b1b614b1
@@ -77,10 +77,16 @@ python run_clm.py \
     --output_dir /tmp/test-clm
 ```

+If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
+concatenates all texts and then splits them in blocks of the same length).
+
+**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
+sure all your batches have the same length.
+
 ### Whole word masking

 The BERT authors released a new version of BERT using Whole Word Masking in May 2019. Instead of masking randomly
-selected tokens (which may be aprt of words), they mask randomly selected words (masking all the tokens corresponding
+selected tokens (which may be part of words), they mask randomly selected words (masking all the tokens corresponding
 to that word). This technique has been refined for Chinese in [this paper](https://arxiv.org/abs/1906.08101).

 To fine-tune a model using whole word masking, use the following script:
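The `--line_by_line` / `--pad_to_max_length` behaviour added above can be pictured without any library at all. A minimal, dependency-free sketch (the pad id and the whitespace "tokenizer" below are made up for illustration, not the scripts' actual tokenization):

```python
# Sketch of --line_by_line plus --pad_to_max_length: keep each non-empty line as
# its own sample, then pad every sample to one fixed length so all batches have
# identical shapes (what TPUs prefer). Token ids and PAD_ID are fake.
PAD_ID = 0
MAX_SEQ_LENGTH = 6

raw_lines = ["first sample", "", "   ", "a longer second sample"]
# Drop empty / whitespace-only lines, exactly like the scripts do.
lines = [line for line in raw_lines if len(line) > 0 and not line.isspace()]

# Stand-in "tokenizer": map each word to a fake id.
encoded = [[hash(w) % 100 + 1 for w in line.split()] for line in lines]

# --pad_to_max_length: pad (or truncate) every sample to MAX_SEQ_LENGTH.
padded = [(ids + [PAD_ID] * MAX_SEQ_LENGTH)[:MAX_SEQ_LENGTH] for ids in encoded]
print(padded)  # every inner list has length 6
```

Without `--line_by_line`, the scripts instead concatenate all texts and cut them into fixed-size blocks, as sketched after the tokenization hunk further down.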
@@ -111,8 +117,8 @@ It works well on so many Chines Task like CLUE (Chinese GLUE). They use LTP, so
 we need LTP.

 Now LTP only only works well on `transformers==3.2.0`. So we don't add it to requirements.txt.
-You need to create a separate enviromnent with this version of Transformers to run the `run_chinese_ref.py` script that
-will create the reference files. The script is in `examples/contrib`. Once in the proper enviromnent, run the
+You need to create a separate environment with this version of Transformers to run the `run_chinese_ref.py` script that
+will create the reference files. The script is in `examples/contrib`. Once in the proper environment, run the
 following:


@@ -144,6 +150,8 @@ python run_mlm_wwm.py \
     --output_dir /tmp/test-mlm-wwm
 ```

+**Note:** On TPU, you should the flag `--pad_to_max_length` to make sure all your batches have the same length.
+
 ### XLNet and permutation language modeling

 XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
@@ -179,3 +187,9 @@ python run_plm.py \
     --do_eval \
     --output_dir /tmp/test-plm
 ```
+
+If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
+concatenates all texts and then splits them in blocks of the same length).
+
+**Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
+sure all your batches have the same length.
@@ -116,6 +116,17 @@ class DataTrainingArguments:
     mlm_probability: float = field(
         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
     )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )

     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
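For context, the example scripts turn dataclass fields like these into command-line flags with `transformers.HfArgumentParser`. A minimal sketch under that assumption; `MiniDataArgs` is a stand-in, not the script's real `DataTrainingArguments`:

```python
# Minimal sketch: bool fields with default=False become flags such as
# `--line_by_line` and `--pad_to_max_length` when parsed by HfArgumentParser.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class MiniDataArgs:
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={"help": "Whether to pad all samples to `max_seq_length`."},
    )


if __name__ == "__main__":
    parser = HfArgumentParser(MiniDataArgs)
    # e.g. `python mini_args_sketch.py --line_by_line --pad_to_max_length`
    (args,) = parser.parse_args_into_dataclasses()
    print(args)
```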
@@ -246,18 +257,73 @@ def main():
     column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

-    def tokenize_function(examples):
-        # Remove empty lines
-        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
-
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=[text_column_name],
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name])
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )

     # Data collator
     # This one will take care of randomly masking the tokens.
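The `group_texts` branch added above is easy to check on toy data. A dependency-free sketch with made-up token ids (not the script itself):

```python
# Toy illustration of the concatenate-then-chunk branch: concatenate the
# already-tokenized examples, split into fixed-size blocks, drop the remainder.
max_seq_length = 4

def group_texts(examples):
    # Concatenate all lists for each key (e.g. "input_ids", "attention_mask").
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated[list(examples.keys())[0]])
    # Drop the tail that does not fill a whole block.
    total_length = (total_length // max_seq_length) * max_seq_length
    return {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated.items()
    }

batch = {"input_ids": [[101, 7, 8, 102], [101, 9, 10, 11, 12, 102]]}
print(group_texts(batch))
# {'input_ids': [[101, 7, 8, 102], [101, 9, 10, 11]]} -- the last 2 ids are dropped
```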
@@ -120,6 +120,13 @@ class DataTrainingArguments:
     mlm_probability: float = field(
         default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
     )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )

     def __post_init__(self):
         if self.train_file is not None:
@@ -253,10 +260,12 @@ def main():
     column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

+    padding = "max_length" if data_args.pad_to_max_length else False
+
     def tokenize_function(examples):
         # Remove empty lines
         examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
+        return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)

     tokenized_datasets = datasets.map(
         tokenize_function,
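The `padding` switch above maps `--pad_to_max_length` onto the tokenizer's `padding` argument. A small sketch of the difference, assuming the `bert-base-uncased` checkpoint can be downloaded (the example strings and `max_length=16` are arbitrary):

```python
# padding="max_length" gives every sample the same, fixed length up front
# (TPU-friendly); padding=False leaves lengths uneven so the data collator can
# pad each batch dynamically to its longest sample.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
lines = ["Short line.", "A somewhat longer line of text."]

static = tokenizer(lines, padding="max_length", truncation=True, max_length=16)
dynamic = tokenizer(lines, padding=False, truncation=True, max_length=16)

print([len(ids) for ids in static["input_ids"]])   # [16, 16]
print([len(ids) for ids in dynamic["input_ids"]])  # varying lengths, e.g. [5, 9]
```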
@@ -113,6 +113,17 @@ class DataTrainingArguments:
     max_span_length: int = field(
         default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
     )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
+        },
+    )

     def __post_init__(self):
         if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -243,18 +254,73 @@ def main():
     column_names = datasets["validation"].column_names
     text_column_name = "text" if "text" in column_names else column_names[0]

-    def tokenize_function(examples):
-        # Remove empty lines
-        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-        return tokenizer(examples["text"], truncation=True, max_length=data_args.max_seq_length)
-
-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=[text_column_name],
-        load_from_cache_file=not data_args.overwrite_cache,
-    )
+    if data_args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if data_args.pad_to_max_length else False
+
+        def tokenize_function(examples):
+            # Remove empty lines
+            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name])
+
+        tokenized_datasets = datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+
+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts.
+            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )

     # Data collator
     data_collator = DataCollatorForPermutationLanguageModeling(
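The final context line shows the permutation-LM data collator being constructed. A heavily hedged sketch of feeding it fixed, even-length inputs; this assumes PyTorch is installed, a recent `transformers` release, and that the `xlnet-base-cased` checkpoint can be downloaded:

```python
# Sketch: pad to a fixed, even sequence length (the PLM collator requires even
# lengths), then let it build perm_mask / target_mapping / labels for XLNet.
from transformers import AutoTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
enc = tokenizer(
    ["One sample per line.", "Another, slightly longer sample line."],
    padding="max_length", truncation=True, max_length=16,
)
collator = DataCollatorForPermutationLanguageModeling(tokenizer=tokenizer, max_span_length=5)
batch = collator([{"input_ids": ids} for ids in enc["input_ids"]])
print({k: v.shape for k, v in batch.items()})  # input_ids, perm_mask, target_mapping, labels
```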