Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 21:00:08 +06:00)
chore: enhance message descriptions in parameters,comments,logs and docstrings (#36554)
* chore: enhance message descriptions in parameters, comments, logs and docstrings
* Update src/transformers/hf_argparser.py
* Update src/transformers/keras_callbacks.py

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
Parent: 6966fa1901
Commit: 9e84b38135
@@ -191,7 +191,7 @@ class PretrainedConfig(PushToHubMixin):
 v5.
 loss_type (`str`, *optional*):
 The type of loss that the model should use. It should be in `LOSS_MAPPING`'s keys, otherwise the loss will
-be automatically infered from the model architecture.
+be automatically inferred from the model architecture.
 """

 model_type: str = ""
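For context on the `loss_type` field this hunk documents, a minimal sketch of overriding it on a config; the checkpoint name and the `"ForCausalLM"` key are illustrative assumptions, not taken from this diff:

```python
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")  # illustrative checkpoint
config.loss_type = "ForCausalLM"  # assumed key; must be in LOSS_MAPPING, otherwise the loss is inferred from the architecture
model = AutoModelForCausalLM.from_pretrained("gpt2", config=config)
```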
@@ -254,7 +254,7 @@ class PretrainedConfig(PushToHubMixin):
 if num_labels is not None and len(self.id2label) != num_labels:
 logger.warning(
 f"You passed along `num_labels={num_labels}` with an incompatible id to label map: "
-f"{self.id2label}. The number of labels wil be overwritten to {self.num_labels}."
+f"{self.id2label}. The number of labels will be overwritten to {self.num_labels}."
 )
 self.id2label = {int(key): value for key, value in self.id2label.items()}
 # Keys are always strings in JSON so convert ids to int here.
@@ -1094,7 +1094,7 @@ class PretrainedConfig(PushToHubMixin):
 is_default_in_config = is_default_generation_value = None
 parameter_value = getattr(self_decoder_config, parameter_name)
 # Three cases in which is okay for the model config to hold generation config parameters:
-# 1. The parameter is set to `None`, effectivelly delegating its value to the generation config
+# 1. The parameter is set to `None`, effectively delegating its value to the generation config
 if parameter_value is None:
 continue
 # 2. If we have a default config, then the instance should hold the same generation defaults
@@ -1727,5 +1727,5 @@ def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokeni
 raise ValueError(
 f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path "
 f"with a SentencePiece tokenizer.model file."
-f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
+f"Currently available slow->fast converters: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
 )
@@ -201,7 +201,7 @@ class HfArgumentParser(ArgumentParser):
 else:
 kwargs["required"] = True
 elif field.type is bool or field.type == Optional[bool]:
-# Copy the currect kwargs to use to instantiate a `no_*` complement argument below.
+# Copy the correct kwargs to use to instantiate a `no_*` complement argument below.
 # We do not initialize it here because the `no_*` alternative must be instantiated after the real argument
 bool_kwargs = copy(kwargs)

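To illustrate the `no_*` complement argument discussed in this hunk, a small sketch with a made-up dataclass and flag name:

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class ExampleArguments:
    use_cache: bool = field(default=True, metadata={"help": "Whether to use the cache."})  # hypothetical flag


parser = HfArgumentParser(ExampleArguments)
# For a bool field the parser registers both `--use_cache` and a `--no_use_cache` complement.
(args,) = parser.parse_args_into_dataclasses(["--no_use_cache"])
print(args.use_cache)  # False
```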
@@ -585,7 +585,7 @@ def center_to_corners_format(bboxes_center: TensorType) -> TensorType:

 center format: contains the coordinate for the center of the box and its width, height dimensions
 (center_x, center_y, width, height)
-corners format: contains the coodinates for the top-left and bottom-right corners of the box
+corners format: contains the coordinates for the top-left and bottom-right corners of the box
 (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
 """
 # Function is used during model forward pass, so we use the input framework if possible, without
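As a concrete reference for the conversion this docstring describes, a minimal NumPy sketch (a simplified stand-in, not the library implementation, which dispatches on the input framework):

```python
import numpy as np


def center_to_corners(bboxes_center: np.ndarray) -> np.ndarray:
    # (center_x, center_y, width, height) -> (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    center_x, center_y, width, height = bboxes_center.T
    return np.stack(
        [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height],
        axis=-1,
    )


print(center_to_corners(np.array([[50.0, 50.0, 20.0, 10.0]])))  # [[40. 45. 60. 55.]]
```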
@@ -545,7 +545,7 @@ def default_sample_indices_fn(metadata: VideoMetadata, num_frames=None, fps=None

 Args:
 metadata (`VideoMetadata`):
-`VideoMetadata` object containing metadat about the video, such as "total_num_frames" or "fps".
+`VideoMetadata` object containing metadata about the video, such as "total_num_frames" or "fps".
 num_frames (`int`, *optional*):
 Number of frames to sample uniformly.
 fps (`int`, *optional*):
@@ -137,9 +137,9 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
 """
 This function returns necessary arguments to call `flash_attn_varlen_func`.
 All three query, key, value states will be flattened.
-Cummulative lengths of each examples in the batch will be extracted from position_ids.
+Cumulative lengths of each examples in the batch will be extracted from position_ids.

-NOTE: ideally cummulative lengths should be prepared at the data collator stage
+NOTE: ideally cumulative lengths should be prepared at the data collator stage

 Arguments:
 query (`torch.Tensor`):
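To make the "cumulative lengths from position_ids" idea concrete, a small PyTorch sketch of the underlying trick (a simplified illustration, not the exact helper from this diff): every position where `position_ids` restarts at 0 marks the start of a packed example.

```python
import torch


def cu_seqlens_from_position_ids(position_ids: torch.Tensor) -> torch.Tensor:
    # position_ids for packed examples restarts at 0 for each sequence, e.g. [0, 1, 2, 0, 1, 0, 1, 2, 3]
    flat = position_ids.flatten()
    seq_starts = torch.nonzero(flat == 0, as_tuple=False).flatten()
    total_len = torch.tensor([flat.numel()], device=flat.device)
    # Cumulative sequence lengths in the format expected by `flash_attn_varlen_func`: [0, len_0, len_0 + len_1, ...]
    return torch.cat([seq_starts, total_len]).to(torch.int32)


position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])
print(cu_seqlens_from_position_ids(position_ids))  # tensor([0, 3, 5, 9], dtype=torch.int32)
```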
@@ -268,7 +268,7 @@ def _flash_attention_forward(
 softmax_scale (`float`, *optional*):
 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
 use_top_left_mask (`bool`, defaults to `False`):
-flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
+flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
 softcap (`float`, *optional*):
 Softcap for the attention logits, used e.g. in gemma2.
 deterministic (`bool`, *optional*):
@@ -374,9 +374,9 @@ class FlashAttentionKwargs(TypedDict, total=False):

 Attributes:
 cu_seq_lens_q (`torch.LongTensor`, *optional*)
-Gets cumlative sequence length for query state.
+Gets cumulative sequence length for query state.
 cu_seq_lens_k (`torch.LongTensor`, *optional*)
-Gets cumlative sequence length for key state.
+Gets cumulative sequence length for key state.
 max_length_q (`int`, *optional*):
 Maximum sequence length for query state.
 max_length_k (`int`, *optional*):
@@ -367,7 +367,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):

 def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
 r"""
-Cast the floating-point `parmas` to `jax.numpy.float32`. This method can be used to explicitly convert the
+Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
 model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place.

 Arguments:
@@ -394,7 +394,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):

 def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
 r"""
-Cast the floating-point `parmas` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
+Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the
 `params` in place.

 This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full
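For context, the casting helpers documented here are typically used like this (the checkpoint name is an illustrative assumption):

```python
from transformers import FlaxBertModel

model = FlaxBertModel.from_pretrained("bert-base-cased")  # illustrative checkpoint
model.params = model.to_fp16(model.params)  # returns a new fp16 param tree; nothing is cast in place
model.params = model.to_fp32(model.params)  # cast back to full precision
```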
@@ -510,7 +510,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 `bool`: Whether this model can generate sequences with `.generate()`.
 """
 # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.
-# Alternativelly, the model can also have a custom `generate` function.
+# Alternatively, the model can also have a custom `generate` function.
 if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
 return False
 return True
@@ -968,7 +968,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 )
 cls._missing_keys = missing_keys

-# Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
+# Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
 # matching the weights in the model.
 mismatched_keys = []
 for key in state.keys():
@@ -373,7 +373,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
 # to add this patch to ensure things work correctly on our side.
 if "llama" in architecture and "mistral" in model_name:
 updated_architecture = "mistral"
-# FIXME: Currnetly this implementation is only for flan-t5 architecture.
+# FIXME: Currently this implementation is only for flan-t5 architecture.
 # It needs to be developed for supporting legacy t5.
 elif "t5" in architecture or "t5encoder" in architecture:
 parsed_parameters["config"]["is_gated_act"] = True
@@ -437,7 +437,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
 logger.info(f"Some keys were not parsed and added into account {gguf_key} | {value}")

 # retrieve config vocab_size from tokenizer
-# Pleas refer to https://github.com/huggingface/transformers/issues/32526 for more details
+# Please refer to https://github.com/huggingface/transformers/issues/32526 for more details
 if "vocab_size" not in parsed_parameters["config"]:
 tokenizer_parameters = parsed_parameters["tokenizer"]
 if "tokens" in tokenizer_parameters:
@@ -795,7 +795,7 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
 ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys

 Returns:
-`keras.models.Model`: Three lists, one for the layers that were found and succesfully restored (from the
+`keras.models.Model`: Three lists, one for the layers that were found and successfully restored (from the
 shard file), one for the mismatched layers, and another one for the unexpected layers.
 """
 saved_weight_names_set = set()
@@ -868,7 +868,7 @@ def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatch
 f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' "
 f"at '{resolved_archive_file}'. "
 "If you tried to load a TF model from a sharded checkpoint, you should try converting the model "
-"by loading it in pytorch and saving it localy. A convertion script should be realeased soon."
+"by loading it in pytorch and saving it locally. A convertion script should be released soon."
 )

@@ -1391,7 +1391,7 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
 `bool`: Whether this model can generate sequences with `.generate()`.
 """
 # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation.
-# Alternativelly, the model can also have a custom `generate` function.
+# Alternatively, the model can also have a custom `generate` function.
 if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate):
 return False
 return True
@@ -1324,7 +1324,7 @@ def _find_mismatched_keys(
 and state_dict[checkpoint_key].numel() * 2 == model_state_dict[model_key].numel()
 ):
 # This skips size mismatches for 4-bit weights. Two 4-bit values share an 8-bit container, causing size differences.
-# Without matching with module type or paramter type it seems like a practical way to detect valid 4bit weights.
+# Without matching with module type or parameter type it seems like a practical way to detect valid 4bit weights.
 pass
 else:
 mismatched_keys.append(
@@ -1616,7 +1616,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example)
 4. The default model's implementation otherwise (`LlamaAttention` for example) .
 """
-# Here we use config._attn_implementation_internal to check whether the attention implementation was explicitely set by the user.
+# Here we use config._attn_implementation_internal to check whether the attention implementation was explicitly set by the user.
 # The property `PretrainedConfig._attn_implementation` is never `None`, for backward compatibility (always fall back on "eager").
 # The `hasattr` here is used as some Transformers tests for some reason do not call PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model)
 requested_attn_implementation = None
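The priority order above is what backs the public `attn_implementation` argument; a short usage sketch (the checkpoint name is an illustrative assumption):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # illustrative checkpoint
    attn_implementation="sdpa",  # or "flash_attention_2" / "eager"; omit to let the priority order above decide
)
```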
@@ -2207,7 +2207,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 if new_num_tokens is None and pad_to_multiple_of is None:
 return model_embeds

-# Since we are basically resuing the same old embeddings with new weight values, gathering is required
+# Since we are basically reusing the same old embeddings with new weight values, gathering is required
 is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None
 if is_deepspeed_zero3_enabled() and not is_quantized:
 import deepspeed
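This hunk lives inside the token-embedding resizing path; a typical call pattern, sketched with an illustrative checkpoint and a made-up token:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

tokenizer.add_tokens(["<my_new_token>"])  # hypothetical added token
# Grow the input (and tied output) embeddings to the new vocabulary size, optionally padded for efficiency.
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
```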
@@ -2574,7 +2574,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 sample_shape=(added_num_tokens,)
 ).to(old_embeddings.weight.dtype)
 else:
-# Otherwise, just initialize with the mean. because distribtion will not be created.
+# Otherwise, just initialize with the mean. because distribution will not be created.
 new_embeddings.weight.data[-1 * added_num_tokens :, :] = (
 mean_embeddings[None, :].repeat(added_num_tokens, 1).to(old_embeddings.weight.dtype)
 )
@@ -2593,7 +2593,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 new_lm_head.weight.data = new_lm_head.weight.data.T
 old_lm_head.weight.data = old_lm_head.weight.data.T

-# The same initilization logic as Embeddings.
+# The same initialization logic as Embeddings.
 self._init_added_embeddings_weights_with_mean(
 old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens
 )
@@ -2740,7 +2740,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 """
 if self.supports_gradient_checkpointing:
 # For old GC format (transformers < 4.35.0) for models that live on the Hub
-# we will fall back to the overwritten `_set_gradient_checkpointing` methid
+# we will fall back to the overwritten `_set_gradient_checkpointing` method
 _is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters
 if not _is_using_old_format:
 self._set_gradient_checkpointing(enable=False)
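The snippet above belongs to the gradient-checkpointing toggle; from the user side it is driven by the public helpers below (checkpoint name is illustrative):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative checkpoint
model.gradient_checkpointing_enable()   # trade extra compute for lower activation memory during training
model.gradient_checkpointing_disable()  # dispatches to `_set_gradient_checkpointing`, handling the old Hub format
```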
@@ -2979,7 +2979,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 if ignore_key in state_dict.keys():
 del state_dict[ignore_key]

-# Rename state_dict keys before saving to file. Do nothing unless overriden in a particular model.
+# Rename state_dict keys before saving to file. Do nothing unless overridden in a particular model.
 # (initially introduced with TimmWrapperModel to remove prefix and make checkpoints compatible with timm)
 state_dict = self._fix_state_dict_keys_on_save(state_dict)

@@ -4998,7 +4998,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 shard_file, is_quantized=is_quantized, map_location="meta", weights_only=weights_only
 )

-# Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
+# Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
 # matching the weights in the model.
 mismatched_keys += _find_mismatched_keys(
 state_dict,
@@ -5321,13 +5321,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 """
 Tensor parallelize the model across the given device mesh. This function is a helper to be called after the model
 was already loaded in memory, note however that this means that each process will first initialize the whole model,
-then parallelize it accross devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.
+then parallelize it across devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.

-Calling `from_pretrained(..., tp_plan="auto")` is prefered, and will parallelize module-by-module during initialization,
+Calling `from_pretrained(..., tp_plan="auto")` is preferred, and will parallelize module-by-module during initialization,
 so that the expected per-device memory spike at loading time is not larger than the final model size on each device.
 Tensor parallelize the model across the given device mesh. This function is a helper to be called after the model
 was already loaded in memory, note however that this means that each process will first initialize the whole model,
-then parallelize it accross devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.
+then parallelize it across devices. Thus there is a huge waste of GPU memory, and this can lead to OOM at loading time.

 Args:
 device_mesh (`torch.distributed.DeviceMesh`):
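The docstring's recommendation can be sketched as follows; the checkpoint name is an assumption, and the script is expected to be launched with `torchrun` on a multi-GPU node:

```python
import torch
from transformers import AutoModelForCausalLM

# e.g. `torchrun --nproc-per-node 4 tp_example.py`, one process per GPU
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",  # illustrative checkpoint
    torch_dtype=torch.bfloat16,
    tp_plan="auto",  # shard module-by-module at load time instead of calling `tensor_parallel` afterwards
)
```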
@@ -5869,7 +5869,7 @@ def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module:

 def expand_device_map(device_map, param_names, start_prefix):
 """
-Expand a device map to return the correspondance parameter name to device.
+Expand a device map to return the correspondence parameter name to device.
 """
 new_device_map = {}
 param_names = [p[len(start_prefix) :] for p in param_names if p.startswith(start_prefix)]
@@ -901,7 +901,7 @@ class ProcessorMixin(PushToHubMixin):
 ```python
 tokenizer = tokenizer_class(..., {"padding": "max_length"})
 image_processor = image_processor_class(...)
-processor(tokenizer, image_processor) # will pass max_length unless overriden by kwargs at call
+processor(tokenizer, image_processor) # will pass max_length unless overridden by kwargs at call
 ```
 4) defaults kwargs specified at processor level have lowest priority.
 ```python
@@ -1205,7 +1205,7 @@ class ProcessorMixin(PushToHubMixin):
 video models might want to specify in the prompt the duration of video or which frame indices at which timestamps
 were sampled. This information cannot be accessed before the video is loaded.

-For most models it is a no-op, and must be overriden by model processors which require special processing.
+For most models it is a no-op, and must be overridden by model processors which require special processing.

 Args:
 conversation (`List[Dict, str, str]`):
@@ -1372,7 +1372,7 @@ class ProcessorMixin(PushToHubMixin):
 if tokenize:
 # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing
 # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt
-# and pass it to the processor. Users thus never worried about special tokens relying on processor hadnling
+# and pass it to the processor. Users thus never worried about special tokens relying on processor handling
 # everything internally. The below line is to keep BC for that and be able to work with model that have
 # special tokens in the template (consistent with tokenizers). We dont want to raise warning, it will flood command line
 # without actionable solution for users
@@ -2407,7 +2407,7 @@ class SubprocessCallException(Exception):
 def run_command(command: List[str], return_stdout=False):
 """
 Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
-if an error occured while running `command`
+if an error occurred while running `command`
 """
 try:
 output = subprocess.check_output(command, stderr=subprocess.STDOUT)
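A self-contained sketch of the pattern this testing helper implements (simplified; the exact message formatting is an assumption):

```python
import subprocess
from typing import List


class SubprocessCallException(Exception):
    pass


def run_command(command: List[str], return_stdout: bool = False):
    # Run the command, capture stderr alongside stdout, and surface failures with their output attached.
    try:
        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
        if return_stdout:
            return output.decode("utf-8")
    except subprocess.CalledProcessError as e:
        raise SubprocessCallException(
            f"Command `{' '.join(command)}` failed with the following output:\n{e.output.decode()}"
        ) from e


print(run_command(["python", "--version"], return_stdout=True))
```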
@@ -2541,7 +2541,7 @@ def hub_retry(max_attempts: int = 5, wait_before_retry: Optional[float] = 2):
 requests.exceptions.RequestException,
 ) as err:
 logger.error(
-f"Test failed with {err} at try {retry_count}/{max_attempts} as it couldn't connect to the specied Hub repository."
+f"Test failed with {err} at try {retry_count}/{max_attempts} as it couldn't connect to the specified Hub repository."
 )
 if wait_before_retry is not None:
 time.sleep(wait_before_retry)
@@ -2661,7 +2661,7 @@ def run_test_using_subprocess(func):
 The following contains utils to run the documentation tests without having to overwrite any files.

 The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is
-made as a print would otherwise fail the corresonding line.
+made as a print would otherwise fail the corresponding line.

 To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules <path_to_files_to_test>
 """
@@ -708,7 +708,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
 added_tokens_file = os.path.join(
 save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
 )
-# make sure to be foward compatible
+# make sure to be forward compatible
 added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
 if added_vocab:
 with open(added_tokens_file, "w", encoding="utf-8") as f:
@@ -2266,7 +2266,7 @@ class Trainer:
 (self.model_wrapped,) = release_memory(self.model_wrapped)
 self.model_wrapped = self.model

-# Check for DeepSpeed *after* the intial pass and modify the config
+# Check for DeepSpeed *after* the initial pass and modify the config
 if self.is_deepspeed_enabled:
 # Temporarily unset `self.args.train_batch_size`
 original_bs = self.args.per_device_train_batch_size
@@ -2826,7 +2826,7 @@ class Trainer:
 # Checkpoint must have been saved with the old smp api.
 if hasattr(self.args, "fp16") and self.args.fp16 is True:
 logger.warning(
-"Enabling FP16 and loading from smp < 1.10 checkpoint together is not suppported."
+"Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
 )
 state_dict = torch.load(
 weights_file,
@@ -4091,7 +4091,7 @@ class Trainer:
 A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
 dictionary also contains the epoch number which comes from the training state.
 """
-# handle multipe eval datasets
+# handle multiple eval datasets
 override = eval_dataset is not None
 eval_dataset = eval_dataset if override else self.eval_dataset
 if isinstance(eval_dataset, dict):
@@ -88,7 +88,7 @@ class TrainerState:
 impact the way data will be logged in TensorBoard.
 stateful_callbacks (`List[StatefulTrainerCallback]`, *optional*):
 Callbacks attached to the `Trainer` that should have their states be saved or restored.
-Relevent callbacks should implement a `state` and `from_state` function.
+Relevant callbacks should implement a `state` and `from_state` function.
 """

 epoch: Optional[float] = None
@@ -1231,8 +1231,8 @@ class AcceleratorConfig:
 all workers.
 use_seedable_sampler (`bool`, *optional*, defaults to `True`):
 Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
-training results are fully reproducable using a different sampling technique. While seed-to-seed results
-may differ, on average the differences are neglible when using multiple different seeds to compare. Should
+training results are fully reproducible using a different sampling technique. While seed-to-seed results
+may differ, on average the differences are negligible when using multiple different seeds to compare. Should
 also be ran with [`~utils.set_seed`] for the best results.
 gradient_accumulation_kwargs (`dict`, *optional*):
 Additional kwargs to configure gradient accumulation, see [`accelerate.utils.GradientAccumulationPlugin`].
@@ -1284,8 +1284,8 @@ class AcceleratorConfig:
 default=True,
 metadata={
 "help": "Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`])."
-"Ensures training results are fully reproducable using a different sampling technique. "
-"While seed-to-seed results may differ, on average the differences are neglible when using"
+"Ensures training results are fully reproducible using a different sampling technique. "
+"While seed-to-seed results may differ, on average the differences are negligible when using"
 "multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results."
 },
 )
@@ -542,7 +542,7 @@ class TrainingArguments:
 all-gathers.
 - use_orig_params (`bool`, *optional*, defaults to `True`)
 If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
-frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please
+frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please
 refer this
 [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019
 - sync_module_states (`bool`, *optional*, defaults to `True`)
@@ -604,8 +604,8 @@ class TrainingArguments:
 all workers.
 - use_seedable_sampler (`bool`, *optional*, defaults to `True`):
 Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures
-training results are fully reproducable using a different sampling technique. While seed-to-seed results
-may differ, on average the differences are neglible when using multiple different seeds to compare. Should
+training results are fully reproducible using a different sampling technique. While seed-to-seed results
+may differ, on average the differences are negligible when using multiple different seeds to compare. Should
 also be ran with [`~utils.set_seed`] for the best results.
 - use_configured_state (`bool`, *optional*, defaults to `False`):
 Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`.
@@ -1278,7 +1278,7 @@ class TrainingArguments:
 default=None,
 metadata={
 "help": (
-"Config to be used with the internal Accelerator object initializtion. The value is either a "
+"Config to be used with the internal Accelerator object initialization. The value is either a "
 "accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`."
 )
 },
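The help string above belongs to the `accelerator_config` argument; a minimal sketch of the dict form it describes (field values are illustrative):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",  # placeholder path
    accelerator_config={"split_batches": False, "use_seedable_sampler": True},
)
# Alternatively, pass the path to a JSON file holding the same keys, e.g. accelerator_config="accelerator_config.json".
```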
@@ -1528,7 +1528,7 @@ class TrainingArguments:
 neftune_noise_alpha: Optional[float] = field(
 default=None,
 metadata={
-"help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instrcution fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
+"help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instruction fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes."
 },
 )

@@ -1584,7 +1584,7 @@ class TrainingArguments:
 # Parse in args that could be `dict` sent in from the CLI as a string
 for field in _VALID_DICT_FIELDS:
 passed_value = getattr(self, field)
-# We only want to do this if the str starts with a bracket to indiciate a `dict`
+# We only want to do this if the str starts with a bracket to indicate a `dict`
 # else its likely a filename if supported
 if isinstance(passed_value, str) and passed_value.startswith("{"):
 loaded_dict = json.loads(passed_value)
@@ -1849,7 +1849,7 @@ class TrainingArguments:
 torch.backends.cudnn.allow_tf32 = True
 else:
 logger.warning(
-"The speedups for torchdynamo mostly come wih GPU Ampere or higher and which is not detected here."
+"The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here."
 )
 if self.framework == "pt" and is_torch_available() and self.tf32 is not None:
 if self.tf32: