From 9e599d1d944c3e58b9f6ed6681e87cc32256af7c Mon Sep 17 00:00:00 2001
From: Boris Feld <lothiraldan@gmail.com>
Date: Fri, 5 Jul 2024 08:13:46 +0200
Subject: [PATCH] Update CometCallback to allow reusing of the running
 experiment (#31366)

* Update CometCallback to allow reusing of the running experiment

* Fixups

* Remove useless TODO

* Add checks for minimum version of the Comet SDK

* Fix documentation and links.

Also simplify how the Comet Experiment name is passed
---
 docs/source/en/main_classes/callback.md       |   2 +-
 docs/source/ja/main_classes/callback.md       |   2 +-
 docs/source/zh/main_classes/callback.md       |   2 +-
 examples/pytorch/README.md                    |   4 +-
 .../integrations/integration_utils.py         | 186 +++++++++++++-----
 src/transformers/training_args.py             |   8 +-
 src/transformers/training_args_tf.py          |   2 +-
 7 files changed, 150 insertions(+), 56 deletions(-)

diff --git a/docs/source/en/main_classes/callback.md b/docs/source/en/main_classes/callback.md
index bc7323f5911..ee91737ef05 100644
--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@@ -34,7 +34,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
   or tensorboardX).
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
-- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.ml/site/) is installed.
+- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
 - [`~integrations.NeptuneCallback`] if [neptune](https://neptune.ai/) is installed.
 - [`~integrations.AzureMLCallback`] if [azureml-sdk](https://pypi.org/project/azureml-sdk/) is
diff --git a/docs/source/ja/main_classes/callback.md b/docs/source/ja/main_classes/callback.md
index 3ea4938841e..a90044b6cd3 100644
--- a/docs/source/ja/main_classes/callback.md
+++ b/docs/source/ja/main_classes/callback.md
@@ -35,7 +35,7 @@ rendered properly in your Markdown viewer.
 - [`~integrations.TensorBoardCallback`] (PyTorch >= 1.4 を介して) tensorboard にアクセスできる場合
   またはテンソルボードX）。
 - [`~integrations.WandbCallback`] [wandb](https://www.wandb.com/) がインストールされている場合。
-- [`~integrations.CometCallback`] [comet_ml](https://www.comet.ml/site/) がインストールされている場合。
+- [`~integrations.CometCallback`] [comet_ml](https://www.comet.com/site/) がインストールされている場合。
 - [mlflow](https://www.mlflow.org/) がインストールされている場合は [`~integrations.MLflowCallback`]。
 - [`~integrations.NeptuneCallback`] [neptune](https://neptune.ai/) がインストールされている場合。
 - [`~integrations.AzureMLCallback`] [azureml-sdk](https://pypi.org/project/azureml-sdk/) の場合
diff --git a/docs/source/zh/main_classes/callback.md b/docs/source/zh/main_classes/callback.md
index be05c37aec9..3642207d75b 100644
--- a/docs/source/zh/main_classes/callback.md
+++ b/docs/source/zh/main_classes/callback.md
@@ -28,7 +28,7 @@ Callbacks是“只读”的代码片段，除了它们返回的[TrainerControl]
 - [`PrinterCallback`] 或 [`ProgressCallback`]，用于显示进度和打印日志（如果通过[`TrainingArguments`]停用tqdm，则使用第一个函数；否则使用第二个）。
 - [`~integrations.TensorBoardCallback`]，如果TensorBoard可访问（通过PyTorch版本 >= 1.4 或者 tensorboardX）。
 - [`~integrations.WandbCallback`]，如果安装了[wandb](https://www.wandb.com/)。
-- [`~integrations.CometCallback`]，如果安装了[comet_ml](https://www.comet.ml/site/)。
+- [`~integrations.CometCallback`]，如果安装了[comet_ml](https://www.comet.com/site/)。
 - [`~integrations.MLflowCallback`]，如果安装了[mlflow](https://www.mlflow.org/)。
 - [`~integrations.NeptuneCallback`]，如果安装了[neptune](https://neptune.ai/)。
 - [`~integrations.AzureMLCallback`]，如果安装了[azureml-sdk](https://pypi.org/project/azureml-sdk/)。
diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
index 178102ec092..4e318b3edb9 100644
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -200,7 +200,7 @@ You can easily log and monitor your runs code. The following are currently suppo
 
 * [TensorBoard](https://www.tensorflow.org/tensorboard)
 * [Weights & Biases](https://docs.wandb.ai/integrations/huggingface)
-* [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/)
+* [Comet ML](https://www.comet.com/docs/v2/integrations/ml-frameworks/transformers/)
 * [Neptune](https://docs.neptune.ai/integrations-and-supported-tools/model-training/hugging-face)
 * [ClearML](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps)
 * [DVCLive](https://dvc.org/doc/dvclive/ml-frameworks/huggingface)
@@ -244,7 +244,7 @@ Additional configuration options are available through generic [wandb environmen
 
 Refer to related [documentation & examples](https://docs.wandb.ai/integrations/huggingface).
 
-### Comet.ml
+### Comet
 
 To use `comet_ml`, install the Python package with:
 
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index 29528feb515..e9c91192ecf 100755
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -51,19 +51,25 @@ if is_torch_available():
     import torch
 
 # comet_ml requires to be imported before any ML frameworks
-_has_comet = importlib.util.find_spec("comet_ml") is not None and os.getenv("COMET_MODE", "").upper() != "DISABLED"
-if _has_comet:
-    try:
-        import comet_ml  # noqa: F401
+_MIN_COMET_VERSION = "3.43.2"
+try:
+    _comet_version = importlib.metadata.version("comet_ml")
+    _is_comet_installed = True
 
-        if hasattr(comet_ml, "config") and comet_ml.config.get_config("comet.api_key"):
-            _has_comet = True
-        else:
-            if os.getenv("COMET_MODE", "").upper() != "DISABLED":
-                logger.warning("comet_ml is installed but `COMET_API_KEY` is not set.")
-            _has_comet = False
-    except (ImportError, ValueError):
-        _has_comet = False
+    _is_comet_recent_enough = packaging.version.parse(_comet_version) >= packaging.version.parse(_MIN_COMET_VERSION)
+
+    # Check if the Comet API Key is set
+    import comet_ml
+
+    if comet_ml.config.get_config("comet.api_key") is not None:
+        _is_comet_configured = True
+    else:
+        _is_comet_configured = False
+except (importlib.metadata.PackageNotFoundError, ImportError, ValueError, TypeError, AttributeError, KeyError):
+    _comet_version = None
+    _is_comet_installed = False
+    _is_comet_recent_enough = False
+    _is_comet_configured = False
 
 _has_neptune = (
     importlib.util.find_spec("neptune") is not None or importlib.util.find_spec("neptune-client") is not None
@@ -103,7 +109,36 @@ def is_clearml_available():
 
 
 def is_comet_available():
-    return _has_comet
+    if os.getenv("COMET_MODE", "").upper() == "DISABLED":
+        logger.warning(
+            "Using the `COMET_MODE=DISABLED` environment variable is deprecated and will be removed in v5. Use the "
+            "--report_to flag to control the integrations used for logging result (for instance --report_to none)."
+        )
+        return False
+
+    if _is_comet_installed is False:
+        return False
+
+    if _is_comet_recent_enough is False:
+        logger.warning(
+            "comet_ml version %s is installed, but version %s or higher is required. "
+            "Please update comet_ml to the latest version to enable Comet logging with pip install 'comet-ml>=%s'.",
+            _comet_version,
+            _MIN_COMET_VERSION,
+            _MIN_COMET_VERSION,
+        )
+        return False
+
+    if _is_comet_configured is False:
+        logger.warning(
+            "comet_ml is installed but the Comet API Key is not configured. "
+            "Please set the `COMET_API_KEY` environment variable to enable Comet logging. "
+            "Check out the documentation for other ways of configuring it: "
+            "https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key"
+        )
+        return False
+
+    return True
 
 
 def is_tensorboard_available():
@@ -936,56 +971,109 @@ class WandbCallback(TrainerCallback):
 
 class CometCallback(TrainerCallback):
     """
-    A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/).
+    A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.com/site/).
     """
 
     def __init__(self):
-        if not _has_comet:
-            raise RuntimeError("CometCallback requires comet-ml to be installed. Run `pip install comet-ml`.")
+        if _is_comet_installed is False or _is_comet_recent_enough is False:
+            raise RuntimeError(
+                f"CometCallback requires comet-ml>={_MIN_COMET_VERSION} to be installed. Run `pip install comet-ml>={_MIN_COMET_VERSION}`."
+            )
         self._initialized = False
         self._log_assets = False
+        self._experiment = None
 
     def setup(self, args, state, model):
         """
-        Setup the optional Comet.ml integration.
+        Setup the optional Comet integration.
 
         Environment:
-        - **COMET_MODE** (`str`, *optional*, defaults to `ONLINE`):
-            Whether to create an online, offline experiment or disable Comet logging. Can be `OFFLINE`, `ONLINE`, or
-            `DISABLED`.
+        - **COMET_MODE** (`str`, *optional*, default to `get_or_create`):
+            Control whether to create and log to a new Comet experiment or append to an existing experiment.
+            It accepts the following values:
+                * `get_or_create`: Decides automatically depending if
+                  `COMET_EXPERIMENT_KEY` is set and whether an Experiment
+                  with that key already exists or not.
+                * `create`: Always create a new Comet Experiment.
+                * `get`: Always try to append to an Existing Comet Experiment.
+                  Requires `COMET_EXPERIMENT_KEY` to be set.
+                * `ONLINE`: **deprecated**, used to create an online
+                  Experiment. Use `COMET_START_ONLINE=1` instead.
+                * `OFFLINE`: **deprecated**, used to created an offline
+                  Experiment. Use `COMET_START_ONLINE=0` instead.
+                * `DISABLED`: **deprecated**, used to disable Comet logging.
+                  Use the `--report_to` flag to control the integrations used
+                  for logging result instead.
         - **COMET_PROJECT_NAME** (`str`, *optional*):
             Comet project name for experiments.
-        - **COMET_OFFLINE_DIRECTORY** (`str`, *optional*):
-            Folder to use for saving offline experiments when `COMET_MODE` is `OFFLINE`.
         - **COMET_LOG_ASSETS** (`str`, *optional*, defaults to `TRUE`):
             Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be `TRUE`, or
             `FALSE`.
 
         For a number of configurable items in the environment, see
-        [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
+        [here](https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options).
         """
         self._initialized = True
         log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
         if log_assets in {"TRUE", "1"}:
             self._log_assets = True
         if state.is_world_process_zero:
-            comet_mode = os.getenv("COMET_MODE", "ONLINE").upper()
-            experiment = None
-            experiment_kwargs = {"project_name": os.getenv("COMET_PROJECT_NAME", "huggingface")}
-            if comet_mode == "ONLINE":
-                experiment = comet_ml.Experiment(**experiment_kwargs)
-                experiment.log_other("Created from", "transformers")
-                logger.info("Automatic Comet.ml online logging enabled")
-            elif comet_mode == "OFFLINE":
-                experiment_kwargs["offline_directory"] = os.getenv("COMET_OFFLINE_DIRECTORY", "./")
-                experiment = comet_ml.OfflineExperiment(**experiment_kwargs)
-                experiment.log_other("Created from", "transformers")
-                logger.info("Automatic Comet.ml offline logging enabled; use `comet upload` when finished")
-            if experiment is not None:
-                experiment._set_model_graph(model, framework="transformers")
-                experiment._log_parameters(args, prefix="args/", framework="transformers")
-                if hasattr(model, "config"):
-                    experiment._log_parameters(model.config, prefix="config/", framework="transformers")
+            comet_old_mode = os.getenv("COMET_MODE")
+
+            mode = None
+            online = None
+
+            if comet_old_mode is not None:
+                comet_old_mode = comet_old_mode.lower()
+
+                if comet_old_mode == "online":
+                    online = True
+                elif comet_old_mode == "offline":
+                    online = False
+                elif comet_old_mode in ("get", "get_or_create", "create"):
+                    mode = comet_old_mode
+                elif comet_old_mode:
+                    logger.warning("Invalid COMET_MODE env value %r, Comet logging is disabled", comet_old_mode)
+                    return
+
+            # For HPO, we always create a new experiment for each trial
+            if state.is_hyper_param_search:
+                if mode is not None:
+                    logger.warning(
+                        "Hyperparameter Search is enabled, forcing the creation of new experimetns, COMET_MODE value %r  is ignored",
+                        comet_old_mode,
+                    )
+                mode = "create"
+
+            import comet_ml
+
+            # Do not use the default run_name as the experiment name
+            if args.run_name is not None and args.run_name != args.output_dir:
+                experiment_config = comet_ml.ExperimentConfig(name=args.run_name)
+            else:
+                experiment_config = comet_ml.ExperimentConfig()
+
+            self._experiment = comet_ml.start(online=online, mode=mode, experiment_config=experiment_config)
+            self._experiment.__internal_api__set_model_graph__(model, framework="transformers")
+
+            params = {"args": args.to_dict()}
+
+            if hasattr(model, "config") and model.config is not None:
+                model_config = model.config.to_dict()
+                params["config"] = model_config
+            if hasattr(model, "peft_config") and model.peft_config is not None:
+                peft_config = model.peft_config
+                params["peft_config"] = peft_config
+
+            self._experiment.__internal_api__log_parameters__(
+                params, framework="transformers", source="manual", flatten_nested=True
+            )
+
+            if state.is_hyper_param_search:
+                optimization_id = getattr(state, "trial_name", None)
+                optimization_params = getattr(state, "trial_params", None)
+
+                self._experiment.log_optimization(optimization_id=optimization_id, parameters=optimization_params)
 
     def on_train_begin(self, args, state, control, model=None, **kwargs):
         if not self._initialized:
@@ -995,20 +1083,24 @@ class CometCallback(TrainerCallback):
         if not self._initialized:
             self.setup(args, state, model)
         if state.is_world_process_zero:
-            experiment = comet_ml.config.get_global_experiment()
-            if experiment is not None:
-                experiment._log_metrics(logs, step=state.global_step, epoch=state.epoch, framework="transformers")
+            if self._experiment is not None:
+                self._experiment.__internal_api__log_metrics__(
+                    logs, step=state.global_step, epoch=state.epoch, framework="transformers"
+                )
 
     def on_train_end(self, args, state, control, **kwargs):
         if self._initialized and state.is_world_process_zero:
-            experiment = comet_ml.config.get_global_experiment()
-            if experiment is not None:
+            if self._experiment is not None:
                 if self._log_assets is True:
                     logger.info("Logging checkpoints. This may take time.")
-                    experiment.log_asset_folder(
+                    self._experiment.log_asset_folder(
                         args.output_dir, recursive=True, log_file_name=True, step=state.global_step
                     )
-                experiment.end()
+
+            # We create one experiment per trial in HPO mode
+            if state.is_hyper_param_search:
+                self._experiment.clean()
+                self._initialized = False
 
 
 class AzureMLCallback(TrainerCallback):
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 9f305f6ce2e..48179f0e0a4 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -436,8 +436,9 @@ class TrainingArguments:
             use the corresponding output (usually index 2) as the past state and feed it to the model at the next
             training step under the keyword argument `mems`.
         run_name (`str`, *optional*, defaults to `output_dir`):
-            A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) and
-            [mlflow](https://www.mlflow.org/) logging. If not specified, will be the same as `output_dir`.
+            A descriptor for the run. Typically used for [wandb](https://www.wandb.com/),
+            [mlflow](https://www.mlflow.org/) and [comet](https://www.comet.com/site) logging. If not specified, will
+            be the same as `output_dir`.
         disable_tqdm (`bool`, *optional*):
             Whether or not to disable the tqdm progress bars and table of metrics produced by
             [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is
@@ -1149,7 +1150,8 @@ class TrainingArguments:
     )
 
     run_name: Optional[str] = field(
-        default=None, metadata={"help": "An optional descriptor for the run. Notably used for wandb logging."}
+        default=None,
+        metadata={"help": "An optional descriptor for the run. Notably used for wandb, mlflow and comet logging."},
     )
     disable_tqdm: Optional[bool] = field(
         default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."}
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 12a6c5afe92..9df53c3f1d6 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -160,7 +160,7 @@ class TFTrainingArguments(TrainingArguments):
             Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
             automatically detect from metadata.
         run_name (`str`, *optional*):
-            A descriptor for the run. Notably used for wandb logging.
+            A descriptor for the run. Notably used for wandb, mlflow and comet logging.
         xla (`bool`, *optional*):
             Whether to activate the XLA compilation or not.
     """