From d47a9e8ce556be790ac98c0a9024dd41c6328fb0 Mon Sep 17 00:00:00 2001
From: Emin Orhan <17934496+eminorhan@users.noreply.github.com>
Date: Tue, 27 Aug 2024 09:50:00 -0400
Subject: [PATCH] fix redundant checkpointing in example training scripts
 (#33131)

* fix redundant checkpointing in example scripts

* Update examples/pytorch/image-classification/run_image_classification_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/translation/run_translation_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/token-classification/run_ner_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/text-classification/run_glue_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/summarization/run_summarization_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/language-modeling/run_mlm_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/language-modeling/run_fim_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/language-modeling/run_clm_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/image-pretraining/run_mim_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/multiple-choice/run_swag_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/question-answering/run_qa_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/object-detection/run_object_detection_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
---
 .../image-classification/run_image_classification_no_trainer.py | 2 +-
 examples/pytorch/image-pretraining/run_mim_no_trainer.py        | 2 +-
 .../run_instance_segmentation_no_trainer.py                     | 2 +-
 examples/pytorch/language-modeling/run_clm_no_trainer.py        | 2 +-
 examples/pytorch/language-modeling/run_fim_no_trainer.py        | 2 +-
 examples/pytorch/language-modeling/run_mlm_no_trainer.py        | 2 +-
 examples/pytorch/multiple-choice/run_swag_no_trainer.py         | 2 +-
 .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +-
 .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +-
 examples/pytorch/question-answering/run_qa_no_trainer.py        | 2 +-
 .../run_semantic_segmentation_no_trainer.py                     | 2 +-
 examples/pytorch/summarization/run_summarization_no_trainer.py  | 2 +-
 examples/pytorch/text-classification/run_glue_no_trainer.py     | 2 +-
 examples/pytorch/token-classification/run_ner_no_trainer.py     | 2 +-
 examples/pytorch/translation/run_translation_no_trainer.py      | 2 +-
 15 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 8f5c0870944..0c8068d4d45 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -544,7 +544,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
index f208498e8bf..e533ddfa8b0 100644
--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@@ -723,7 +723,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
index 73609284da9..1605f607acb 100644
--- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
@@ -639,7 +639,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 4bba7de4bb6..43ecba5f4d8 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -638,7 +638,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py
index f70f60b31c6..2c954a1b653 100644
--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@@ -838,7 +838,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 7d50383cfe7..c98687efadf 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -675,7 +675,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index a7eb30d47b4..8f7693ae5b0 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -619,7 +619,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
index 7913a3b4c5b..6de61be6309 100644
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@@ -677,7 +677,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index f86d5f40045..ee791c0c8dd 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -879,7 +879,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     accelerator.save_state(f"step_{completed_steps}")
 
             if completed_steps >= args.max_train_steps:
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 2bca56cad08..7ae0d488bc4 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -894,7 +894,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index 11e9d56c1ac..35c3744ab5f 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -516,7 +516,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 8af76fe8079..36cd590ea5c 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -688,7 +688,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 8dfe23dfb33..ac62edbe5e9 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -564,7 +564,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 2fa100f635b..2afb38bb44b 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -722,7 +722,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 6d8ac35268a..97da3f9541d 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -664,7 +664,7 @@ def main():
                 completed_steps += 1
 
             if isinstance(checkpointing_steps, int):
-                if completed_steps % checkpointing_steps == 0:
+                if completed_steps % checkpointing_steps == 0 and accelerator.sync_gradients:
                     output_dir = f"step_{completed_steps}"
                     if args.output_dir is not None:
                         output_dir = os.path.join(args.output_dir, output_dir)