Fix many HPU failures in the CI (#39066)

* more torch.hpu patches

* increase top_k because it results in flaky behavior when Temperature, TopP and TopK are used together, which ends up killing beams early.
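A rough sketch of the kind of generation call this note refers to (the checkpoint and values below are illustrative, not taken from this change): temperature rescales the logits while top_p and top_k truncate the candidate set, so an overly small top_k can leave too few candidates and finish beams early.

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
inputs = tokenizer("Hello", return_tensors="pt")
# illustrative beam-sampling call: beams, temperature, top_p and top_k all act on the same decoding step
out = model.generate(**inputs, do_sample=True, num_beams=2, temperature=0.7, top_p=0.9, top_k=50)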

* remove temporary fix

* fix scatter operation when input and src are the same

* trigger

* fix and reduce

* skip finding batch size as it makes the hpu go loco

* fix fsdp (yay all are passing)

* fix checking equal nan values

* style

* remove models list

* order

* rename to cuda_extensions

* Update src/transformers/trainer.py
Ilyas Moutawwakil 2025-07-03 11:17:27 +02:00 committed by GitHub
parent bff964c429
commit 18e0cae207
5 changed files with 71 additions and 54 deletions

View File

@@ -84,8 +84,6 @@ jobs:
machine_type: ${{ matrix.machine_type }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_models_gpu
secrets: inherit
run_trainer_and_fsdp_gpu:
@@ -104,11 +102,10 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit
run_pipelines_gpu:
if: ${{ inputs.job == 'run_pipelines_gpu' }}
run_pipelines_torch_gpu:
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
name: Pipelines
strategy:
fail-fast: false
@@ -161,20 +158,20 @@ jobs:
- name: Run all pipeline tests on Intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
@@ -248,8 +245,8 @@ jobs:
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
run_deepspeed_gpu:
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
run_torch_cuda_extensions_gpu:
if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
name: Intel Gaudi deepspeed tests
strategy:
fail-fast: false
@@ -305,20 +302,20 @@ jobs:
- name: Run all deepspeed tests on intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Slack Report
@@ -327,8 +324,8 @@ jobs:
setup,
run_models_gpu,
run_examples_gpu,
run_pipelines_gpu,
run_deepspeed_gpu,
run_torch_cuda_extensions_gpu,
run_pipelines_torch_gpu,
run_trainer_and_fsdp_gpu,
]
if: ${{ always() }}

View File

@@ -23,7 +23,7 @@ jobs:
name: Pipeline CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_pipelines_gpu
job: run_pipelines_torch_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
@@ -47,7 +47,7 @@ jobs:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_deepspeed_gpu
job: run_torch_cuda_extensions_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"

View File

@@ -865,50 +865,59 @@ def is_torch_hpu_available():
if not hasattr(torch, "hpu") or not torch.hpu.is_available():
return False
import habana_frameworks.torch.utils.experimental as htexp # noqa: F401
# IlyasMoutawwakil: We patch masked_fill_ for int64 tensors to avoid a bug on Gaudi1
# synNodeCreateWithId failed for node: masked_fill_fwd_i64 with synStatus 26 [Generic failure]
# This can be removed once Gaudi1 support is discontinued but for now we need it to keep using
# dl1.24xlarge Gaudi1 instances on AWS for testing.
# check if the device is Gaudi1 (vs Gaudi2, Gaudi3).
if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi:
original_masked_fill_ = torch.Tensor.masked_fill_
def patched_masked_fill_(self, mask, value):
if self.dtype == torch.int64:
logger.warning_once(
"In-place tensor.masked_fill_(mask, value) is not supported for int64 tensors on Gaudi1. "
"This operation will be performed out-of-place using tensor[mask] = value."
)
self[mask] = value
else:
original_masked_fill_(self, mask, value)
torch.Tensor.masked_fill_ = patched_masked_fill_
# We patch torch.gather for int64 tensors to avoid a bug on Gaudi
# Graph compile failed with synStatus 26 [Generic failure]
# This can be removed once bug is fixed but for now we need it.
original_gather = torch.Tensor.gather
original_gather = torch.gather
def patched_gather(input: torch.Tensor, dim: int, index: torch.LongTensor) -> torch.Tensor:
if input.dtype == torch.int64 and input.device.type == "hpu":
logger.warning_once(
"torch.gather is not supported for int64 tensors on Gaudi. "
"This operation will be performed patched_gather using indexing."
)
idx = [torch.arange(size, device=input.device, dtype=input.dtype) for size in input.shape]
idx[dim] = index
idx = tuple(idx)
output = input[idx]
return output
return original_gather(input.to(torch.int32), dim, index).to(torch.int64)
else:
return original_gather(input, dim, index)
torch.gather = patched_gather
torch.Tensor.gather = patched_gather
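# Not part of the diff: a quick sketch of what the patched gather does. For int64 inputs on
# HPU it round-trips through int32 (safe as long as values fit in int32, which token ids and
# indices do); on other devices it falls through to the original op. The same int32 round
# trip is applied to take_along_dim below. Shapes and values here are illustrative only.
ids = torch.randint(0, 100, (2, 8), dtype=torch.int64)
index = torch.tensor([[0, 3], [1, 2]])
gathered = torch.gather(ids, 1, index)  # call sites are unchanged after the monkey-patch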
original_take_along_dim = torch.take_along_dim
def patched_take_along_dim(
input: torch.Tensor, indices: torch.LongTensor, dim: Optional[int] = None
) -> torch.Tensor:
if input.dtype == torch.int64 and input.device.type == "hpu":
return original_take_along_dim(input.to(torch.int32), indices, dim).to(torch.int64)
else:
return original_take_along_dim(input, indices, dim)
torch.take_along_dim = patched_take_along_dim
original_cholesky = torch.linalg.cholesky
def safe_cholesky(A, *args, **kwargs):
output = original_cholesky(A, *args, **kwargs)
if torch.isnan(output).any():
jitter_value = 1e-9
diag_jitter = torch.eye(A.size(-1), dtype=A.dtype, device=A.device) * jitter_value
output = original_cholesky(A + diag_jitter, *args, **kwargs)
return output
torch.linalg.cholesky = safe_cholesky
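# Not part of the diff: what safe_cholesky above falls back to. If the first factorization
# comes back with NaNs, a 1e-9 diagonal jitter is added and the factorization is retried
# once. The matrix below is illustrative.
A = torch.tensor([[4.0, 2.0], [2.0, 1.0]], dtype=torch.float64)  # singular, det == 0
jitter = 1e-9 * torch.eye(2, dtype=torch.float64)
L = torch.linalg.cholesky(A + jitter)  # the regularized retry that safe_cholesky performs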
original_scatter = torch.scatter
def patched_scatter(
input: torch.Tensor, dim: int, index: torch.Tensor, src: torch.Tensor, *args, **kwargs
) -> torch.Tensor:
if input.device.type == "hpu" and input is src:
return original_scatter(input, dim, index, src.clone(), *args, **kwargs)
else:
return original_scatter(input, dim, index, src, *args, **kwargs)
torch.scatter = patched_scatter
torch.Tensor.scatter = patched_scatter
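# Not part of the diff: the aliasing case the scatter patch above guards against. When the
# same tensor is passed as both input and src on HPU, presumably the kernel can read
# elements it has already overwritten, so src is cloned first. Values below are illustrative.
x = torch.arange(6.0).reshape(2, 3)
index = torch.tensor([[2, 1, 0], [0, 1, 2]])
out = torch.scatter(x, 1, index, x)  # with the patch, x is cloned before scattering on HPU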
# IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
# https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
# This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)

View File

@@ -662,6 +662,11 @@ class TrainerIntegrationCommon:
metrics = trainer.evaluate()
self.assertEqual(metrics[metric], best_value)
def remove_nan_logs(self, log):
for key in list(log.keys()):
if log[key] != log[key]: # Check if the value is NaN
del log[key]
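# Not part of the diff: the `log[key] != log[key]` test works because NaN is the only float
# that compares unequal to itself, so it is a dependency-free stand-in for math.isnan.
nan = float("nan")
assert nan != nan            # only NaN fails self-equality
assert not (0.123 != 0.123)  # ordinary floats compare equal to themselves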
def check_trainer_state_are_the_same(self, trainer_state, trainer_state1):
# We'll pop things so operate on copies.
state = trainer_state.copy()
@@ -675,6 +680,10 @@ class TrainerIntegrationCommon:
for key in skip_log_keys:
_ = log.pop(key, None)
_ = log1.pop(key, None)
self.remove_nan_logs(log)
self.remove_nan_logs(log1)
self.assertEqual(log, log1)
def convert_to_sharded_checkpoint(self, folder, save_safe=True, load_safe=True):
@@ -3174,6 +3183,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
self.assertAlmostEqual(b, b1, delta=1e-5)
@slow
@require_non_hpu
@require_accelerate
@require_torch_non_multi_accelerator
def test_auto_batch_size_finder(self):

View File

@@ -62,4 +62,5 @@ if __name__ == "__main__":
start = end
end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
model_splits.append(d[start:end])
print(model_splits)
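The slicing above hands the num_jobs % num_splits leftover jobs to the leading splits, one each, so the chunks stay balanced. A standalone sketch of the same idea (the helper and sample data are hypothetical, not part of this file):

def split_evenly(items, num_splits):
    # hypothetical helper: distribute len(items) over num_splits, giving the first `remainder` splits one extra item
    per_split, remainder = divmod(len(items), num_splits)
    splits, start = [], 0
    for idx in range(num_splits):
        end = start + per_split + (1 if idx < remainder else 0)
        splits.append(items[start:end])
        start = end
    return splits

print(split_evenly(list(range(8)), 3))  # -> [[0, 1, 2], [3, 4, 5], [6, 7]]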