From 29a3f5ed8c5588151419012408f394b4644d4aa6 Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Mon, 30 Jun 2025 23:54:05 +0800
Subject: [PATCH] switch default xpu tp backend to pytorch built-in XCCL from
 pytorch 2.8 (#39024)

* switch default xpu tp backend to pytorch built-in XCCL from pytorch 2.8

Signed-off-by: YAO Matrix

* Update docs/source/en/perf_infer_gpu_multi.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update perf_infer_gpu_multi.md

* Update perf_infer_gpu_multi.md

* Update perf_infer_gpu_multi.md

---------

Signed-off-by: YAO Matrix
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/perf_infer_gpu_multi.md           | 6 +++---
 src/transformers/integrations/tensor_parallel.py | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md
index f269960d3fc..d8761befaac 100644
--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@@ -15,9 +15,9 @@ rendered properly in your Markdown viewer.
 
 # Distributed inference
 
-When a model doesn't fit on a single GPU, distributed inference with [tensor parallelism](./perf_train_gpu_many#tensor-parallelism) can help. Tensor parallelism shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice.
+When a model doesn't fit on a single GPU, distributed inference with [tensor parallelism](./perf_train_gpu_many#tensor-parallelism) can help. Tensor parallelism shards a model onto multiple accelerators (CUDA GPU, Intel XPU, etc.) and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each accelerator can process a tensor slice.
 
-However, tensor parallelism adds communication overhead and should be used on single machine setups with multiple GPUs to take advantage of fast intra-node communication. For multi-node training, it may be more efficient to use pipeline or data parallelism depending on your use case.
+However, tensor parallelism adds communication overhead and should be used on single machine setups with multiple accelerators to take advantage of fast intra-node communication. For multi-node training, it may be more efficient to use pipeline or data parallelism depending on your use case.
 
 > [!TIP]
 > Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism to learn more.
@@ -308,4 +308,4 @@ The most important part of DTensor is the `placement` attribute because it tells
     bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
     ```
 
-- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers).
\ No newline at end of file
+- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers).
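The doc hunk above generalizes the tensor-parallelism page from "GPUs" to "accelerators". For context, this is the kind of usage that page describes: a minimal sketch assuming a transformers release with `tp_plan="auto"` support and a multi-accelerator host; the checkpoint name is only an example.

```python
# Minimal tensor-parallel inference sketch (illustrative, not part of this patch).
# Launch one process per accelerator, e.g.: torchrun --nproc-per-node 4 run_tp.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # example checkpoint, swap in your own

# tp_plan="auto" shards the weights across the visible accelerators (CUDA GPUs, Intel XPUs, ...)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Tensor parallelism shards a model across", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Each rank holds a shard of the weights, and the collective backend chosen by the code change below (NCCL, XCCL, etc.) handles the cross-device communication.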
diff --git a/src/transformers/integrations/tensor_parallel.py b/src/transformers/integrations/tensor_parallel.py
index ecb5c89207e..0ffce54977f 100644
--- a/src/transformers/integrations/tensor_parallel.py
+++ b/src/transformers/integrations/tensor_parallel.py
@@ -57,10 +57,12 @@ def initialize_tensor_parallelism(tp_plan, tp_size=None):
             local_rank = int(os.environ["LOCAL_RANK"])
             world_size = int(os.environ["WORLD_SIZE"])
 
-            backend_map = {"cuda": "nccl", "cpu": "gloo", "xpu": "ccl", "hpu": "hccl"}
+            backend_map = {"cuda": "nccl", "cpu": "gloo", "xpu": "xccl", "hpu": "hccl"}
             backend = backend_map.get(device_type)
             if device_type == "cpu" and int(os.environ.get("CCL_WORKER_COUNT", 0)):
                 backend = "ccl"
+            if device_type == "xpu" and not is_torch_greater_or_equal("2.8", accept_dev=True):
+                backend = "ccl"
 
             torch.distributed.init_process_group(backend=backend, rank=rank, world_size=world_size)
             current_device = getattr(torch, device_type)
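Pulled out of the diff for clarity, the backend selection after this change behaves roughly as sketched below. `pick_distributed_backend` is a hypothetical standalone helper for illustration, not a transformers API, and it swaps in `packaging.version` for the library's `is_torch_greater_or_equal("2.8", accept_dev=True)` check so it stays self-contained.

```python
# Standalone sketch of the backend selection this patch implements (illustrative only):
# XPU uses PyTorch's built-in XCCL on torch >= 2.8 and falls back to oneCCL ("ccl") on older releases.
import os

from packaging import version


def pick_distributed_backend(device_type: str, torch_version: str) -> str:
    backend_map = {"cuda": "nccl", "cpu": "gloo", "xpu": "xccl", "hpu": "hccl"}
    backend = backend_map.get(device_type)
    # CPU runs can still opt into oneCCL through the CCL_WORKER_COUNT environment variable.
    if device_type == "cpu" and int(os.environ.get("CCL_WORKER_COUNT", 0)):
        backend = "ccl"
    # The built-in XCCL backend ships with PyTorch 2.8+, so older torch keeps the previous "ccl" default.
    if device_type == "xpu" and version.parse(torch_version) < version.parse("2.8"):
        backend = "ccl"
    return backend


assert pick_distributed_backend("xpu", "2.8.0") == "xccl"
assert pick_distributed_backend("xpu", "2.7.1") == "ccl"
assert pick_distributed_backend("cuda", "2.7.1") == "nccl"
```

Unlike the `packaging` comparison above, the real check accepts dev builds (for example `2.8.0.dev` nightlies), which also select XCCL.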