Hot-fix-mixstral-loss (#27948)

* fix loss computation
* compute on GPU if possible
2025-08-03 03:31:05 +06:00 · 2023-12-12 12:20:28 +01:00 · 2023-12-12 12:20:28 +01:00 · 680c610f97
commit 680c610f97
parent 4b759da8be
1 changed files with 2 additions and 1 deletions
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -95,7 +95,8 @@ def load_balancing_loss_func(gate_logits: torch.Tensor, num_experts: torch.Tenso

    if isinstance(gate_logits, tuple):
        # cat along the layers?
-        gate_logits = torch.cat(gate_logits, dim=0)
+        compute_device = gate_logits[0].device
+        gate_logits = torch.cat([gate.to(compute_device) for gate in gate_logits], dim=0)

    routing_weights, selected_experts = torch.topk(gate_logits, top_k, dim=-1)
    routing_weights = routing_weights.softmax(dim=-1)