diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index 264bce993a3..94c6f19d434 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -663,7 +663,7 @@ class BeitEncoder(nn.Module):
             self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size)
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
         self.layer = nn.ModuleList(
             [
                 BeitLayer(
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index b2fdf0dd7ee..a7a51cc86af 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -829,7 +829,7 @@ class ClapAudioEncoder(nn.Module):
 
         self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))
 
-        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
 
         grid_size = self.patch_embed.grid_size
         self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]
diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py
index 5c769926d2c..d5f65f18b60 100755
--- a/src/transformers/models/convnext/modeling_convnext.py
+++ b/src/transformers/models/convnext/modeling_convnext.py
@@ -225,7 +225,8 @@ class ConvNextEncoder(nn.Module):
         super().__init__()
         self.stages = nn.ModuleList()
         drop_path_rates = [
-            x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths)
+            x.tolist()
+            for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").split(config.depths)
         ]
         prev_chs = config.hidden_sizes[0]
         for i in range(config.num_stages):
diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py
index a9d8332ff0a..a3af36bac35 100644
--- a/src/transformers/models/convnextv2/modeling_convnextv2.py
+++ b/src/transformers/models/convnextv2/modeling_convnextv2.py
@@ -245,7 +245,8 @@ class ConvNextV2Encoder(nn.Module):
         super().__init__()
         self.stages = nn.ModuleList()
         drop_path_rates = [
-            x.tolist() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths)).split(config.depths)
+            x.tolist()
+            for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").split(config.depths)
         ]
         prev_chs = config.hidden_sizes[0]
         for i in range(config.num_stages):
diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py
index c2ca9eab5a1..0088b8b440d 100644
--- a/src/transformers/models/cvt/modeling_cvt.py
+++ b/src/transformers/models/cvt/modeling_cvt.py
@@ -449,7 +449,9 @@ class CvtStage(nn.Module):
             dropout_rate=config.drop_rate[self.stage],
         )
 
-        drop_path_rates = [x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage])]
+        drop_path_rates = [
+            x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage], device="cpu")
+        ]
 
         self.layers = nn.Sequential(
             *[
diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py
index 12a407c51ad..7ea0ffd39d4 100644
--- a/src/transformers/models/data2vec/modeling_data2vec_vision.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py
@@ -676,7 +676,7 @@ class Data2VecVisionEncoder(nn.Module):
             self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
         self.layer = nn.ModuleList(
             [
                 Data2VecVisionLayer(
diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py
index fd223704cd0..43d8f3f1079 100644
--- a/src/transformers/models/donut/modeling_donut_swin.py
+++ b/src/transformers/models/donut/modeling_donut_swin.py
@@ -790,7 +790,7 @@ class DonutSwinEncoder(nn.Module):
         super().__init__()
         self.num_layers = len(config.depths)
         self.config = config
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
         self.layers = nn.ModuleList(
             [
                 DonutSwinStage(
diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py
index 143d4e066be..41336c1b53a 100644
--- a/src/transformers/models/focalnet/modeling_focalnet.py
+++ b/src/transformers/models/focalnet/modeling_focalnet.py
@@ -486,7 +486,7 @@ class FocalNetStage(nn.Module):
         downsample = FocalNetPatchEmbeddings if (index < self.num_stages - 1) else None
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
         drop_path = dpr[sum(config.depths[:index]) : sum(config.depths[: index + 1])]
 
         self.layers = nn.ModuleList(
diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py
index 8842b10cc15..0c90746d187 100755
--- a/src/transformers/models/glpn/modeling_glpn.py
+++ b/src/transformers/models/glpn/modeling_glpn.py
@@ -331,7 +331,7 @@ class GLPNEncoder(nn.Module):
         self.config = config
 
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
 
         # patch embeddings
         embeddings = []
diff --git a/src/transformers/models/hiera/modeling_hiera.py b/src/transformers/models/hiera/modeling_hiera.py
index 14a8dad524f..fa01dd909dc 100644
--- a/src/transformers/models/hiera/modeling_hiera.py
+++ b/src/transformers/models/hiera/modeling_hiera.py
@@ -639,9 +639,9 @@ class HieraEncoder(nn.Module):
         super().__init__()
         total_depth = sum(config.depths)
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, total_depth)]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, total_depth, device="cpu")]
         # query strides rule
-        cumulative_depths = torch.tensor(config.depths).cumsum(0).tolist()
+        cumulative_depths = torch.tensor(config.depths, device="cpu").cumsum(0).tolist()
         query_pool_layer = cumulative_depths[: config.num_query_pool]
         query_strides = [math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(total_depth)]
 
diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py
index dd3fc11ca16..fd25e84ffe7 100644
--- a/src/transformers/models/maskformer/modeling_maskformer_swin.py
+++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py
@@ -692,7 +692,7 @@ class MaskFormerSwinEncoder(nn.Module):
         super().__init__()
         self.num_layers = len(config.depths)
         self.config = config
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
         self.layers = nn.ModuleList(
             [
                 MaskFormerSwinStage(
diff --git a/src/transformers/models/mgp_str/modeling_mgp_str.py b/src/transformers/models/mgp_str/modeling_mgp_str.py
index 07fd2fb4cb4..684cfd2e786 100644
--- a/src/transformers/models/mgp_str/modeling_mgp_str.py
+++ b/src/transformers/models/mgp_str/modeling_mgp_str.py
@@ -246,7 +246,7 @@ class MgpstrEncoder(nn.Module):
     def __init__(self, config: MgpstrConfig):
         super().__init__()
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
 
         self.blocks = nn.Sequential(
             *[MgpstrLayer(config=config, drop_path=dpr[i]) for i in range(config.num_hidden_layers)]
diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py
index d13304405ab..5b2ed8868b1 100755
--- a/src/transformers/models/poolformer/modeling_poolformer.py
+++ b/src/transformers/models/poolformer/modeling_poolformer.py
@@ -194,7 +194,7 @@ class PoolFormerEncoder(nn.Module):
         super().__init__()
         self.config = config
         # stochastic depth decay rule
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
 
         # patch embeddings
         embeddings = []
diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py
index c061a8fdf0f..f9eb1a3c586 100755
--- a/src/transformers/models/pvt/modeling_pvt.py
+++ b/src/transformers/models/pvt/modeling_pvt.py
@@ -369,7 +369,7 @@ class PvtEncoder(nn.Module):
         self.config = config
 
         # stochastic depth decay rule
-        drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
+        drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").tolist()
 
         # patch embeddings
         embeddings = []
diff --git a/src/transformers/models/pvt_v2/modeling_pvt_v2.py b/src/transformers/models/pvt_v2/modeling_pvt_v2.py
index 19e783a4fa2..517d2edbe32 100644
--- a/src/transformers/models/pvt_v2/modeling_pvt_v2.py
+++ b/src/transformers/models/pvt_v2/modeling_pvt_v2.py
@@ -323,7 +323,7 @@ class PvtV2EncoderLayer(nn.Module):
         )
         # Transformer block
         # stochastic depth decay rule
-        drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()
+        drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").tolist()
         block_layers = []
         for block_idx in range(config.depths[layer_idx]):
             block_layers.append(
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 84a90e5e623..4f9da2cab7b 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -356,7 +356,9 @@ class SegformerEncoder(nn.Module):
         self.config = config
 
         # stochastic depth decay rule
-        drop_path_decays = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        drop_path_decays = [
+            x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")
+        ]
 
         # patch embeddings
         embeddings = []
diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py
index d1c3d2eb6dc..e4058b33467 100644
--- a/src/transformers/models/seggpt/modeling_seggpt.py
+++ b/src/transformers/models/seggpt/modeling_seggpt.py
@@ -460,7 +460,7 @@ class SegGptEncoder(nn.Module):
     def __init__(self, config: SegGptConfig) -> None:
         super().__init__()
         self.config = config
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
         self.layers = nn.ModuleList([SegGptLayer(config, dpr[i]) for i in range(config.num_hidden_layers)])
         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.gradient_checkpointing = False
diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py
index e155874d8f0..ad998f6ff66 100644
--- a/src/transformers/models/swin/modeling_swin.py
+++ b/src/transformers/models/swin/modeling_swin.py
@@ -823,7 +823,7 @@ class SwinEncoder(nn.Module):
         super().__init__()
         self.num_layers = len(config.depths)
         self.config = config
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
         self.layers = nn.ModuleList(
             [
                 SwinStage(
diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py
index a3ae0ed5881..6543fe5cdff 100644
--- a/src/transformers/models/swin2sr/modeling_swin2sr.py
+++ b/src/transformers/models/swin2sr/modeling_swin2sr.py
@@ -682,7 +682,7 @@ class Swin2SREncoder(nn.Module):
         super().__init__()
         self.num_stages = len(config.depths)
         self.config = config
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
         self.stages = nn.ModuleList(
             [
                 Swin2SRStage(
diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py
index 1e07af70159..52e33cf3d99 100644
--- a/src/transformers/models/swinv2/modeling_swinv2.py
+++ b/src/transformers/models/swinv2/modeling_swinv2.py
@@ -877,7 +877,7 @@ class Swinv2Encoder(nn.Module):
         self.config = config
         if self.config.pretrained_window_sizes is not None:
             pretrained_window_sizes = config.pretrained_window_sizes
-        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
 
         layers = []
         for i_layer in range(self.num_layers):
diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py
index b348791869a..0bd79a6cec0 100644
--- a/src/transformers/models/timesformer/modeling_timesformer.py
+++ b/src/transformers/models/timesformer/modeling_timesformer.py
@@ -295,7 +295,7 @@ class TimesformerLayer(nn.Module):
         attention_type = config.attention_type
 
         drop_path_rates = [
-            x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
+            x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")
         ]  # stochastic depth decay rule
         drop_path_rate = drop_path_rates[layer_index]
 
diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py
index 9585c295e18..3d740522884 100644
--- a/src/transformers/models/vitdet/modeling_vitdet.py
+++ b/src/transformers/models/vitdet/modeling_vitdet.py
@@ -535,7 +535,7 @@ class VitDetEncoder(nn.Module):
         depth = config.num_hidden_layers
 
         # stochastic depth decay rule
-        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth)]
+        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth, device="cpu")]
 
         layers = []
         for i in range(depth):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 3ec6ba30bf5..e546d023f5e 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4528,6 +4528,13 @@ class ModelTesterMixin:
             ),
         )
 
+    def test_can_be_initialized_on_meta(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            # If it does not raise here, the test passes
+            with torch.device("meta"):
+                _ = model_class(config)
+
     @require_torch_accelerator
     def test_can_load_with_device_context_manager(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
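
The rationale behind pinning these `torch.linspace` / `torch.tensor` factory calls to CPU is exercised by the new `test_can_be_initialized_on_meta` test above: when a module is constructed under `with torch.device("meta"):`, factory functions default to the meta device, and the subsequent `.item()` / `.tolist()` calls fail because meta tensors carry no data. A minimal sketch of the failure mode and the fix is below; the rate, depth, and slice values are illustrative placeholders, not taken from any model config.

```python
import torch

with torch.device("meta"):
    try:
        # Without an explicit device, linspace follows the ambient meta device,
        # and .item() cannot read values out of a meta tensor.
        dpr = [x.item() for x in torch.linspace(0, 0.1, 12)]
    except RuntimeError as err:
        print(f"meta-device init fails: {err}")

    # Pinning the factory call to CPU, as the diff does, materializes the values,
    # so .item() still works while the rest of the module is created on "meta".
    dpr = [x.item() for x in torch.linspace(0, 0.1, 12, device="cpu")]
    print(dpr[:3])
```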