name: SSH into our runners

on:
  workflow_dispatch:
    inputs:
      runner_type:
        description: 'Type of runner to test (a10 or t4)'
        required: true
      docker_image:
        description: 'Name of the Docker image'
        required: true
      num_gpus:
        description: 'Number of GPUs to use (`single` or `multi`)'
        required: true

env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  # For gated repositories, we still need to agree to share information on the Hub repo page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  CUDA_VISIBLE_DEVICES: 0,1

jobs:
  get_runner:
    name: "Get runner to use"
    runs-on: ubuntu-22.04
    outputs:
      RUNNER: ${{ steps.set_runner.outputs.RUNNER }}
    steps:
      - name: Get runner to use
        shell: bash
        run: |
          # Map (num_gpus, runner_type) to the corresponding AWS runner group.
          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
            echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
            echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
          else
            echo "RUNNER=" >> $GITHUB_ENV
          fi

      - name: Set runner to use
        id: set_runner
        run: |
          echo ${{ env.RUNNER }}
          echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT

  ssh_runner:
    name: "SSH"
    needs: get_runner
    runs-on:
      group: ${{ needs.get_runner.outputs.RUNNER }}
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      # Because SSH can be enabled dynamically when the workflow fails, we store the Slack info here
      # so it can be retrieved during the waitForSSH step.
      - name: Store Slack infos
        shell: bash
        run: |
          echo "${{ github.actor }}"
          github_actor=${{ github.actor }}
          # Secret names cannot contain hyphens, so replace all of them with underscores.
          github_actor=${github_actor//-/_}
          echo "$github_actor"
          echo "github_actor=$github_actor" >> $GITHUB_ENV

      # Resolve the Slack channel for this actor, falling back to the CI feedback channel.
      - name: Store Slack infos
        shell: bash
        run: |
          echo "${{ env.github_actor }}"
          if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
            echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
          else
            echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
          fi

      - name: Tailscale # In order to be able to SSH when a test fails
        uses: huggingface/tailscale-action@main
        with:
          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
          slackChannel: ${{ env.SLACKCHANNEL }}
          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
          waitForSSH: true
          sshTimeout: 15m
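
# A minimal sketch of how this workflow_dispatch workflow could be triggered manually, assuming the
# file is saved as `.github/workflows/ssh-runner.yml` and the GitHub CLI (`gh`) is available; the
# Docker image name below is only an illustrative placeholder, not a value mandated by this workflow:
#
#   gh workflow run ssh-runner.yml \
#     -f runner_type=a10 \
#     -f docker_image=huggingface/transformers-all-latest-gpu \
#     -f num_gpus=single
#
# With these inputs, the `get_runner` job would resolve RUNNER to `aws-g5-4xlarge-cache` and the
# `ssh_runner` job would start the chosen container and wait for an SSH connection via Tailscale.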