Scheduling Jobs

note

This documentation is under construction.

As previously mentioned, when you first ssh into klone you land on one of the two login nodes (e.g., klone-login01). Login nodes are shared among all users for transferring data, navigating the file system, and requesting resource slices for heavy-duty computing. You should never use login nodes for heavy computation itself; automated mechanisms monitor usage and enforce this policy. The tool used to notify users of violations is "arbiter2", and you will receive an email for each offending process (Gardner, Migacz, and Haymore 2019).

To keep the login nodes in stable working order and ensure fair usage of them as a community resource, HYAK uses job scheduling software that gives you access to other nodes (i.e., different computers that are part of the klone cluster). The job scheduler is called SLURM, and regular users of HYAK need to learn how to use it to make effective and efficient use of HYAK as a resource for research computing.

Relevant Vocabulary

Account: In the context of using SLURM, "account" refers to the groups you belong to, not your UWnetID. hyakalloc will display accounts you can submit jobs with (i.e., under the SLURM sbatch directive --account).

Checkpoint partition: Abbreviated ckpt, the checkpoint partition represents idle resources across the cluster at any given moment. All cluster users are eligible to submit jobs to this partition, and those jobs will run subject to availability. To provide regular churn among pending checkpoint jobs, jobs running for more than 4 hours (HPC jobs) or more than 8 hours (GPU jobs) are re-queued (i.e., re-submitted to the checkpoint partition queue). Jobs continue in this manner until they exit or their requested runtime is fulfilled. For more information see Using Idle Resources.

Idle Resource: A cluster resource is "idle" when it currently has no running jobs. Requested idle resources are not guaranteed.

Interactive Session: An interactive session on the cluster allows users to access a computing node in real time for tasks that require direct interaction, exploration, or debugging. Request an interactive job with the salloc command.

Node: In HPC, a server is synonymous with a node. 1 server = 1 node so it is OK to use those two terms interchangeably. You can also think of nodes as distinct but network-connected computers.

  • HPC Node: A standard compute node with no additional components and variable amounts of memory at time of procurement.
  • GPU Node: A standard node with GPU cards added in at time of procurement. GPUs are typically used for ML workflows and in rarer cases for applications that have been specifically ported over to GPUs to speed up the runtime.

Partition: A partition is a logical subdivision of the HYAK cluster resources. Specifically, each partition represents a class of node. For example, cluster partitions include compute, ckpt, compute-bigmem, and GPU partitions. hyakalloc will display the partitions, in addition to ckpt, that you can submit jobs with (i.e., under the SLURM sbatch directive --partition).

Queue: A queue is a waiting area for jobs that have been submitted to the cluster but are not yet executing. The scheduler manages the order in which jobs are taken from the queue for execution. The SLURM queue can be monitored with the command squeue; running squeue -u UWNetID (replacing UWNetID with your UW NetID) will show your jobs that are waiting in the queue or currently executing.

Scheduler: The scheduler is a component or software system responsible for managing and optimizing the allocation of computing resources and tasks within a distributed computing environment. It orchestrates the execution of jobs, tasks, or processes across available resources such as CPUs, memory, and storage.

SLURM: The job scheduler used on HYAK. SLURM stands for Simple Linux Utility for Resource Management. See "Scheduler" on this page to learn what a scheduler is. See the SLURM documentation for detailed help using the job scheduler.
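To make these terms concrete, the account and partition names that hyakalloc reports appear directly in your job requests. Below is a minimal sketch of the relevant sbatch directives; mylab and compute are placeholder names, so substitute the values hyakalloc shows for you:

# Placeholders below; use the account and partition names reported by hyakalloc.
#SBATCH --account=mylab
#SBATCH --partition=compute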

Accounts and Partitions

The first stop on understanding job scheduling is to understand that every user is part of an account and certain partitions. Your account is usually related to a user group that you belong to; for example, you may be part of a lab group that has contributed resources to HYAK, affording you priority usage of those resources, which are organized into one or more partitions. Alternatively, you may be a student user in the Research Computing Club (account stf), meaning that you have priority access to the stf account and its partitions. Additionally, all users can use HYAK resources when they are idle by scheduling jobs on the ckpt partition (click here to learn more about ckpt jobs).

Let's start by checking which accounts and partitions you have access to with the hyakalloc command.

hyakalloc

The result will look different for each user. Yours might look something like this:

Account resources available to user: UWNetID
╭─────────┬───────────┬──────┬────────┬──────┬───────╮
│ Account │ Partition │ CPUs │ Memory │ GPUs │       │
├─────────┼───────────┼──────┼────────┼──────┼───────┤
│ account │ compute   │  120 │   509G │    0 │ TOTAL │
│         │           │    0 │     0G │    0 │ USED  │
│         │           │  120 │   509G │    0 │ FREE  │
├─────────┼───────────┼──────┼────────┼──────┼───────┤
│ account │ gpu-rtx6k │   10 │    81G │    2 │ TOTAL │
│         │           │    0 │     0G │    0 │ USED  │
│         │           │   10 │    81G │    2 │ FREE  │
╰─────────┴───────────┴──────┴────────┴──────┴───────╯
Checkpoint Resources
╭───────┬──────┬──────╮
│       │ CPUs │ GPUs │
├───────┼──────┼──────┤
│ Idle: │ 1138 │  242 │
╰───────┴──────┴──────╯

This example output, for a user in a fictional account called "account", shows access to a compute partition and a gpu-rtx6k partition. The displayed partitions could include compute partitions, larger-memory partitions, and GPU partitions. The table also shows which resources under the account were in use when the hyakalloc command was executed (here, all resources are free). The bottom table shows how many CPUs and GPUs are idle under the ckpt partition.

Remember you can always return to the hyakalloc results when preparing your command to request a job.
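For instance, using the fictional account above, an interactive request that fits within the free resources on the compute partition could look like the following; the CPU, memory, and time values are illustrative:

salloc --account=account --partition=compute --cpus-per-task=4 --mem=20G --time=1:00:00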

Set Up

For the following exercises, the working directory will be in scrubbed:

cd /gscratch/scrubbed/

If you have not already, make a directory with your UW NetID with mkdir and go into it:

mkdir UWNetID
cd UWNetID

This will be your working directory for this section. Note that files and directories will be deleted after 14 days if they are not used.

If you are a student in a lab group who contributed resources to HYAK, you have an alternative storage option you may use under:

cd /mmfs1/gscratch/labname

If you are a student who is part of the Research Computing Club, you may use:

cd /mmfs1/gscratch/stf/

For more information regarding HYAK storage, click HERE.

To start, copy the necessary tutorial materials into your working directory; the . at the end of the command means "copy to the current directory". Because we are copying an entire directory, make sure to use -r to copy recursively:

cp -r /sw/hyak101/basics .

Ensure all materials were copied into your working directory:

cd basics
ls
data
locator.sif
locator_NN_job.slurm
locator_NN_array.slurm
locator_NN_dropouts.slurm
loop_array.slurm
loop_job.slurm
loop_script.sh
Pro Tip

In the following section, it is often useful to have two terminal windows open: one for editing scripts and one for submitting and monitoring jobs. Open up a new terminal and use ssh to log in to HYAK. In this window, monitor jobs using the command:

watch -n 10 squeue -u UWNetID

watch -n 10 will re-run the command that follows it (squeue -u UWNetID) every 10 seconds.

The state of the job is listed under "ST" in this window. Some of the most common job states are:

PD: Pending job, R: Running job, S: Suspended job, CG: Completing job, and CD: Completed job
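For example, while a job is running, the monitoring window might show something like this (values are illustrative):

JOBID     PARTITION  NAME      USER     ST  TIME  NODES  NODELIST(REASON)
19914578  ckpt       loop_job  UWNetID  R   0:42  1      n3424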

Interactive Jobs

An interactive session on the cluster allows users to access a computing node in real time for tasks that require direct interaction, exploration, or debugging. Request an interactive job with the salloc command. If you have a quick job or you are preparing software to use later, an interactive session is the best choice. Let's start an interactive job on the ckpt partition. We will specify that we want a single CPU with the flag --cpus-per-task=1, 10G of RAM with --mem=10G, and a maximum time of 2 hours with --time=2:00:00. The job will automatically end once the 2 hours have elapsed if we don't end it sooner.

salloc --partition=ckpt --cpus-per-task=1 --mem=10G --time=2:00:00

The output will look something like this:

salloc: Pending job allocation 18981043
salloc: job 18981043 queued and waiting for resources
salloc: job 18981043 has been allocated resources
salloc: Granted job allocation 18981043
salloc: Nodes n3424 are ready for job

Finally, your shell prompt will show that you are no longer on the login node; it will look something like this:

[UWNetID@n3424 ~]$

Here, UWNetID will be replaced with your UW NetID, and n3424 will be replaced with the node SLURM assigned to your interactive job. Finally, the ~ will be replaced with the name of your current directory (your location on the filesystem).
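Once you are on the compute node, you can optionally confirm what was allocated. These are standard Linux and SLURM commands, not specific to this tutorial:

hostname                  # should match the node name in your prompt (e.g., n3424)
echo $SLURM_JOB_ID        # the job allocation number printed by salloc
echo $SLURM_CPUS_ON_NODE  # number of CPUs allocated on this node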

Pro tip: Requesting a GPU job

You can also request an interactive session on a GPU. To view information about the current status of nodes in a partition, use the command sinfo. The beginning of the output may look something like this:

PARTITION       AVAIL  TIMELIMIT  NODES  STATE  NODELIST
compute-bigmem  up     infinite   1      idle   n3255

Some common node states you may see are idle, alloc, and mixed. An idle state indicates that it is ready to accept new jobs. An alloc state indicates that the node is already fully allocated to one or more jobs and cannot accept any more jobs until the allocated job(s) finish. A node is in a mixed state when it has allocated and idle resources.

Use the following command to get a list of the GPUs used in the checkpoint partition along with CPU states and the amount of free memory on a given node:

sinfo -p ckpt -O nodehost,cpusstate,freemem,gres,gresused -S nodehost | grep -v null

The start of the output will look something like this:

HOSTNAMES  CPUS(A/I/O/T)  FREE_MEM  GRES_USED
g3001      26/14/0/40     365755    gpu:2080ti:8(IDX:0-7)

Now, find an available GPU and request an interactive session on it with the salloc command. Replace the account name, gpus-per-node, memory, and time with the desired parameters:

salloc -A accountname -p ckpt --gpus-per-node=a40:1 --mem=10G --time=1:00:00

Note that after --gpus-per-node, you must input the GPU model and the number of GPUs you want to allocate. After requesting a GPU job, you can check to see if the GPU is active using the nvidia-smi command:

nvidia-smi

Note that you may need to get started with Apptainer to use this command. For more information on getting started with Apptainer, click HERE.

The output will be two tables. The first table shows information such as the temperature (degrees Celsius), performance state (ranging from P0-P12, where P0 is the maximum performance state) and how much memory is used for all available GPUs. The second table provides information on all the processes using GPUs.

To continuously update the output every 5 seconds, use the flag --loop=5:

nvidia-smi --loop=5

Simple Script as a Command Stand-in

A CPU job and a GPU job will work equivalently in this section.

After requesting an interactive job, let's try to run a simple script on the compute node. If you have been following along, you should have loop_script.sh in the basics directory.

ls
data locator_NN_dropouts.slurm locator.sif loop_job.slurm
locator_NN_array.slurm locator_NN_job.slurm loop_array.slurm loop_script.sh
Syntax Highlighting

Note that locator.sif and loop_script.sh are green. This means that the scripts are executable. To change file or directory permissions, use the command chmod:

chmod +x loop_script.sh
# just change loop_script.sh to any file you want to make executable
# the +x allows executable permissions

Other colors include white for .txt files and blue for directories.
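If you prefer not to rely on colors, you can check permissions explicitly with ls -l:

ls -l loop_script.sh
# an "x" in the permission string (e.g., -rwxr-xr-x) means the file is executable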

Use the cat or nano command to view this script.

nano loop_script.sh

loop_script.sh takes a starting point and an ending point and counts until the variable i reaches the ending point. To execute it, use ./ with the desired starting and ending values:

./loop_script.sh 0 1000000

The output should look like this:

Sequence complete! Iterations from 0 to 1000000.
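For reference, a bash script of roughly the following shape would produce that message; this is only a sketch, and the actual contents of loop_script.sh may differ:

#!/bin/bash
# Sketch: count from a starting value up to an ending value, then report.
start=$1            # first argument: starting point
end=$2              # second argument: ending point
i=$start
while [ "$i" -le "$end" ]; do
    i=$((i + 1))    # increment the counter
done
echo "Sequence complete! Iterations from $start to $end."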

To see how long a job took, use the time command:

time ./loop_script.sh 0 1000000

The output should look something like this:

Sequence complete! Iterations from 0 to 1000000.
real 0m4.216s
user 0m4.071s
sys 0m0.068s
Understanding the time output

The real time is the wall-clock time it takes for a job to finish. In this case, the job completed in 4.216 seconds. The user time refers to the amount of time the CPU spends in user mode within the process, and the system time is the amount of time the CPU spends in kernel (or supervisor) mode.

Code running in user mode refers to all code running outside the kernel. It has limited privileges, since it cannot directly access hardware or reference memory. Code running in kernel mode has unrestricted access to the hardware and system memory.

To end an interactive job, type exit into the terminal.

Batch Jobs

For longer jobs, it can be useful to submit them as scripts rather than running them in an interactive session. This will allow the job to run in the background. In this section, we will be using the loop_job.slurm script in the basics directory to run loop_script.sh as a batch job.

nano loop_job.slurm

The first few lines of loop_job.slurm should look like this:

#!/bin/bash
#SBATCH --job-name=loop_job
#SBATCH --partition=ckpt
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem=1G
#SBATCH --time=5:00
#SBATCH -o log/%x_%j.out

All .slurm jobs you want to submit should start with #!/bin/bash, also known as a shebang. This ensures that the bash shell is used to run the script. The subsequent flags starting with #SBATCH are options for the sbatch command which is used to submit the script. Notice how the flags are reminiscent of the salloc command flags.

The #SBATCH -o log/%x_%j.out flag changes the output file name. The job allocation number will replace the %j and the job name will replace the %x. This flag will also make a directory called log and save the output file there. You can also specify your desired account using #SBATCH --account=accountname (replacing accountname with one of your accounts). Use hyakalloc to see the accounts and partitions available to you.

The command you wish to execute will be at the end of the script. In this case, we want to run loop_script.sh from 0 to 1000000 and see how long it takes:

time ./loop_script.sh 0 1000000
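Assembled, loop_job.slurm looks roughly like this; the copy in /sw/hyak101/basics may contain additional comments or directives:

#!/bin/bash
#SBATCH --job-name=loop_job
#SBATCH --partition=ckpt
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem=1G
#SBATCH --time=5:00
#SBATCH -o log/%x_%j.out

time ./loop_script.sh 0 1000000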

Exit the nano text editor with ctrl+x and submit the job using sbatch:

sbatch loop_job.slurm
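sbatch responds with the ID assigned to your job, for example:

Submitted batch job 19914578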

If you set up a separate window to monitor your jobs (see the pro tip in the setup section), details about loop_job should appear in this window. The new log directory containing the output file should also be made by now:

cd log
ls

The listed output file name will look something like this:

loop_job_19914578.out

Examine the contents of the output file to see how long the sequence took:

cat loop_job_19914578.out

All outputs and error messages will appear in this file:

Sequence complete! Iterations from 0 to 1000000.
real 0m5.617s
user 0m5.570s
sys 0m0.016s
Pro tip - multithreading

TODO