diff --git a/shell_scripts/cluster_install_notes.txt b/shell_scripts/cluster_install_notes.txt
new file mode 100644
index 0000000..e69cf39
--- /dev/null
+++ b/shell_scripts/cluster_install_notes.txt
@@ -0,0 +1,24 @@
+
+ - copy all source files of mdt-public to the cluster destination, e.g., by using update_scripts_on_cluster.sh.
+
+ - log in to a COMPUTE NODE, e.g., e132-comp01, not one of the worker/submission nodes, since we need CUDA installed.
+
+ - run:
+
+      module load python/3.7.0
+      module load gcc/7.2.0
+
+      virtualenv -p python3.7 ~/.virtualenvs/mdt
+      source ~/.virtualenvs/mdt/bin/activate   #--> env path chosen to match what job_starter.sh sources
+
+      export CUDA_HOME=/usr/local/cuda-${CUDA}
+      export TORCH_CUDA_ARCH_LIST="6.1;7.0;7.5"
+
+      cd mdt-public
+      python setup.py install #--> check that the custom extensions are installed successfully.
+
+
+
+after install:
+ - until we have a better solution: submit jobs not from the recommended worker nodes but from a compute node (since we need /datasets to be mounted for job submission).
+
diff --git a/shell_scripts/job_starter.sh b/shell_scripts/job_starter.sh
index 3792f8d..4e5add3 100644
--- a/shell_scripts/job_starter.sh
+++ b/shell_scripts/job_starter.sh
@@ -1,193 +1,193 @@
#!/bin/bash
#wrapper for cluster_runner_....sh which copies job-specific, frequently changing files (e.g. configs.py) before the actual bsub job
#is submitted, since the job might pend in the queue before execution --> hazard of job-specific files being unintentionally changed during queue wait time.

#positional args:
# -arg #1 identifies the folder name of the dataset-related code (e.g. >toy_exp< or >lidc_exp<) within the code source directory
# -arg #2 is the experiment name and first part of the job name.

# optional args and flags:
# -c / --create: (flag) whether to create the exp, i.e., if this is a new start of the exp with configs etc. from source dir.
# -f / --folds FOLDS: (option) fold(s) to run on (FOLDS needs to be one int or a string of multiple ints separated by spaces), default None (--> set to all in config)
# -m / --mode MODE: (option) string, one of "train", "train_test", "test", defaults to "train_test"
# -p / --exp_parent_dir: (option) name of parent_dir rel. to dataset folder on cluster. exp_dir is exp_parent_dir/exp_name; if not given, defaults to "experiments"
# -q / --queue: (option) which queue (-q parameter for bsub) to send the job to. default: gputest. others: gputest-short (max 5h jobs).
# -w / --which: (option) same as argument -m to bsub; host or host list (string separated by spaces) to send the job to.
#   use nodenameXX where XX==nr of node, or nodenameXX,nodenameYY,..., or nodename[XX-YY]. nodename is e.g. e132-comp.
# -R / --resource: (option) additional bsub resource/option string, appended verbatim to the bsub call.
# --gmem: (option) how much gpu memory to request for the job (in gigabytes), defaults to 11. Currently, the smaller nodes have 11.9G, the larger ones 31.7G.
# --resume: (flag) only with explicit fold argument; if set, resumes from checkpoint in exp_dir/fold_x/last_state.pth.
# --no_parallel: (flag) if set, folds won't start as parallel jobs on the cluster, but run sequentially in one job.
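# A typical call might look like the following line (illustrative only: "my_exp" is a placeholder
# experiment name; dataset folder and options as documented above):
#   sh job_starter.sh lidc_exp my_exp -c -f "0 1 2" -m train_test -q gputest --gmem 11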
dataset_name="${1}"
exp_name="${2}"

#arguments not passed, e.g. $7 if there is no seventh argument, are null.
if [ ! -z "${20}" ]; then #-z checks if is null string; 2 positional args + 7 value options + 3 flags = 19 args max
    echo "Error: Received too many arguments."
    exit
fi

#make args optional: move up if some args are missing in between
while [ ${#} -gt 2 ]; do
    case "${3}" in
    -c|--create)
        create_exp="c"
        shift
        ;;
    -f|--folds)
        folds="${4}"
        shift; shift
        ;;
    -m|--mode)
        mode="${4}"
        shift; shift
        ;;
    -p|--exp_parent_dir)
        exp_parent_dir="${4}"
        shift; shift
        ;;
    -q|--queue)
        queue="${4}"
        shift; shift
        ;;
    -w|--which)
        which="${4}"
        shift; shift
        ;;
    -R|--resource)
        resource="${4}"
        shift; shift
        ;;
    --gmem)
        gmem="${4}"
        shift; shift
        ;;
    --resume)
        resume=true
        shift
        ;;
    --no_parallel)
        no_parallel=true
        shift
        ;;
    *)
        echo "Invalid argument/option passed: ${3}"
        exit 1
        ;;
    esac
done

# default values
if [ -z ${exp_parent_dir} ]; then
    exp_parent_dir="experiments"
fi
if [ -z ${mode} ]; then
    mode="train_test"
fi
if [ -z ${queue} ]; then
    queue="gputest"
fi
if [ -z ${gmem} ]; then
    gmem="11"
fi

root_dir=/home/ramien #assumes /home/ramien exists

prep_node=ramien@e132-comp07 #node used for prep tasks like create_exp

#medicaldetectiontoolkit
source_dir=${root_dir}/mdt-public

dataset_abs_path=${source_dir}/experiments/${dataset_name} #derived from dataset_name (positional argument #1 of this script)
exp_parent_dir=/datasets/datasets_ramien/${dataset_name}/${exp_parent_dir}
#exp_parent_dir=/home/gregor/Documents/medicaldetectiontoolkit/datasets/${dataset_name}/experiments #for testing this script
# /datasets is not mounted on log-in/job-submission nodes (would maybe make sense, I feel), only on queue gputest's nodes e132-compXX.
#ssh ${prep_node} "mkdir -p ${exp_parent_dir}"
exp_dir=${exp_parent_dir}/${exp_name}

#activate virtualenv that has all the packages:
-source_dl="module load python/3.7.0; module load gcc/7.2.0; source ${root_dir}/.virtualenvs/deeplearning37/bin/activate;"
+source_dl="module load python/3.7.0; module load gcc/7.2.0; source ${root_dir}/.virtualenvs/mdt/bin/activate;"
# TODO as long as no fix is available: this script needs to be started directly from the prep node. :/ It would be nice if (most importantly)
# 'module ...' would also work over ssh, but somehow some commands are not available in the ssh-induced shell (even when using it as an interactive shell).
eval ${source_dl}

# ssh: (not working)
#create_cmd="ssh ${prep_node} '${source_dl} python ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path};'"
# directly from prep node:
create_cmd="python ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path};"

#if create_exp, check whether we would overwrite an existing exp_dir
if [ ! -z ${create_exp} ] && [ ${create_exp} = "c" ]; then #-n doesn't work as a replacement for ! -z here
    if [ -d ${exp_dir} ]; then
        echo "Please confirm to overwrite exp ${exp_name} settings, (Y/n): "; read confirmation
        if ([ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]); then
            echo "Overwriting ${exp_name}"
        else
            echo "Exiting due to overwrite denial. Adjust options."
            exit
        fi
    fi
    #echo "opts: name ${exp_name}, ${source_dir}/exec.py --server_env --mode create_exp --exp_dir ${exp_dir} --exp_source ${dataset_abs_path}"
    echo "Creating ${exp_name}"
    eval ${create_cmd}
else
    if [ ! -d ${exp_dir} ]; then
        echo "Experiment directory ${exp_dir} does not exist."
        echo "Run create_exp? (Y/n): "; read confirmation
        if ([ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]); then
            echo "Creating ${exp_name}"
            eval ${create_cmd}
        fi
    fi
fi
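# For orientation, the exp_dir layout this script works against looks roughly as sketched below
# (inferred from the paths used further down; fold_0 is just an illustrative fold):
#   ${exp_dir}/fold_0/last_state.pth        # checkpoint picked up when --resume is set
#   ${exp_dir}/logs/fold_0_lsf_output.out   # per-fold LSF output file (parallel fold jobs)
#   ${exp_dir}/lsf_output.out               # single LSF output file (--no_parallel mode)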
#if not create_exp, check whether we would overwrite existing folds (possibly valuable trained params!)
if [ -z ${create_exp} ] && ([ ${mode} = "train" ] || [ ${mode} = "train_test" ]) && [ -z "${resume}" ]; then
    for f in ${folds}; do #if folds is null this check won't apply and folds will be quietly overwritten.
        if [ -d ${exp_dir}/fold_${f} ]; then #-d checks if is dir
            echo "Please confirm to overwrite fold_${f}, (Y/n):"; read confirmation
            if ([ "${confirmation}" = "y" ] || [ "${confirmation}" = "yes" ] || [ "${confirmation}" = "Y" ] || [ -z "${confirmation}" ]); then
                echo "Overwriting ${exp_name}/fold_${f}"
            else
                echo "Exiting due to overwrite denial. Adjust options."
                exit
            fi
        fi
    done
fi

bsub_opts="bsub -N -q ${queue} -gpu num=1:j_exclusive=yes:mode=exclusive_process:gmem=${gmem}G"
if [ ! -z "${resource}" ]; then
    bsub_opts="${bsub_opts} ${resource}" #quotes needed, otherwise this line tries to execute ${resource} as a command
fi
if [ ! -z ${which} ]; then
    bsub_opts="${bsub_opts} -m ${which}"
fi

#----- parallel/separate fold jobs (each fold in a single job) -----------
if [ ! -z "${folds}" ] && [ -z ${no_parallel} ]; then
    #WHY do i need to convert to string again?
    for f in ${folds}; do
        out_file=${exp_dir}/logs/fold_${f}_lsf_output.out
        #per-fold options in a separate variable so -J/-oo don't accumulate across folds
        fold_bsub_opts="${bsub_opts} -J '${dataset_name} ${exp_name} fold ${f} ${mode}' -oo '${out_file}'"
        eval "${fold_bsub_opts} sh cluster_runner_meddec.sh ${source_dir} ${exp_dir} ${dataset_abs_path} ${mode} ${f} ${resume}"
    done
#----- consecutive folds job (all folds in one single job) -----------
else
    if [ ! -z ${resume} ]; then
        echo "You need to explicitly specify folds if you would like to resume from a checkpoint. Exiting."
        exit
    fi
    out_file=${exp_dir}/lsf_output.out
    bsub_opts="${bsub_opts} -J '${dataset_name} ${exp_name} folds ${folds} ${mode}' -oo '${out_file}'"
    eval "${bsub_opts} sh cluster_runner_meddec.sh ${source_dir} ${exp_dir} ${dataset_abs_path} ${mode} ${folds} ${resume}"
    echo "Started in no_parallel mode, folds:" ${folds}
fi
diff --git a/shell_scripts/update_scripts_on_cluster.sh b/shell_scripts/update_scripts_on_cluster.sh
index aa6c934..e5f6890 100644
--- a/shell_scripts/update_scripts_on_cluster.sh
+++ b/shell_scripts/update_scripts_on_cluster.sh
@@ -1,18 +1,18 @@
#cluster sync
rootp=/home/gregor/Documents
server=ramien@odcf-lsf01.dkfz.de
#server=ramien@e132-comp04
serverroot=${server}:/home/ramien

#---- medicaldetectiontoolkit -----
codep=${rootp}/mdt-public
server_codep=${serverroot}/mdt-public

-rsync -avhe "ssh -i /home/gregor/.ssh/id_rsa" ${rootp}/mdt-public/shell_scripts/dummy_runner.sh ${rootp}/mdt-public/shell_scripts/cluster_runner_meddec.sh ${rootp}/mdt-public/shell_scripts/job_starter.sh ${server_codep}
+rsync -avhe "ssh -i /home/gregor/.ssh/id_rsa" ${rootp}/mdt-public/shell_scripts/cluster_runner_meddec.sh ${rootp}/mdt-public/shell_scripts/job_starter.sh ${server_codep}
rsync -avhe "ssh -i /home/gregor/.ssh/id_rsa" ${rootp}/environmental/job_scheduler,cluster/bpeek_wrapper.sh ${serverroot}

# add/remove --include 'custom_extensions/**/*.whl' for compiled c++/CUDA exts
-rsync -avhe "ssh -i /home/gregor/.ssh/id_rsa" --include '*/' --include '*.py' --include 'requirements.txt' --include 'custom_extensions/**/*.whl' --exclude '*' --prune-empty-dirs ${codep} ${serverroot}
+rsync -avhe "ssh -i /home/gregor/.ssh/id_rsa" --include '*/' --include '*.py' --include '*.cpp' --include '*.cu' --include '*.h' --include 'requirements.txt' --exclude '*' --prune-empty-dirs ${codep} ${serverroot}
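# Optional sanity check (illustrative sketch, same paths and filters as above): running the last sync
# with rsync's -n/--dry-run flag only lists what would be transferred, which is a cheap way to verify
# the include/exclude filters before the real copy:
#rsync -n -avhe "ssh -i /home/gregor/.ssh/id_rsa" --include '*/' --include '*.py' --include '*.cpp' --include '*.cu' --include '*.h' --include 'requirements.txt' --exclude '*' --prune-empty-dirs ${codep} ${serverroot}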