From 582e15d83fc36a37cc6505d512067b3643aa14b5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 21 Apr 2023 14:01:47 +0000 Subject: [PATCH 01/40] remove drain and resume functionality --- README.md | 52 +---------------------------------------------- defaults/main.yml | 3 --- tasks/drain.yml | 25 ----------------------- tasks/main.yml | 10 --------- tasks/resume.yml | 25 ----------------------- 5 files changed, 1 insertion(+), 114 deletions(-) delete mode 100644 tasks/drain.yml delete mode 100644 tasks/resume.yml diff --git a/README.md b/README.md index 35c1ba0..f8ff59a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # stackhpc.openhpc -This Ansible role installs packages and performs configuration to provide an OpenHPC Slurm cluster. It can also be used to drain and resume nodes. +This Ansible role installs packages and performs configuration to provide an OpenHPC Slurm cluster. As a role it must be used from a playbook, for which a simple example is given below. This approach means it is totally modular with no assumptions about available networks or any cluster features except for some hostname conventions. Any desired cluster fileystem or other required functionality may be freely integrated using additional Ansible roles or other approaches. @@ -37,8 +37,6 @@ each list element: * `database`: whether to enable slurmdbd * `batch`: whether to enable compute nodes * `runtime`: whether to enable OpenHPC runtime -* `drain`: whether to drain compute nodes -* `resume`: whether to resume compute nodes `openhpc_slurmdbd_host`: Optional. Where to deploy slurmdbd if are using this role to deploy slurmdbd, otherwise where an existing slurmdbd is running. This should be the name of a host in your inventory. Set this to `none` to prevent the role from managing slurmdbd. Defaults to `openhpc_slurm_control_host`. @@ -176,54 +174,6 @@ To deploy, create a playbook which looks like this: openhpc_packages: [] ... -To drain nodes, for example, before scaling down the cluster to 6 nodes: - - --- - - hosts: openstack - gather_facts: false - vars: - partition: "{{ cluster_group.output_value | selectattr('group', 'equalto', item.name) | list }}" - openhpc_slurm_partitions: - - name: "compute" - flavor: "compute-A" - image: "CentOS7.5-OpenHPC" - num_nodes: 6 - user: "centos" - openhpc_cluster_name: openhpc - roles: - # Our stackhpc.cluster-infra role can be invoked in `query` mode which - # looks up the state of the cluster by querying the Heat API. - - role: stackhpc.cluster-infra - cluster_name: "{{ cluster_name }}" - cluster_state: query - cluster_params: - cluster_groups: "{{ cluster_groups }}" - tasks: - # Given that the original cluster that was created had 8 nodes and the - # cluster we want to create has 6 nodes, the computed desired_state - # variable stores the list of instances to leave untouched. - - name: Count the number of compute nodes per slurm partition - set_fact: - desired_state: "{{ (( partition | first).nodes | map(attribute='name') | list )[:item.num_nodes] + desired_state | default([]) }}" - when: partition | length > 0 - with_items: "{{ openhpc_slurm_partitions }}" - - debug: var=desired_state - - - hosts: cluster_batch - become: yes - vars: - desired_state: "{{ hostvars['localhost']['desired_state'] | default([]) }}" - roles: - # Now, the stackhpc.openhpc role is invoked in drain/resume modes where - # the instances in desired_state are resumed if in a drained state and - # drained if in a resumed state. 
- - role: stackhpc.openhpc - openhpc_slurm_control_host: "{{ groups['cluster_control'] | first }}" - openhpc_enable: - drain: "{{ inventory_hostname not in desired_state }}" - resume: "{{ inventory_hostname in desired_state }}" - ... - --- 1 Slurm 20.11 removed `accounting_storage/filetxt` as an option. This version of Slurm was introduced in OpenHPC v2.1 but the OpenHPC repos are common to all OpenHPC v2.x releases. [↩](#accounting_storage) diff --git a/defaults/main.yml b/defaults/main.yml index 85b5766..39d13c7 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -8,7 +8,6 @@ openhpc_slurm_partitions: [] openhpc_cluster_name: openhpc_packages: - slurm-libpmi-ohpc -openhpc_drain_timeout: 86400 openhpc_resume_timeout: 300 openhpc_retry_delay: 10 openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some formats to seconds, which is interpreted as minutes by Slurm @@ -44,8 +43,6 @@ openhpc_enable: batch: false database: false runtime: false - drain: false - resume: false ohpc_slurm_services: control: slurmctld batch: slurmd diff --git a/tasks/drain.yml b/tasks/drain.yml deleted file mode 100644 index 8094508..0000000 --- a/tasks/drain.yml +++ /dev/null @@ -1,25 +0,0 @@ ---- -# Ansible tasks to drain a Slurm compute node. Waits for the compute node to be -# drained for up to a day by default. -# -# Variables: -# - node_to_drain: compute node to drain -# - drain_timeout: seconds to wait for node to drain, default is 86400. - -- name: Get nodes in DRAINED state - command: "sinfo --noheader --Node --format='%N' --states=DRAINED" - register: drained_nodes_results - changed_when: false - -- name: Drain compute node - command: "scontrol update nodename={{ inventory_hostname }} state=DRAIN reason='maintenance'" - when: inventory_hostname not in drained_nodes_results.stdout_lines - changed_when: true - -- name: Check node has drained - command: "sinfo --noheader --Node --format='%N' --states=DRAINED" - register: drained_nodes - until: "inventory_hostname in drained_nodes.stdout_lines" - delay: "{{ openhpc_retry_delay }}" - retries: "{{ (openhpc_drain_timeout / openhpc_retry_delay) | int }}" - changed_when: false diff --git a/tasks/main.yml b/tasks/main.yml index a767786..66d51b4 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -32,14 +32,4 @@ - openhpc_slurm_service_started | bool tags: post-configure -- name: Run drain or resume tasks - block: - - name: Run drain tasks - include_tasks: drain.yml - when: openhpc_enable.drain | default(false) | bool - - - name: Run resume tasks - include_tasks: resume.yml - when: openhpc_enable.resume | default(false) | bool - delegate_to: "{{ openhpc_slurm_control_host }}" ... diff --git a/tasks/resume.yml b/tasks/resume.yml deleted file mode 100644 index 94e25a7..0000000 --- a/tasks/resume.yml +++ /dev/null @@ -1,25 +0,0 @@ ---- -# Ansible tasks to resume a Slurm compute node. Waits for the compute node to -# change state for 5 minutes by default. -# -# Variables: -# - nodes_to_resume: compute node to resume -# - resume_timeout: seconds to wait for node to resume, default is 300. 
- -- name: Get nodes in ALLOC,IDLE states - command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE" - register: resumed_nodes_results - changed_when: false - -- name: Resume compute node - command: "scontrol update nodename={{ inventory_hostname }} state=RESUME" - when: inventory_hostname not in resumed_nodes_results.stdout_lines - changed_when: true - -- name: Check node has resumed - command: "sinfo --noheader --Node --format='%N' --states=ALLOC,IDLE" - register: resumed_nodes - until: "inventory_hostname in resumed_nodes.stdout_lines" - delay: "{{ openhpc_retry_delay }}" - retries: "{{ (openhpc_resume_timeout / openhpc_retry_delay) | int }}" - changed_when: false From b5af1865f0fe52627e1c4b137ec7fa184378087d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 25 Apr 2023 08:26:23 +0000 Subject: [PATCH 02/40] allow install and runtime taskbooks to be used directly --- handlers/main.yml | 1 - tasks/install.yml | 9 ++++++++- tasks/main.yml | 12 ------------ tasks/runtime.yml | 23 +++++++++++++++++++---- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/handlers/main.yml b/handlers/main.yml index c86604b..0a086a1 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -60,4 +60,3 @@ state: restarted when: - openhpc_slurm_service_started | bool - - openhpc_slurm_service == 'slurmd' diff --git a/tasks/install.yml b/tasks/install.yml index ff87327..b90bb37 100644 --- a/tasks/install.yml +++ b/tasks/install.yml @@ -1,5 +1,12 @@ --- +- name: Enable batch on configless login-only nodes + set_fact: + openhpc_enable: "{{ openhpc_enable | combine({'batch':true}) }}" + when: + - openhpc_slurm_configless + - openhpc_login_only_nodes in group_names + - name: Ensure OpenHPC repos ansible.builtin.yum_repository: name: "{{ item.name }}" @@ -38,7 +45,7 @@ set_fact: openhpc_slurm_pkglist: "{{ openhpc_slurm_pkglist | default([]) + item.value }}" loop: "{{ ohpc_slurm_packages | dict2items }}" - when: openhpc_enable.get(item.key, false) + when: (openhpc_enable.get(item.key, false)) or () - name: Install required slurm packages yum: diff --git a/tasks/main.yml b/tasks/main.yml index 66d51b4..721cb81 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -1,17 +1,5 @@ --- -- name: Select slurm service to control - set_fact: - openhpc_slurm_service: "{{ ohpc_slurm_services[item] }}" - loop: "{{ ohpc_slurm_services.keys() | list }}" - when: "openhpc_enable.get(item, false)" - tags: always - -- name: Set slurmd as service for openhpc_login_only_nodes - set_fact: - openhpc_slurm_service: "slurmd" - when: openhpc_login_only_nodes and (openhpc_login_only_nodes in group_names) - - name: Install packages block: - include_tasks: install.yml diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 749e108..93b7f5a 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -20,6 +20,13 @@ - openhpc_slurm_control_host not in ansible_play_hosts - not openhpc_munge_key +- name: Enable batch on configless login-only nodes + set_fact: + openhpc_enable: "{{ openhpc_enable | combine({'batch':true}) }}" + when: + - openhpc_slurm_configless + - openhpc_login_only_nodes in group_names + - name: Ensure Slurm directories exists file: path: "{{ openhpc_state_save_location }}" @@ -150,7 +157,7 @@ group: root mode: 0644 when: - - openhpc_slurm_service == 'slurmd' + - openhpc_enable.batch | default(false) - openhpc_slurm_configless notify: - Restart slurmd service @@ -167,15 +174,23 @@ - name: Flush handler meta: flush_handlers # as then subsequent "ensure" is a no-op if slurm services bounced -- name: Ensure 
slurmdbd is started and running +- name: Ensure slurmdbd state service: name: slurmdbd enabled: "{{ openhpc_slurm_service_enabled | bool }}" state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" when: openhpc_enable.database | default(false) | bool -- name: Ensure Slurm service state +- name: Ensure slurmctld state + service: + name: slurmd + enabled: "{{ openhpc_slurm_service_enabled | bool }}" + state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + when: openhpc_enable.control | default(false) | bool + +- name: Ensure slurmd state service: - name: "{{ openhpc_slurm_service }}" + name: slurmd enabled: "{{ openhpc_slurm_service_enabled | bool }}" state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" + when: openhpc_enable.batch | default(false) | bool \ No newline at end of file From 47b2fd12c6fa51f734d0b7c974bd40293bef860d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 12 May 2023 12:13:09 +0000 Subject: [PATCH 03/40] fix linter complaints --- tasks/install.yml | 2 +- tasks/runtime.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/install.yml b/tasks/install.yml index b90bb37..7ba69f8 100644 --- a/tasks/install.yml +++ b/tasks/install.yml @@ -2,7 +2,7 @@ - name: Enable batch on configless login-only nodes set_fact: - openhpc_enable: "{{ openhpc_enable | combine({'batch':true}) }}" + openhpc_enable: "{{ openhpc_enable | combine({'batch': true}) }}" when: - openhpc_slurm_configless - openhpc_login_only_nodes in group_names diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 000f6c8..534eb4e 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -22,7 +22,7 @@ - name: Enable batch on configless login-only nodes set_fact: - openhpc_enable: "{{ openhpc_enable | combine({'batch':true}) }}" + openhpc_enable: "{{ openhpc_enable | combine({'batch': true}) }}" when: - openhpc_slurm_configless - openhpc_login_only_nodes in group_names From fe139b2d72c11ff3b1ea9e6dcaa896a9b0594cf7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 12 May 2023 12:19:01 +0000 Subject: [PATCH 04/40] fix slurmctld state --- tasks/runtime.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 534eb4e..5dcf607 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -196,7 +196,7 @@ - name: Ensure slurmctld state service: - name: slurmd + name: slurmctld enabled: "{{ openhpc_slurm_service_enabled | bool }}" state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" when: openhpc_enable.control | default(false) | bool From 080cf978b2b536e56d1d2d75ed381ef608c7e0a2 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 09:49:03 +0000 Subject: [PATCH 05/40] move common tasks to pre.yml --- tasks/install.yml | 7 +------ tasks/pre.yml | 13 +++++++++++++ tasks/runtime.yml | 10 +++------- 3 files changed, 17 insertions(+), 13 deletions(-) create mode 100644 tasks/pre.yml diff --git a/tasks/install.yml b/tasks/install.yml index 7ba69f8..715ac22 100644 --- a/tasks/install.yml +++ b/tasks/install.yml @@ -1,11 +1,6 @@ --- -- name: Enable batch on configless login-only nodes - set_fact: - openhpc_enable: "{{ openhpc_enable | combine({'batch': true}) }}" - when: - - openhpc_slurm_configless - - openhpc_login_only_nodes in group_names +- include_tasks: pre.yml - name: Ensure OpenHPC repos ansible.builtin.yum_repository: diff --git a/tasks/pre.yml b/tasks/pre.yml new file mode 100644 index 0000000..abc5042 --- 
/dev/null +++ b/tasks/pre.yml @@ -0,0 +1,13 @@ +- name: Enable batch on configless login-only nodes + set_fact: + openhpc_enable: "{{ openhpc_enable | combine({'batch': true}) }}" + when: + - openhpc_slurm_configless + - openhpc_login_only_nodes in group_names + +- name: Select slurm service to control + set_fact: + openhpc_slurm_service: "{{ ohpc_slurm_services[item] }}" + loop: "{{ ohpc_slurm_services.keys() | list }}" + when: "openhpc_enable.get(item, false)" + tags: always diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 1d1decd..73c9bdb 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -1,4 +1,7 @@ --- + +- include_tasks: pre.yml + - name: Check openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions exist assert: that: @@ -20,13 +23,6 @@ - openhpc_slurm_control_host not in ansible_play_hosts - not openhpc_munge_key -- name: Enable batch on configless login-only nodes - set_fact: - openhpc_enable: "{{ openhpc_enable | combine({'batch': true}) }}" - when: - - openhpc_slurm_configless - - openhpc_login_only_nodes in group_names - - name: Ensure Slurm directories exists file: path: "{{ openhpc_state_save_location }}" From f83e334c58c8ec744130a0f6eb3a54d65cd75abf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 10:02:52 +0000 Subject: [PATCH 06/40] remove unused openhpc_slurm_service --- tasks/pre.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tasks/pre.yml b/tasks/pre.yml index abc5042..3c0341c 100644 --- a/tasks/pre.yml +++ b/tasks/pre.yml @@ -4,10 +4,3 @@ when: - openhpc_slurm_configless - openhpc_login_only_nodes in group_names - -- name: Select slurm service to control - set_fact: - openhpc_slurm_service: "{{ ohpc_slurm_services[item] }}" - loop: "{{ ohpc_slurm_services.keys() | list }}" - when: "openhpc_enable.get(item, false)" - tags: always From 77a628faf799823e155f2c2ee34708452e9cbdb3 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 11:22:36 +0000 Subject: [PATCH 07/40] fix ini_file use for some community.general versions --- tasks/runtime.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 73c9bdb..a351941 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -100,7 +100,7 @@ community.general.ini_file: path: "{{ _slurm_conf_tmpfile.path }}" option: "{{ item.key }}" - section: null + section: '' value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" no_extra_spaces: true create: no From 5d88ca56b543b4a931119e5449be52fed16aabfe Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 11:23:20 +0000 Subject: [PATCH 08/40] fix var precedence in molecule test13 --- molecule/test13/converge.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/molecule/test13/converge.yml b/molecule/test13/converge.yml index 625f278..6cfe96c 100644 --- a/molecule/test13/converge.yml +++ b/molecule/test13/converge.yml @@ -1,21 +1,21 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_control'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_control'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true + openhpc_login_only_nodes: 'testohpc_login' + openhpc_config: + FirstJobId: 13 + SlurmctldSyslogDebug: error 
tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_control'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_control'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - openhpc_login_only_nodes: 'testohpc_login' - openhpc_config: - FirstJobId: 13 - SlurmctldSyslogDebug: error From 33ad0e23cfa4c237c218010e970f313b665f58d9 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 11:35:54 +0000 Subject: [PATCH 09/40] fix var precedence in all molecule tests --- molecule/test1/converge.yml | 19 ++++++++------- molecule/test10/converge.yml | 20 ++++++++-------- molecule/test14/converge.yml | 45 ++++++++++++++++++------------------ molecule/test1b/converge.yml | 19 ++++++++------- molecule/test1c/converge.yml | 21 ++++++++--------- molecule/test2/converge.yml | 21 ++++++++--------- molecule/test3/converge.yml | 25 ++++++++++---------- molecule/test4/converge.yml | 28 +++++++++++----------- molecule/test5/converge.yml | 21 ++++++++--------- molecule/test6/converge.yml | 19 ++++++++------- molecule/test7/converge.yml | 22 +++++++++--------- molecule/test8/converge.yml | 23 +++++++++--------- molecule/test9/converge.yml | 23 +++++++++--------- 13 files changed, 149 insertions(+), 157 deletions(-) diff --git a/molecule/test1/converge.yml b/molecule/test1/converge.yml index 728db01..0408415 100644 --- a/molecule/test1/converge.yml +++ b/molecule/test1/converge.yml @@ -1,17 +1,16 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - diff --git a/molecule/test10/converge.yml b/molecule/test10/converge.yml index a8a1908..aa862b4 100644 --- a/molecule/test10/converge.yml +++ b/molecule/test10/converge.yml @@ -1,17 +1,17 @@ --- - name: Create initial cluster hosts: initial + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - 
name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true diff --git a/molecule/test14/converge.yml b/molecule/test14/converge.yml index f2daba0..f7db48c 100644 --- a/molecule/test14/converge.yml +++ b/molecule/test14/converge.yml @@ -1,30 +1,29 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + extra_nodes: + # Need to specify IPs for the non-existent State=DOWN nodes, because otherwise even in this state slurmctld will exclude a node with no lookup information from the config. + # We use invalid IPs here (i.e. starting 0.) to flag the fact the nodes shouldn't exist. + # Note this has to be done via slurm config rather than /etc/hosts due to Docker limitations on modifying the latter. + - NodeName: fake-x,fake-y + NodeAddr: 0.42.42.0,0.42.42.1 + State: DOWN + CPUs: 1 + - NodeName: fake-2cpu-[3,7-9] + NodeAddr: 0.42.42.3,0.42.42.7,0.42.42.8,0.42.42.9 + State: DOWN + CPUs: 2 + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - extra_nodes: - # Need to specify IPs for the non-existent State=DOWN nodes, because otherwise even in this state slurmctld will exclude a node with no lookup information from the config. - # We use invalid IPs here (i.e. starting 0.) to flag the fact the nodes shouldn't exist. - # Note this has to be done via slurm config rather than /etc/hosts due to Docker limitations on modifying the latter. 
- - NodeName: fake-x,fake-y - NodeAddr: 0.42.42.0,0.42.42.1 - State: DOWN - CPUs: 1 - - NodeName: fake-2cpu-[3,7-9] - NodeAddr: 0.42.42.3,0.42.42.7,0.42.42.8,0.42.42.9 - State: DOWN - CPUs: 2 - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - diff --git a/molecule/test1b/converge.yml b/molecule/test1b/converge.yml index 728db01..0408415 100644 --- a/molecule/test1b/converge.yml +++ b/molecule/test1b/converge.yml @@ -1,17 +1,16 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - diff --git a/molecule/test1c/converge.yml b/molecule/test1c/converge.yml index 63d4448..341cd9e 100644 --- a/molecule/test1c/converge.yml +++ b/molecule/test1c/converge.yml @@ -1,18 +1,17 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_service_enabled: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_service_enabled: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - diff --git a/molecule/test2/converge.yml b/molecule/test2/converge.yml index fc93e28..1433682 100644 --- a/molecule/test2/converge.yml +++ b/molecule/test2/converge.yml @@ -1,18 +1,17 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "part1" + - name: "part2" + openhpc_cluster_name: testohpc tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "part1" - - name: "part2" - openhpc_cluster_name: testohpc - diff --git a/molecule/test3/converge.yml b/molecule/test3/converge.yml index e1f6449..7805064 100644 --- a/molecule/test3/converge.yml +++ b/molecule/test3/converge.yml @@ 
-1,20 +1,21 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + groups: + - name: "grp1" + - name: "grp2" + openhpc_cluster_name: testohpc tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - groups: - - name: "grp1" - - name: "grp2" - openhpc_cluster_name: testohpc + diff --git a/molecule/test4/converge.yml b/molecule/test4/converge.yml index 4397b71..ec83f10 100644 --- a/molecule/test4/converge.yml +++ b/molecule/test4/converge.yml @@ -1,21 +1,21 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + database: "{{ inventory_hostname in groups['testohpc_login'] }}" + runtime: true + openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' + openhpc_slurmdbd_mysql_database: slurm_acct_db + openhpc_slurmdbd_mysql_password: secure-password + openhpc_slurmdbd_mysql_username: slurm + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - database: "{{ inventory_hostname in groups['testohpc_login'] }}" - runtime: true - openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' - openhpc_slurmdbd_mysql_database: slurm_acct_db - openhpc_slurmdbd_mysql_password: secure-password - openhpc_slurmdbd_mysql_username: slurm - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc diff --git a/molecule/test5/converge.yml b/molecule/test5/converge.yml index 2964a61..0ac4e91 100644 --- a/molecule/test5/converge.yml +++ b/molecule/test5/converge.yml @@ -1,18 +1,17 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - diff --git 
a/molecule/test6/converge.yml b/molecule/test6/converge.yml index 6a29645..52d6d50 100644 --- a/molecule/test6/converge.yml +++ b/molecule/test6/converge.yml @@ -1,17 +1,16 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_login'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" + openhpc_slurm_partitions: + - name: "n/a" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_login'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_login'] | first }}" - openhpc_slurm_partitions: - - name: "n/a" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - diff --git a/molecule/test7/converge.yml b/molecule/test7/converge.yml index 0089cc1..31c629d 100644 --- a/molecule/test7/converge.yml +++ b/molecule/test7/converge.yml @@ -1,6 +1,17 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + batch: true + runtime: true + openhpc_slurm_service_started: false + openhpc_slurm_control_host: testohpc-login-0 + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true + openhpc_munge_key: "{{ specified_munge_key.content | b64decode }}" tasks: - name: Generate munge key on ansible control host (so can verify) command: "dd if=/dev/urandom of=/tmp/ansible-role-openhpc-test7 bs=1 count=1024" # can't use tmpfile as not idempotent @@ -17,14 +28,3 @@ - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - batch: true - runtime: true - openhpc_slurm_service_started: false - openhpc_slurm_control_host: testohpc-login-0 - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - openhpc_munge_key: "{{ specified_munge_key.content | b64decode }}" diff --git a/molecule/test8/converge.yml b/molecule/test8/converge.yml index f5730ab..33a8e23 100644 --- a/molecule/test8/converge.yml +++ b/molecule/test8/converge.yml @@ -1,19 +1,18 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ inventory_hostname in groups['testohpc_control'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_control'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true + openhpc_login_only_nodes: 'testohpc_login' tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_control'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_control'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - openhpc_login_only_nodes: 'testohpc_login' - diff --git a/molecule/test9/converge.yml b/molecule/test9/converge.yml index f5730ab..33a8e23 100644 --- a/molecule/test9/converge.yml +++ b/molecule/test9/converge.yml @@ -1,19 +1,18 @@ --- - name: Converge hosts: all + vars: + openhpc_enable: + control: "{{ 
inventory_hostname in groups['testohpc_control'] }}" + batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" + runtime: true + openhpc_slurm_control_host: "{{ groups['testohpc_control'] | first }}" + openhpc_slurm_partitions: + - name: "compute" + openhpc_cluster_name: testohpc + openhpc_slurm_configless: true + openhpc_login_only_nodes: 'testohpc_login' tasks: - name: "Include ansible-role-openhpc" include_role: name: "{{ lookup('env', 'MOLECULE_PROJECT_DIRECTORY') | basename }}" - vars: - openhpc_enable: - control: "{{ inventory_hostname in groups['testohpc_control'] }}" - batch: "{{ inventory_hostname in groups['testohpc_compute'] }}" - runtime: true - openhpc_slurm_control_host: "{{ groups['testohpc_control'] | first }}" - openhpc_slurm_partitions: - - name: "compute" - openhpc_cluster_name: testohpc - openhpc_slurm_configless: true - openhpc_login_only_nodes: 'testohpc_login' - From 96834013e3594db1fe90680cba1ff69e1311ca8c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 12:39:25 +0000 Subject: [PATCH 10/40] fix slurmd always starting on control node --- handlers/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/handlers/main.yml b/handlers/main.yml index 0a086a1..0dda917 100644 --- a/handlers/main.yml +++ b/handlers/main.yml @@ -60,3 +60,5 @@ state: restarted when: - openhpc_slurm_service_started | bool + - openhpc_enable.batch | default(false) | bool + # 2nd condition required as notification happens on controller, which isn't necessarily a compute note From d4163bc69164998d5e064aabe59f131a7948a4a8 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 13:08:52 +0000 Subject: [PATCH 11/40] move install to install-ohpc.yml --- tasks/{install.yml => install-ohpc.yml} | 0 tasks/main.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename tasks/{install.yml => install-ohpc.yml} (100%) diff --git a/tasks/install.yml b/tasks/install-ohpc.yml similarity index 100% rename from tasks/install.yml rename to tasks/install-ohpc.yml diff --git a/tasks/main.yml b/tasks/main.yml index 721cb81..48a1308 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -2,7 +2,7 @@ - name: Install packages block: - - include_tasks: install.yml + - include_tasks: install-ohpc.yml when: openhpc_enable.runtime | default(false) | bool tags: install From d4c56214fdbec2c1cfdf7a5cdffbacf84eba9bac Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 13:11:35 +0000 Subject: [PATCH 12/40] remove unused ohpc_slurm_services var --- defaults/main.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/defaults/main.yml b/defaults/main.yml index 4f69871..c345eda 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -45,9 +45,6 @@ openhpc_enable: batch: false database: false runtime: false -ohpc_slurm_services: - control: slurmctld - batch: slurmd # Repository configuration openhpc_extra_repos: [] From 5090860dcc0fe6839dfe6b5932835d423d1f18ab Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 14:12:41 +0000 Subject: [PATCH 13/40] add install-generic for binary-only install --- defaults/main.yml | 8 ++++++ tasks/install-generic.yml | 49 ++++++++++++++++++++++++++++++++++ templates/slurmctld.service.j2 | 22 +++++++++++++++ templates/slurmd.service.j2 | 25 +++++++++++++++++ templates/slurmdbd.service.j2 | 22 +++++++++++++++ 5 files changed, 126 insertions(+) create mode 100644 tasks/install-generic.yml create mode 100644 templates/slurmctld.service.j2 create mode 100644 templates/slurmd.service.j2 create mode 100644 
templates/slurmdbd.service.j2 diff --git a/defaults/main.yml b/defaults/main.yml index c345eda..8cde1b7 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -46,6 +46,14 @@ openhpc_enable: database: false runtime: false +# Only used for install-generic.yml: +openhpc_generic_packages: + - munge + - mariadb-connector-c # only required on slurmdbd + - hwloc-libs # only required on slurmd +openhpc_binary_dir: +openhpc_library_dir: + # Repository configuration openhpc_extra_repos: [] diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml new file mode 100644 index 0000000..6a57af5 --- /dev/null +++ b/tasks/install-generic.yml @@ -0,0 +1,49 @@ +- include_tasks: pre.yml + +- name: Create a list of slurm daemons + set_fact: + _ohpc_daemons: "{{ _ohpc_daemon_map | dict2items | selectattr('value') | items2dict | list }}" + vars: + _ohpc_daemon_map: + slurmctld: "{{ openhpc_enable.control }}" + slurmd: "{{ openhpc_enable.batch }}" + slurmdbd: "{{ openhpc_enable.database }}" + +- name: Install system packages + dnf: + name: "{{ openhpc_generic_packages }}" + +- name: Create Slurm user + user: + name: slurm + comment: SLURM resource manager + home: /etc/slurm + shell: /sbin/nologin + +- name: Create Slurm unit files + template: + src: "{{ item }}.service.j2" + dest: /lib/systemd/system/{{ item }}.service + loop: "{{ _ohpc_daemons }}" + register: _slurm_systemd_units + +- name: Get current library locations + shell: + cmd: "ldconfig -v | grep -v ^$'\t'" + register: _slurm_ldconfig + changed_when: false + +- name: Add library locations to ldd search path + copy: + dest: /etc/ld.so.conf.d/slurm.conf + content: "{{ openhpc_library_dir }}" + owner: root + group: root + mode: ugo=r + when: openhpc_library_dir not in _ldd_paths + vars: + _ldd_paths: "{{ _slurm_ldconfig.stdout_lines | map('split', ':') | map('first') }}" + +- name: Reload Slurm unit files + command: systemctl daemon-reload + when: _slurm_systemd_units.changed diff --git a/templates/slurmctld.service.j2 b/templates/slurmctld.service.j2 new file mode 100644 index 0000000..0044f71 --- /dev/null +++ b/templates/slurmctld.service.j2 @@ -0,0 +1,22 @@ +[Unit] +Description=Slurm controller daemon +After=network-online.target munge.service +Wants=network-online.target +ConditionPathExists=/etc/slurm/slurm.conf + +[Service] +Type=simple +EnvironmentFile=-/etc/sysconfig/slurmctld +EnvironmentFile=-/etc/default/slurmctld +ExecStart={{ openhpc_binary_dir }}/slurmctld -D -s $SLURMCTLD_OPTIONS +ExecReload=/bin/kill -HUP $MAINPID +LimitNOFILE=65536 +TasksMax=infinity + +# Uncomment the following lines to disable logging through journald. +# NOTE: It may be preferable to set these through an override file instead. +#StandardOutput=null +#StandardError=null + +[Install] +WantedBy=multi-user.target diff --git a/templates/slurmd.service.j2 b/templates/slurmd.service.j2 new file mode 100644 index 0000000..8763111 --- /dev/null +++ b/templates/slurmd.service.j2 @@ -0,0 +1,25 @@ +[Unit] +Description=Slurm node daemon +After=munge.service network-online.target remote-fs.target +Wants=network-online.target + +[Service] +Type=simple +EnvironmentFile=-/etc/sysconfig/slurmd +EnvironmentFile=-/etc/default/slurmd +ExecStart={{ openhpc_binary_dir }}/slurmd -D -s $SLURMD_OPTIONS +ExecReload=/bin/kill -HUP $MAINPID +KillMode=process +LimitNOFILE=131072 +LimitMEMLOCK=infinity +LimitSTACK=infinity +Delegate=yes +TasksMax=infinity + +# Uncomment the following lines to disable logging through journald. 
+# NOTE: It may be preferable to set these through an override file instead. +#StandardOutput=null +#StandardError=null + +[Install] +WantedBy=multi-user.target diff --git a/templates/slurmdbd.service.j2 b/templates/slurmdbd.service.j2 new file mode 100644 index 0000000..fca4a53 --- /dev/null +++ b/templates/slurmdbd.service.j2 @@ -0,0 +1,22 @@ +[Unit] +Description=Slurm DBD accounting daemon +After=network-online.target munge.service mysql.service mysqld.service mariadb.service +Wants=network-online.target +ConditionPathExists=/etc/slurm/slurmdbd.conf + +[Service] +Type=simple +EnvironmentFile=-/etc/sysconfig/slurmdbd +EnvironmentFile=-/etc/default/slurmdbd +ExecStart={{ openhpc_binary_dir }}/slurmdbd -D -s $SLURMDBD_OPTIONS +ExecReload=/bin/kill -HUP $MAINPID +LimitNOFILE=65536 +TasksMax=infinity + +# Uncomment the following lines to disable logging through journald. +# NOTE: It may be preferable to set these through an override file instead. +#StandardOutput=null +#StandardError=null + +[Install] +WantedBy=multi-user.target From 253f2b191da6e3f55c0e6ab69114336e15dcfd87 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 14:29:51 +0000 Subject: [PATCH 14/40] distinguish between system and user slurm binaries for generic install --- defaults/main.yml | 5 +++-- tasks/install-generic.yml | 8 ++++++++ templates/slurmctld.service.j2 | 2 +- templates/slurmd.service.j2 | 2 +- templates/slurmdbd.service.j2 | 2 +- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/defaults/main.yml b/defaults/main.yml index 8cde1b7..78c157c 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -51,8 +51,9 @@ openhpc_generic_packages: - munge - mariadb-connector-c # only required on slurmdbd - hwloc-libs # only required on slurmd -openhpc_binary_dir: -openhpc_library_dir: +openhpc_sbin_dir: /usr/sbin # path to slurm daemon binaries (e.g. 
slurmctld) +openhpc_bin_dir: /usr/bin # path to slurm user binaries (e.g sinfo) +openhpc_library_dir: /usr/lib64/slurm # path to slurm libraries # Repository configuration openhpc_extra_repos: [] diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml index 6a57af5..964791f 100644 --- a/tasks/install-generic.yml +++ b/tasks/install-generic.yml @@ -47,3 +47,11 @@ - name: Reload Slurm unit files command: systemctl daemon-reload when: _slurm_systemd_units.changed + +- name: Add slurm user binaries to PATH + copy: + dest: /etc/profile.d/slurm.sh + content: PATH=$PATH:{{ openhpc_bin_dir }} + owner: root + group: root + mode: u=rw,go=r diff --git a/templates/slurmctld.service.j2 b/templates/slurmctld.service.j2 index 0044f71..6376766 100644 --- a/templates/slurmctld.service.j2 +++ b/templates/slurmctld.service.j2 @@ -8,7 +8,7 @@ ConditionPathExists=/etc/slurm/slurm.conf Type=simple EnvironmentFile=-/etc/sysconfig/slurmctld EnvironmentFile=-/etc/default/slurmctld -ExecStart={{ openhpc_binary_dir }}/slurmctld -D -s $SLURMCTLD_OPTIONS +ExecStart={{ openhpc_sbin_dir }}/slurmctld -D -s $SLURMCTLD_OPTIONS ExecReload=/bin/kill -HUP $MAINPID LimitNOFILE=65536 TasksMax=infinity diff --git a/templates/slurmd.service.j2 b/templates/slurmd.service.j2 index 8763111..501d0e9 100644 --- a/templates/slurmd.service.j2 +++ b/templates/slurmd.service.j2 @@ -7,7 +7,7 @@ Wants=network-online.target Type=simple EnvironmentFile=-/etc/sysconfig/slurmd EnvironmentFile=-/etc/default/slurmd -ExecStart={{ openhpc_binary_dir }}/slurmd -D -s $SLURMD_OPTIONS +ExecStart={{ openhpc_sbin_dir }}/slurmd -D -s $SLURMD_OPTIONS ExecReload=/bin/kill -HUP $MAINPID KillMode=process LimitNOFILE=131072 diff --git a/templates/slurmdbd.service.j2 b/templates/slurmdbd.service.j2 index fca4a53..3a2bfec 100644 --- a/templates/slurmdbd.service.j2 +++ b/templates/slurmdbd.service.j2 @@ -8,7 +8,7 @@ ConditionPathExists=/etc/slurm/slurmdbd.conf Type=simple EnvironmentFile=-/etc/sysconfig/slurmdbd EnvironmentFile=-/etc/default/slurmdbd -ExecStart={{ openhpc_binary_dir }}/slurmdbd -D -s $SLURMDBD_OPTIONS +ExecStart={{ openhpc_sbin_dir }}/slurmdbd -D -s $SLURMDBD_OPTIONS ExecReload=/bin/kill -HUP $MAINPID LimitNOFILE=65536 TasksMax=infinity From 1b92b5ee008a1e1230c20ed3fc0ad015e818a3ba Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 14:54:56 +0000 Subject: [PATCH 15/40] remove support for CentOS7 / OpenHPC --- .github/workflows/ci.yml | 25 +------------------------ README.md | 8 +++----- defaults/main.yml | 22 +--------------------- molecule/README.md | 3 +-- tasks/install.yml | 14 +------------- tasks/runtime.yml | 7 +------ vars/{ohpc-2 => main.yml} | 0 vars/ohpc-1.3 | 20 -------------------- 8 files changed, 8 insertions(+), 91 deletions(-) rename vars/{ohpc-2 => main.yml} (100%) delete mode 100644 vars/ohpc-1.3 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c070832..6c13daf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,6 @@ jobs: fail-fast: false matrix: image: - - 'centos:7' - 'rockylinux:8.8' scenario: - test1 @@ -44,29 +43,7 @@ jobs: - test13 - test14 - exclude: - - image: 'centos:7' - scenario: test5 - - image: 'centos:7' - scenario: test6 - - image: 'centos:7' - scenario: test7 - - image: 'centos:7' - scenario: test8 - - image: 'centos:7' - scenario: test9 - - image: 'centos:7' - scenario: test10 - - image: 'centos:7' - scenario: test11 - - image: 'centos:7' - scenario: test12 - - image: 'centos:7' - scenario: test13 - - image: 'centos:7' - scenario: 
test14 - - image: 'centos:7' - scenario: test15 + exclude: [] steps: - name: Check out the codebase. diff --git a/README.md b/README.md index dbcfc7c..71d0f35 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,14 @@ # stackhpc.openhpc -This Ansible role installs packages and performs configuration to provide an OpenHPC Slurm cluster. +This Ansible role installs packages and performs configuration to provide an OpenHPC v2.x Slurm cluster. As a role it must be used from a playbook, for which a simple example is given below. This approach means it is totally modular with no assumptions about available networks or any cluster features except for some hostname conventions. Any desired cluster fileystem or other required functionality may be freely integrated using additional Ansible roles or other approaches. -The minimal image for nodes is a CentOS 7 or RockyLinux 8 GenericCloud image. These use OpenHPC v1 and v2 respectively. Centos8/OpenHPCv2 is generally preferred as it provides additional functionality for Slurm, compilers, MPI and transport libraries. +The minimal image for nodes is a RockyLinux 8 GenericCloud image. ## Role Variables -`openhpc_version`: Optional. OpenHPC version to install. Defaults provide `1.3` for Centos 7 and `2` for RockyLinux/CentOS 8. - `openhpc_extra_repos`: Optional list. Extra Yum repository definitions to configure, following the format of the Ansible [yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module. Respected keys for each list element: @@ -42,7 +40,7 @@ each list element: `openhpc_slurmdbd_host`: Optional. Where to deploy slurmdbd if are using this role to deploy slurmdbd, otherwise where an existing slurmdbd is running. This should be the name of a host in your inventory. Set this to `none` to prevent the role from managing slurmdbd. Defaults to `openhpc_slurm_control_host`. -`openhpc_slurm_configless`: Optional, default false. If true then slurm's ["configless" mode](https://slurm.schedmd.com/configless_slurm.html) is used. **NB: Requires Centos8/OpenHPC v2.** +`openhpc_slurm_configless`: Optional, default false. If true then slurm's ["configless" mode](https://slurm.schedmd.com/configless_slurm.html) is used. `openhpc_munge_key`: Optional. Define a munge key to use. If not provided then one is generated but the `openhpc_slurm_control_host` must be in the play. 
diff --git a/defaults/main.yml b/defaults/main.yml index c345eda..f799801 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -1,5 +1,5 @@ --- -openhpc_version: "{{ '1.3' if ansible_distribution_major_version == '7' else '2' }}" +openhpc_version: '1.3' openhpc_slurm_service_enabled: true openhpc_slurm_service_started: "{{ openhpc_slurm_service_enabled }}" openhpc_slurm_service: @@ -50,19 +50,6 @@ openhpc_enable: openhpc_extra_repos: [] ohpc_openhpc_repos: - "7": - - name: OpenHPC - file: OpenHPC - description: "OpenHPC-1.3 - Base" - baseurl: "http://build.openhpc.community/OpenHPC:/1.3/CentOS_7" - gpgcheck: true - gpgkey: https://raw.githubusercontent.com/openhpc/ohpc/v1.3.5.GA/components/admin/ohpc-release/SOURCES/RPM-GPG-KEY-OpenHPC-1 - - name: OpenHPC-updates - file: OpenHPC - description: "OpenHPC-1.3 - Updates" - baseurl: "http://build.openhpc.community/OpenHPC:/1.3/updates/CentOS_7" - gpgcheck: true - gpgkey: https://raw.githubusercontent.com/openhpc/ohpc/v1.3.5.GA/components/admin/ohpc-release/SOURCES/RPM-GPG-KEY-OpenHPC-1 "8": - name: OpenHPC file: OpenHPC @@ -78,13 +65,6 @@ ohpc_openhpc_repos: gpgkey: https://raw.githubusercontent.com/openhpc/ohpc/v2.6.1.GA/components/admin/ohpc-release/SOURCES/RPM-GPG-KEY-OpenHPC-2 ohpc_default_extra_repos: - "7": - - name: epel - file: epel - description: "Extra Packages for Enterprise Linux 7 - $basearch" - metalink: "https://mirrors.fedoraproject.org/metalink?repo=epel-7&arch=$basearch&infra=$infra&content=$contentdir" - gpgcheck: true - gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-7" "8": - name: epel file: epel diff --git a/molecule/README.md b/molecule/README.md index 03494e3..fe95788 100644 --- a/molecule/README.md +++ b/molecule/README.md @@ -42,8 +42,7 @@ Local installation on a RockyLinux 8.x machine looks like: Then to run tests, e.g.:: cd ansible-role-openhpc/ - MOLECULE_IMAGE=centos:7 molecule test --all # NB some won't work as require OpenHPC v2.x (-> CentOS 8.x) features - see `.github/workflows/ci.yml` - MOLECULE_IMAGE=rockylinux:8.6 molecule test --all + MOLECULE_IMAGE=rockylinux:8.8 molecule test --all During development you may want to: diff --git a/tasks/install.yml b/tasks/install.yml index 715ac22..c993673 100644 --- a/tasks/install.yml +++ b/tasks/install.yml @@ -16,25 +16,13 @@ loop_control: label: "{{ item.name }}" -- name: Include variables for OpenHPC version - include_vars: - file: "ohpc-{{ openhpc_version }}" - -- name: Find PowerTools repo - find: - paths: /etc/yum.repos.d - patterns: '*-*PowerTools.repo' - register: powertools - when: ansible_distribution_major_version == "8" - - name: Enable PowerTools repo # NB: doesn't run command `dnf config-manager --set-enabled PowerTools` as can't make that idempotent lineinfile: - path: "{{ powertools.files[0].path }}" # 8.2: /etc/yum.repos.d/CentOS-PowerTools.repo 8.3: /etc/yum.repos.d/CentOS-Linux-PowerTools.repo + path: /etc/yum.repos.d/Rocky-PowerTools.repo create: false # raises error if not already installed regexp: enabled= line: enabled=1 - when: ansible_distribution_major_version == "8" - name: Build host-specific list of required slurm packages set_fact: diff --git a/tasks/runtime.yml b/tasks/runtime.yml index a351941..5c925da 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -10,11 +10,6 @@ - openhpc_cluster_name != '' - openhpc_slurm_partitions is defined fail_msg: "Undefined openhpc_slurm_control_host, openhpc_cluster_name or openhpc_slurm_partitions." 
- -- name: Fail if configless mode selected when not on Centos 8 - fail: - msg: "openhpc_slurm_configless = True requires Centos8 / OpenHPC v2" - when: openhpc_slurm_configless and not ansible_distribution_major_version == "8" - name: Fail if control host not in play and munge key not specified fail: @@ -202,4 +197,4 @@ name: slurmd enabled: "{{ openhpc_slurm_service_enabled | bool }}" state: "{{ 'started' if openhpc_slurm_service_started | bool else 'stopped' }}" - when: openhpc_enable.batch | default(false) | bool \ No newline at end of file + when: openhpc_enable.batch | default(false) | bool diff --git a/vars/ohpc-2 b/vars/main.yml similarity index 100% rename from vars/ohpc-2 rename to vars/main.yml diff --git a/vars/ohpc-1.3 b/vars/ohpc-1.3 deleted file mode 100644 index b099e4b..0000000 --- a/vars/ohpc-1.3 +++ /dev/null @@ -1,20 +0,0 @@ ---- -# OpenHPC 1.3 on CentOS 7 - -ohpc_slurm_packages: - control: - - "@ohpc-slurm-server" - - "slurm-slurmctld-ohpc" - - "slurm-example-configs-ohpc" - batch: - - "@ohpc-base-compute" - - "@ohpc-slurm-client" - runtime: - - "slurm-ohpc" - - "munge-ohpc" - - "slurm-slurmd-ohpc" - - "slurm-example-configs-ohpc" - - "{{ 'lmod-ohpc' if openhpc_module_system_install else '' }}" - database: - - "slurm-slurmdbd-ohpc" -... From 985dd3da2d0c23fdbef3532c2f5f15965c8fd634 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 15:22:46 +0000 Subject: [PATCH 16/40] remove post-configure, not needed as of slurm v20.02 --- tasks/main.yml | 8 -------- tasks/post-configure.yml | 14 -------------- 2 files changed, 22 deletions(-) delete mode 100644 tasks/post-configure.yml diff --git a/tasks/main.yml b/tasks/main.yml index 721cb81..1ec95d0 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -12,12 +12,4 @@ when: openhpc_enable.runtime | default(false) | bool tags: configure -- name: Run post-configure tasks - include_tasks: post-configure.yml - when: - - openhpc_enable.runtime | default(false) | bool - # Requires operational slurm cluster - - openhpc_slurm_service_started | bool - tags: post-configure - ... diff --git a/tasks/post-configure.yml b/tasks/post-configure.yml deleted file mode 100644 index 06842db..0000000 --- a/tasks/post-configure.yml +++ /dev/null @@ -1,14 +0,0 @@ ---- -# NOTE: Slurm cluster assummed to be operational - -- name: Ensure the cluster exists in the accounting database - sacct_cluster: - name: "{{ openhpc_cluster_name }}" - state: present - run_once: true - when: - - openhpc_slurm_accounting_storage_type == 'accounting_storage/slurmdbd' - # We need to restart to create the database table - notify: - - Restart slurmdbd service - - Restart slurmctld service From bb0ad774dff8d051a77a188d95cdc0ce29cb07a4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 21:08:05 +0000 Subject: [PATCH 17/40] add openmpi/IMB-MPI1 by default for generic install --- defaults/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/defaults/main.yml b/defaults/main.yml index 78c157c..814bc80 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -51,6 +51,7 @@ openhpc_generic_packages: - munge - mariadb-connector-c # only required on slurmdbd - hwloc-libs # only required on slurmd + - mpitests-openmpi # allows testing MPI using /usr/lib64/openmpi/bin openhpc_sbin_dir: /usr/sbin # path to slurm daemon binaries (e.g. 
slurmctld) openhpc_bin_dir: /usr/bin # path to slurm user binaries (e.g sinfo) openhpc_library_dir: /usr/lib64/slurm # path to slurm libraries @@ -107,7 +108,7 @@ ohpc_repos: "{{ ohpc_openhpc_repos[ansible_distribution_major_version] + ohpc_de openhpc_munge_key: openhpc_login_only_nodes: '' -openhpc_module_system_install: true +openhpc_module_system_install: true # only works for install-ohpc.yml/main.yml # Auto detection openhpc_ram_multiplier: 0.95 From caebc4f2c7bf0dbdedfeeb4bf801c39d40e2dda5 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Sep 2023 21:08:29 +0000 Subject: [PATCH 18/40] allow removal of slurm.conf options --- tasks/runtime.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/runtime.yml b/tasks/runtime.yml index a351941..75f053a 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -103,6 +103,7 @@ section: '' value: "{{ (item.value | join(',')) if (item.value is sequence and item.value is not string) else item.value }}" no_extra_spaces: true + state: "{{ 'absent' if item.value == '' else 'present' }}" create: no mode: 0644 loop: "{{ openhpc_config | dict2items }}" From 7e7108729fcac8a4f32b11eaca204845201e047c Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 07:54:59 +0000 Subject: [PATCH 19/40] update README --- README.md | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index dbcfc7c..fa138b6 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,27 @@ # stackhpc.openhpc -This Ansible role installs packages and performs configuration to provide an OpenHPC Slurm cluster. +This Ansible role installs packages and performs configuration to provide a Slurm cluster. By default this uses packages from [OpenHPC](https://openhpc.community/) but it is also possible to use alternative Slurm binaries and packages. As a role it must be used from a playbook, for which a simple example is given below. This approach means it is totally modular with no assumptions about available networks or any cluster features except for some hostname conventions. Any desired cluster fileystem or other required functionality may be freely integrated using additional Ansible roles or other approaches. The minimal image for nodes is a CentOS 7 or RockyLinux 8 GenericCloud image. These use OpenHPC v1 and v2 respectively. Centos8/OpenHPCv2 is generally preferred as it provides additional functionality for Slurm, compilers, MPI and transport libraries. +## Task files +This role provides four task files which can be selected by using the `tasks_from` parameter of Ansible's `import_role` or `include_role` modules: +- `main.yml`: Runs `install-ohpc.yml` and `runtime.yml`. Default if no `tasks_from` parameter is used. +- `install-ohpc.yml`: Installs repos and packages for OpenHPC. +- `install-generic.yml`: Installs systemd units etc. for user-provided binaries. +- `runtime.yml`: Slurm/service configuration. + ## Role Variables -`openhpc_version`: Optional. OpenHPC version to install. Defaults provide `1.3` for Centos 7 and `2` for RockyLinux/CentOS 8. +Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task files are marked as such below. + +`openhpc_version`: Optional. OpenHPC version to install. Defaults provide `1.3` for Centos 7 and `2` for RockyLinux/CentOS 8 (`install-ohpc.yml` only). `openhpc_extra_repos`: Optional list. 
Extra Yum repository definitions to configure, following the format of the Ansible -[yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module. Respected keys for +[yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module (`install-ohpc.yml` only). Respected keys for each list element: * `name`: Required * `description`: Optional @@ -32,7 +41,7 @@ each list element: `openhpc_slurm_control_host_address`: Optional string. IP address or name to use for the `openhpc_slurm_control_host`, e.g. to use a different interface than is resolved from `openhpc_slurm_control_host`. -`openhpc_packages`: additional OpenHPC packages to install. +`openhpc_packages`: additional OpenHPC packages to install (`install-ohpc.yml` only). `openhpc_enable`: * `control`: whether to enable control host @@ -48,7 +57,15 @@ each list element: `openhpc_login_only_nodes`: Optional. If using "configless" mode specify the name of an ansible group containing nodes which are login-only nodes (i.e. not also control nodes), if required. These nodes will run `slurmd` to contact the control node for config. -`openhpc_module_system_install`: Optional, default true. Whether or not to install an environment module system. If true, lmod will be installed. If false, You can either supply your own module system or go without one. +`openhpc_module_system_install`: Optional, default true. Whether or not to install an environment module system. If true, lmod will be installed. If false, You can either supply your own module system or go without one (`install-ohpc.yml` only). + +`openhpc_generic_packages`: Optional. List of system packages to install, see `defaults/main.yml` for details (`install-generic.yml` only). + +`openhpc_sbin_dir`: Optional. Path to slurm daemon binaries such as `slurmctld`, default `/usr/sbin` (`install-generic.yml` only). + +`openhpc_bin_dir`: Optional. Path to Slurm user binaries such as `sinfo`, default `/usr/bin` (`install-generic.yml` only). + +`openhpc_library_dir`: Optional. Path to Slurm libraries, default `/usr/lib64/slurm` (`install-generic.yml` only). ### slurm.conf From 336ba637d6a9cca7cccf629d58703bf3631dc060 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 08:01:56 +0000 Subject: [PATCH 20/40] enable openhpc_extra_repos for both generic and ohpc installs --- README.md | 11 +---------- defaults/main.yml | 6 ------ tasks/install-generic.yml | 6 ++++++ tasks/install-ohpc.yml | 18 ++++++++---------- 4 files changed, 15 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 0b938f9..b22d60f 100644 --- a/README.md +++ b/README.md @@ -20,16 +20,7 @@ This role provides four task files which can be selected by using the `tasks_fro Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task files are marked as such below. `openhpc_extra_repos`: Optional list. Extra Yum repository definitions to configure, following the format of the Ansible -[yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module (`install-ohpc.yml` only). Respected keys for -each list element: -* `name`: Required -* `description`: Optional -* `file`: Required -* `baseurl`: Optional -* `metalink`: Optional -* `mirrorlist`: Optional -* `gpgcheck`: Optional -* `gpgkey`: Optional +[yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module. 
`openhpc_slurm_service_enabled`: boolean, whether to enable the appropriate slurm service (slurmd/slurmctld). diff --git a/defaults/main.yml b/defaults/main.yml index c9c2c66..5995693 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -73,9 +73,6 @@ ohpc_openhpc_repos: baseurl: "http://repos.openhpc.community/OpenHPC/2/updates/CentOS_8" gpgcheck: true gpgkey: https://raw.githubusercontent.com/openhpc/ohpc/v2.6.1.GA/components/admin/ohpc-release/SOURCES/RPM-GPG-KEY-OpenHPC-2 - -ohpc_default_extra_repos: - "8": - name: epel file: epel description: "Extra Packages for Enterprise Linux 8 - $basearch" @@ -83,9 +80,6 @@ ohpc_default_extra_repos: gpgcheck: true gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" -# Concatenate all repo definitions here -ohpc_repos: "{{ ohpc_openhpc_repos[ansible_distribution_major_version] + ohpc_default_extra_repos[ansible_distribution_major_version] + openhpc_extra_repos }}" - openhpc_munge_key: openhpc_login_only_nodes: '' openhpc_module_system_install: true # only works for install-ohpc.yml/main.yml diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml index 964791f..fcada6d 100644 --- a/tasks/install-generic.yml +++ b/tasks/install-generic.yml @@ -9,6 +9,12 @@ slurmd: "{{ openhpc_enable.batch }}" slurmdbd: "{{ openhpc_enable.database }}" +- name: Ensure extra repos + ansible.builtin.yum_repository: "{{ item }}" + loop: "{{ openhpc_extra_repos }}" + loop_control: + label: "{{ item.name }}" + - name: Install system packages dnf: name: "{{ openhpc_generic_packages }}" diff --git a/tasks/install-ohpc.yml b/tasks/install-ohpc.yml index c993673..d5540fe 100644 --- a/tasks/install-ohpc.yml +++ b/tasks/install-ohpc.yml @@ -3,16 +3,14 @@ - include_tasks: pre.yml - name: Ensure OpenHPC repos - ansible.builtin.yum_repository: - name: "{{ item.name }}" - description: "{{ item.description | default(omit) }}" - file: "{{ item.file }}" - baseurl: "{{ item.baseurl | default(omit) }}" - metalink: "{{ item.metalink | default(omit) }}" - mirrorlist: "{{ item.mirrorlist | default(omit) }}" - gpgcheck: "{{ item.gpgcheck | default(omit) }}" - gpgkey: "{{ item.gpgkey | default(omit) }}" - loop: "{{ ohpc_repos }}" + ansible.builtin.yum_repository: "{{ item }}" + loop: "{{ ohpc_openhpc_repos[ansible_distribution_major_version] }}" + loop_control: + label: "{{ item.name }}" + +- name: Ensure extra repos + ansible.builtin.yum_repository: "{{ item }}" + loop: "{{ openhpc_extra_repos }}" loop_control: label: "{{ item.name }}" From 050e4497962c8bc41b498d09a0f57ca8cc484fec Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 08:03:14 +0000 Subject: [PATCH 21/40] README tweak --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b22d60f..a7af4a9 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil `openhpc_extra_repos`: Optional list. Extra Yum repository definitions to configure, following the format of the Ansible [yum_repository](https://docs.ansible.com/ansible/2.9/modules/yum_repository_module.html) module. -`openhpc_slurm_service_enabled`: boolean, whether to enable the appropriate slurm service (slurmd/slurmctld). +`openhpc_slurm_service_enabled`: Optional boolean, whether to enable the appropriate slurm service (slurmd/slurmctld). Default `true`. `openhpc_slurm_service_started`: Optional boolean. Whether to start slurm services. If set to false, all services will be stopped. 
Defaults to `openhpc_slurm_service_enabled`. From b096101b76a1af004f13e354a42102e9a1122cc6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 08:22:09 +0000 Subject: [PATCH 22/40] add openhpc_config_files parameter --- README.md | 4 ++++ defaults/main.yml | 9 ++++++++- tasks/runtime.yml | 22 ++++++++-------------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a7af4a9..68ab22e 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,10 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil `openhpc_library_dir`: Optional. Path to Slurm libraries, default `/usr/lib64/slurm` (`install-generic.yml` only). +`openhpc_config_files`: Optional. List of additional Slurm configuration files to template. Default templates `gres.conf` to control node. List elements are dicts which must contain: + - `template`: A dict with parameters for Ansible's [template](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/template_module.html) module. + - `enable`: String `control`, `batch`, `database` or `runtime` specifying nodes to template this file on (i.e. matches keys from `openhpc_enable`). Any other string results in no templating. + ### slurm.conf `openhpc_slurm_partitions`: Optional. List of one or more slurm partitions, default `[]`. Each partition may contain the following values: diff --git a/defaults/main.yml b/defaults/main.yml index 5995693..0b3886c 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -15,8 +15,15 @@ openhpc_job_maxtime: '60-0' # quote this to avoid ansible converting some format openhpc_config: "{{ openhpc_extra_config | default({}) }}" openhpc_gres_template: gres.conf.j2 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}" - openhpc_state_save_location: /var/spool/slurm +openhpc_config_files: + - template: + dest: /etc/slurm/gres.conf + src: "{{ openhpc_gres_template }}" + mode: "0600" + owner: slurm + group: slurm + enable: control # Accounting openhpc_slurm_accounting_storage_host: "{{ openhpc_slurmdbd_host }}" diff --git a/tasks/runtime.yml b/tasks/runtime.yml index ce07bc1..a23bdb6 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -115,22 +115,16 @@ group: root mode: 0644 when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - notify: - - Restart slurmctld service + notify: Restart slurmctld service register: ohpc_slurm_conf # NB uses restart rather than reload as number of nodes might have changed -- name: Create gres.conf - template: - src: "{{ openhpc_gres_template }}" - dest: /etc/slurm/gres.conf - mode: "0600" - owner: slurm - group: slurm - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - notify: - - Restart slurmctld service - register: ohpc_gres_conf +- name: Template other Slurm configuration files + template: "{{ item.template }}" + loop: "{{ openhpc_config_files }}" + when: "openhpc_enable[item.enable] | default(false) | bool" + notify: Restart slurmctld service + register: ohpc_other_conf # NB uses restart rather than reload as this is needed in some cases - name: Remove local tempfile for slurm.conf templating @@ -148,7 +142,7 @@ changed_when: true when: - openhpc_slurm_control_host in ansible_play_hosts - - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler + - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or 
hostvars[openhpc_slurm_control_host].ohpc_other_conf.changed # noqa no-handler notify: - Restart slurmd service From d0d7dbfdb9abbfcf26e82bfa1a17bd3c0e53608b Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 09:27:36 +0000 Subject: [PATCH 23/40] change library_dir to lib_dir --- README.md | 2 +- defaults/main.yml | 2 +- tasks/install-generic.yml | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 68ab22e..f33f899 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil `openhpc_bin_dir`: Optional. Path to Slurm user binaries such as `sinfo`, default `/usr/bin` (`install-generic.yml` only). -`openhpc_library_dir`: Optional. Path to Slurm libraries, default `/usr/lib64/slurm` (`install-generic.yml` only). +`openhpc_lib_dir`: Optional. Path to Slurm libraries, default `/usr/lib64/slurm` (`install-generic.yml` only). `openhpc_config_files`: Optional. List of additional Slurm configuration files to template. Default templates `gres.conf` to control node. List elements are dicts which must contain: - `template`: A dict with parameters for Ansible's [template](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/template_module.html) module. diff --git a/defaults/main.yml b/defaults/main.yml index 0b3886c..c749657 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -61,7 +61,7 @@ openhpc_generic_packages: - mpitests-openmpi # allows testing MPI using /usr/lib64/openmpi/bin openhpc_sbin_dir: /usr/sbin # path to slurm daemon binaries (e.g. slurmctld) openhpc_bin_dir: /usr/bin # path to slurm user binaries (e.g sinfo) -openhpc_library_dir: /usr/lib64/slurm # path to slurm libraries +openhpc_lib_dir: /usr/lib64/slurm # path to slurm libraries # Repository configuration openhpc_extra_repos: [] diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml index fcada6d..bb57470 100644 --- a/tasks/install-generic.yml +++ b/tasks/install-generic.yml @@ -42,11 +42,11 @@ - name: Add library locations to ldd search path copy: dest: /etc/ld.so.conf.d/slurm.conf - content: "{{ openhpc_library_dir }}" + content: "{{ openhpc_lib_dir }}" owner: root group: root mode: ugo=r - when: openhpc_library_dir not in _ldd_paths + when: openhpc_lib_dir not in _ldd_paths vars: _ldd_paths: "{{ _slurm_ldconfig.stdout_lines | map('split', ':') | map('first') }}" From 10cb71a36fdbb6214b53895b09996542aa078ab4 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 10:37:20 +0000 Subject: [PATCH 24/40] fix perms --- tasks/install-generic.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml index bb57470..43b5055 100644 --- a/tasks/install-generic.yml +++ b/tasks/install-generic.yml @@ -45,7 +45,7 @@ content: "{{ openhpc_lib_dir }}" owner: root group: root - mode: ugo=r + mode: ug=rw,o=r when: openhpc_lib_dir not in _ldd_paths vars: _ldd_paths: "{{ _slurm_ldconfig.stdout_lines | map('split', ':') | map('first') }}" From cb6edfcfc22d9fb09b4414bf47742ee5e378ac0d Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 11:18:13 +0000 Subject: [PATCH 25/40] fix/silence linter warnings --- tasks/install-generic.yml | 10 +++++++--- tasks/install-ohpc.yml | 4 ++-- tasks/runtime.yml | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml index 43b5055..d76ceac 100644 --- a/tasks/install-generic.yml +++ 
b/tasks/install-generic.yml @@ -10,7 +10,7 @@ slurmdbd: "{{ openhpc_enable.database }}" - name: Ensure extra repos - ansible.builtin.yum_repository: "{{ item }}" + ansible.builtin.yum_repository: "{{ item }}" # noqa: args[module] loop: "{{ openhpc_extra_repos }}" loop_control: label: "{{ item.name }}" @@ -30,12 +30,15 @@ template: src: "{{ item }}.service.j2" dest: /lib/systemd/system/{{ item }}.service + owner: root + group: root + mode: ug=rw,o=r loop: "{{ _ohpc_daemons }}" register: _slurm_systemd_units - name: Get current library locations shell: - cmd: "ldconfig -v | grep -v ^$'\t'" + cmd: "ldconfig -v | grep -v ^$'\t'" # noqa: no-tabs risky-shell-pipe register: _slurm_ldconfig changed_when: false @@ -51,7 +54,8 @@ _ldd_paths: "{{ _slurm_ldconfig.stdout_lines | map('split', ':') | map('first') }}" - name: Reload Slurm unit files - command: systemctl daemon-reload + # Can't do just this from systemd module + command: systemctl daemon-reload # noqa: command-instead-of-module no-changed-when no-handler when: _slurm_systemd_units.changed - name: Add slurm user binaries to PATH diff --git a/tasks/install-ohpc.yml b/tasks/install-ohpc.yml index d5540fe..5f48cd8 100644 --- a/tasks/install-ohpc.yml +++ b/tasks/install-ohpc.yml @@ -3,13 +3,13 @@ - include_tasks: pre.yml - name: Ensure OpenHPC repos - ansible.builtin.yum_repository: "{{ item }}" + ansible.builtin.yum_repository: "{{ item }}" # noqa: args[module] loop: "{{ ohpc_openhpc_repos[ansible_distribution_major_version] }}" loop_control: label: "{{ item.name }}" - name: Ensure extra repos - ansible.builtin.yum_repository: "{{ item }}" + ansible.builtin.yum_repository: "{{ item }}" # noqa: args[module] loop: "{{ openhpc_extra_repos }}" loop_control: label: "{{ item.name }}" diff --git a/tasks/runtime.yml b/tasks/runtime.yml index a23bdb6..571424b 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -120,7 +120,7 @@ # NB uses restart rather than reload as number of nodes might have changed - name: Template other Slurm configuration files - template: "{{ item.template }}" + template: "{{ item.template }}" # noqa: risky-file-permissions loop: "{{ openhpc_config_files }}" when: "openhpc_enable[item.enable] | default(false) | bool" notify: Restart slurmctld service From 087141469c361ddbd6275fb1e604b7530a5c80c7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 20 Sep 2023 14:33:24 +0000 Subject: [PATCH 26/40] remove packages only required for hpctests --- defaults/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/defaults/main.yml b/defaults/main.yml index 4e79367..105da0d 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -53,11 +53,10 @@ openhpc_enable: runtime: false # Only used for install-generic.yml: -openhpc_generic_packages: +openhpc_generic_packages: - munge - mariadb-connector-c # only required on slurmdbd - hwloc-libs # only required on slurmd - - mpitests-openmpi # allows testing MPI using /usr/lib64/openmpi/bin openhpc_sbin_dir: /usr/sbin # path to slurm daemon binaries (e.g. 
slurmctld) openhpc_bin_dir: /usr/bin # path to slurm user binaries (e.g sinfo) openhpc_lib_dir: /usr/lib64/slurm # path to slurm libraries From 58526d5da32750ea7826435af3bed3c330af6d7e Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 22 Sep 2023 10:08:05 +0000 Subject: [PATCH 27/40] document openhpc_config_files restart behaviour --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f33f899..86b871c 100644 --- a/README.md +++ b/README.md @@ -56,7 +56,7 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil `openhpc_lib_dir`: Optional. Path to Slurm libraries, default `/usr/lib64/slurm` (`install-generic.yml` only). -`openhpc_config_files`: Optional. List of additional Slurm configuration files to template. Default templates `gres.conf` to control node. List elements are dicts which must contain: +`openhpc_config_files`: Optional. List of additional Slurm configuration files to template. Changes to any templated files will restart `slurmctld` and `slurmd`s. The default templates `gres.conf` on the control node. List elements are dicts which must contain: - `template`: A dict with parameters for Ansible's [template](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/template_module.html) module. - `enable`: String `control`, `batch`, `database` or `runtime` specifying nodes to template this file on (i.e. matches keys from `openhpc_enable`). Any other string results in no templating. From 0fcaf69b70191be408d76030972c2fedbcc65f00 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 26 Sep 2023 13:00:54 +0000 Subject: [PATCH 28/40] bugfix missing newline in slurm.conf --- templates/slurm.conf.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2 index 5e6ce33..343abc8 100644 --- a/templates/slurm.conf.j2 +++ b/templates/slurm.conf.j2 @@ -10,6 +10,7 @@ # ClusterName={{ openhpc_cluster_name }} SlurmctldHost={{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %} + #SlurmctldHost= # #DisableRootJobs=NO From 5b9b10606e50ea1cf6bfe4626bf7944b4510d7da Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 26 Sep 2023 13:11:52 +0000 Subject: [PATCH 29/40] make path for slurm.conf configurable --- README.md | 2 ++ defaults/main.yml | 1 + tasks/runtime.yml | 11 +++++++---- templates/slurmctld.service.j2 | 4 ++-- templates/slurmdbd.service.j2 | 2 +- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 86b871c..7ffa312 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,8 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil ### slurm.conf +`openhpc_slurm_conf_path`: Optional. Path to template `slurm.conf` configuration file to. Default `/etc/slurm/slurm.conf` + `openhpc_slurm_partitions`: Optional. List of one or more slurm partitions, default `[]`. Each partition may contain the following values: * `groups`: If there are multiple node groups that make up the partition, a list of group objects can be defined here. 
Otherwise, `groups` can be omitted and the following attributes can be defined in the partition object: diff --git a/defaults/main.yml b/defaults/main.yml index 105da0d..7e68249 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -15,6 +15,7 @@ openhpc_config: "{{ openhpc_extra_config | default({}) }}" openhpc_gres_template: gres.conf.j2 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}" openhpc_state_save_location: /var/spool/slurm +openhpc_slurm_conf_path: /etc/slurm/slurm.conf openhpc_config_files: - template: dest: /etc/slurm/gres.conf diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 571424b..b4d11e9 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -20,12 +20,15 @@ - name: Ensure Slurm directories exists file: - path: "{{ openhpc_state_save_location }}" + path: "{{ item }}" owner: slurm group: slurm - mode: 0755 + mode: ug=wrX,o=rX state: directory - when: inventory_hostname == openhpc_slurm_control_host + loop: + - "{{ openhpc_slurm_conf_path | dirname }}" + - "{{ openhpc_state_save_location }}" + when: openhpc_enable.control | default(false) or not openhpc_slurm_configless - name: Generate a Munge key on control host # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler @@ -110,7 +113,7 @@ - name: Create slurm.conf copy: src: "{{ _slurm_conf_tmpfile.path }}" - dest: /etc/slurm/slurm.conf + dest: "{{ openhpc_slurm_conf_path }}" owner: root group: root mode: 0644 diff --git a/templates/slurmctld.service.j2 b/templates/slurmctld.service.j2 index 6376766..86d73d2 100644 --- a/templates/slurmctld.service.j2 +++ b/templates/slurmctld.service.j2 @@ -2,13 +2,13 @@ Description=Slurm controller daemon After=network-online.target munge.service Wants=network-online.target -ConditionPathExists=/etc/slurm/slurm.conf +ConditionPathExists={{ openhpc_slurm_conf_path }} [Service] Type=simple EnvironmentFile=-/etc/sysconfig/slurmctld EnvironmentFile=-/etc/default/slurmctld -ExecStart={{ openhpc_sbin_dir }}/slurmctld -D -s $SLURMCTLD_OPTIONS +ExecStart={{ openhpc_sbin_dir }}/slurmctld -D -s -f {{ openhpc_slurm_conf_path }} $SLURMCTLD_OPTIONS ExecReload=/bin/kill -HUP $MAINPID LimitNOFILE=65536 TasksMax=infinity diff --git a/templates/slurmdbd.service.j2 b/templates/slurmdbd.service.j2 index 3a2bfec..591f1d5 100644 --- a/templates/slurmdbd.service.j2 +++ b/templates/slurmdbd.service.j2 @@ -2,7 +2,7 @@ Description=Slurm DBD accounting daemon After=network-online.target munge.service mysql.service mysqld.service mariadb.service Wants=network-online.target -ConditionPathExists=/etc/slurm/slurmdbd.conf +ConditionPathExists={{ openhpc_slurm_conf_path | dirname + '/slurmdbd.conf' }} [Service] Type=simple From 95c4df8fef18295debb9c728891bf00d1163bcbc Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 26 Sep 2023 13:18:57 +0000 Subject: [PATCH 30/40] make slurm.conf template src configurable --- README.md | 6 ++++-- defaults/main.yml | 1 + tasks/runtime.yml | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7ffa312..26a07c2 100644 --- a/README.md +++ b/README.md @@ -62,8 +62,6 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil ### slurm.conf -`openhpc_slurm_conf_path`: Optional. Path to template `slurm.conf` configuration file to. Default `/etc/slurm/slurm.conf` - `openhpc_slurm_partitions`: Optional. List of one or more slurm partitions, default `[]`. 
Each partition may contain the following values:
 * `groups`: If there are multiple node groups that make up the partition, a list of group objects can be defined here.
   Otherwise, `groups` can be omitted and the following attributes can be defined in the partition object:
@@ -99,6 +97,10 @@ For each group (if used) or partition any nodes in an ansible inventory group `<
 
 `openhpc_state_save_location`: Optional. Absolute path for Slurm controller state (`slurm.conf` parameter [StateSaveLocation](https://slurm.schedmd.com/slurm.conf.html#OPT_StateSaveLocation))
 
+`openhpc_slurm_conf_template`: Optional. Path of Jinja template for slurm.conf configuration file. Default is `slurm.conf.j2` template in role. **NB:** The required templating is complex; if just setting specific parameters, use `openhpc_config` instead.
+
+`openhpc_slurm_conf_path`: Optional. Path to template `slurm.conf` configuration file to. Default `/etc/slurm/slurm.conf`
+
 #### Accounting
 
 By default, no accounting storage is configured. OpenHPC v1.x and un-updated OpenHPC v2.0 clusters support file-based accounting storage which can be selected by setting the role variable `openhpc_slurm_accounting_storage_type` to `accounting_storage/filetxt`[1](#slurm_ver_footnote). Accounting for OpenHPC v2.1 and updated OpenHPC v2.0 clusters requires the Slurm database daemon, `slurmdbd` (although job completion may be a limited alternative, see [below](#Job-accounting)). To enable accounting:

diff --git a/defaults/main.yml b/defaults/main.yml
index 7e68249..4da0a35 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -16,6 +16,7 @@ openhpc_gres_template: gres.conf.j2
 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}"
 openhpc_state_save_location: /var/spool/slurm
 openhpc_slurm_conf_path: /etc/slurm/slurm.conf
+openhpc_slurm_conf_template: slurm.conf.j2
 openhpc_config_files:
   - template:
       dest: /etc/slurm/gres.conf
diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index b4d11e9..eb97908 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -85,7 +85,7 @@
 - name: Template basic slurm.conf
   template:
-    src: slurm.conf.j2
+    src: "{{ openhpc_slurm_conf_template }}"
     dest: "{{ _slurm_conf_tmpfile.path }}"
     lstrip_blocks: true
     mode: 0644

From 2b8b8c576501d4ceb6420be0cae771e1c905450b Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 27 Sep 2023 09:13:57 +0000
Subject: [PATCH 31/40] symlink slurm user tools so monitoring works

---
 tasks/install-generic.yml | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml
index d76ceac..6cb235e 100644
--- a/tasks/install-generic.yml
+++ b/tasks/install-generic.yml
@@ -58,10 +58,19 @@
   command: systemctl daemon-reload # noqa: command-instead-of-module no-changed-when no-handler
   when: _slurm_systemd_units.changed
 
-- name: Add slurm user binaries to PATH
-  copy:
-    dest: /etc/profile.d/slurm.sh
-    content: PATH=$PATH:{{ openhpc_bin_dir }}
+- name: Find user binaries
+  find:
+    paths: "{{ openhpc_bin_dir }}"
+  register: _ohpc_binaries
+
+- name: Symlink slurm user binaries into $PATH
+  file:
+    src: "{{ item.path }}"
+    state: link
+    dest: "{{ ('/usr/bin', item.path | basename) | path_join }}"
     owner: root
     group: root
-    mode: u=rw,go=r
+    mode: u=rwx,go=rx
+  loop: "{{ _ohpc_binaries.files }}"
+  loop_control:
+    label: "{{ item.path }}"

From edcfb0086d9b10f2a18d7291c880ef6c2af57bc3 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Fri, 6 Oct 2023 15:59:29 +0000
Subject: [PATCH
32/40] fix slurm directories --- README.md | 2 ++ defaults/main.yml | 1 + tasks/runtime.yml | 14 +++++++++----- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 26a07c2..296448e 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,8 @@ For each group (if used) or partition any nodes in an ansible inventory group `< `openhpc_state_save_location`: Optional. Absolute path for Slurm controller state (`slurm.conf` parameter [StateSaveLocation](https://slurm.schedmd.com/slurm.conf.html#OPT_StateSaveLocation)) +`openhpc_slurmd_spool_dir`: Optional. Absolute path for slurmd state (`slurm.conf` parameter [SlurmdSpoolDir](https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmdSpoolDir)) + `openhpc_slurm_conf_template`: Optional. Path of Jinja template for slurm.conf configuration file. Default is `slurm.conf.j2` template in role. **NB:** The required templating is complex, if just setting specific parameters use `openhpc_config` intead. `openhpc_slurm_conf_path`: Optional. Path to template `slurm.conf` configuration file to. Default `/etc/slurm/slurm.conf` diff --git a/defaults/main.yml b/defaults/main.yml index 4da0a35..c33de7b 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -15,6 +15,7 @@ openhpc_config: "{{ openhpc_extra_config | default({}) }}" openhpc_gres_template: gres.conf.j2 openhpc_slurm_configless: "{{ 'enable_configless' in openhpc_config.get('SlurmctldParameters', []) }}" openhpc_state_save_location: /var/spool/slurm +openhpc_slurmd_spool_dir: /var/spool/slurm openhpc_slurm_conf_path: /etc/slurm/slurm.conf openhpc_slurm_conf_template: slurm.conf.j2 openhpc_config_files: diff --git a/tasks/runtime.yml b/tasks/runtime.yml index eb97908..34cbf38 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -20,15 +20,19 @@ - name: Ensure Slurm directories exists file: - path: "{{ item }}" + path: "{{ item.path }}" owner: slurm group: slurm - mode: ug=wrX,o=rX + mode: '0755' state: directory loop: - - "{{ openhpc_slurm_conf_path | dirname }}" - - "{{ openhpc_state_save_location }}" - when: openhpc_enable.control | default(false) or not openhpc_slurm_configless + - path: "{{ openhpc_state_save_location }}" # StateSaveLocation + enable: control + - path: "{{ openhpc_slurm_conf_path | dirname }}" + enable: runtime + - path: "{{ openhpc_slurmd_spool_dir }}" # SlurmdSpoolDir + enable: batch + when: "openhpc_enable[item.enable] | default(false) | bool" - name: Generate a Munge key on control host # NB this is usually a no-op as the package install actually generates a (node-unique) one, so won't usually trigger handler From 1f14dbd354c02724af541f3f06b988ad9f3e9b40 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 10 Oct 2023 09:47:53 +0000 Subject: [PATCH 33/40] fix slurmdbd path for non-default slurm.conf paths --- tasks/runtime.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/runtime.yml b/tasks/runtime.yml index 34cbf38..ff1e104 100644 --- a/tasks/runtime.yml +++ b/tasks/runtime.yml @@ -72,7 +72,7 @@ - name: Template slurmdbd.conf template: src: slurmdbd.conf.j2 - dest: /etc/slurm/slurmdbd.conf + dest: "{{ openhpc_slurm_conf_path | dirname }}/slurmdbd.conf" mode: "0600" owner: slurm group: slurm From a5d106ff0deeddf97649581a92399c67f4781e16 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Fri, 16 Feb 2024 15:59:57 +0000 Subject: [PATCH 34/40] default gres.conf to correct directory --- defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/defaults/main.yml b/defaults/main.yml index 
c33de7b..18954c3 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -20,7 +20,7 @@ openhpc_slurm_conf_path: /etc/slurm/slurm.conf
 openhpc_slurm_conf_template: slurm.conf.j2
 openhpc_config_files:
   - template:
-      dest: /etc/slurm/gres.conf
+      dest: "{{ openhpc_slurm_conf_path | dirname }}/gres.conf"
       src: "{{ openhpc_gres_template }}"
       mode: "0600"
       owner: slurm

From 5b73b8a8a3ef3b241ff5ac33b73e8ac3ce89f3a1 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 20 Feb 2024 12:05:41 +0000
Subject: [PATCH 35/40] document for openhpc_config

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 296448e..6b365dc 100644
--- a/README.md
+++ b/README.md
@@ -91,7 +91,7 @@ For each group (if used) or partition any nodes in an ansible inventory group `<
 
 `openhpc_cluster_name`: name of the cluster.
 
-`openhpc_config`: Optional. Mapping of additional parameters and values for `slurm.conf`. Note these will override any included in `templates/slurm.conf.j2`.
+`openhpc_config`: Optional. Mapping of additional parameters and values for `slurm.conf`. Note these will override any included in `templates/slurm.conf.j2`. Setting a parameter's value to an empty string will omit a parameter which is included in the template.
 
 `openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the partition in slurm.conf. Can be overridden on a per partition basis using `openhpc_slurm_partitions.ram_multiplier`. Has no effect if `openhpc_slurm_partitions.ram_mb` is set.

From 69e25acaf1094a0a17b7402e29314d13fb60c04b Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Tue, 27 Feb 2024 11:59:07 +0000
Subject: [PATCH 36/40] minor merge diff fixes

---
 README.md               | 4 ++--
 templates/slurm.conf.j2 | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 6b365dc..19cf812 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 # stackhpc.openhpc
 
-This Ansible role installs packages and performs configuration to provide a Slurm cluster. By default this uses packages from [OpenHPC v2.x](https://openhpc.community/) but it is also possible to use alternative Slurm binaries and packages.
+This Ansible role installs packages and performs configuration to provide a Slurm cluster. By default this uses packages from [OpenHPC](https://openhpc.community/) but it is also possible to use alternative Slurm binaries and packages.
 
 As a role it must be used from a playbook, for which a simple example is given below. This approach means it is totally modular with no assumptions about available networks or any cluster features except for some hostname conventions. Any desired cluster fileystem or other required functionality may be freely integrated using additional Ansible roles or other approaches.
 
@@ -30,7 +30,7 @@ Variables only relevant for `install-ohpc.yml` or `install-generic.yml` task fil
 
 `openhpc_slurm_control_host_address`: Optional string. IP address or name to use for the `openhpc_slurm_control_host`, e.g. to use a different interface than is resolved from `openhpc_slurm_control_host`.
 
-`openhpc_packages`: additional OpenHPC packages to install (`install-ohpc.yml` only).
+`openhpc_packages`: Optional list. Additional OpenHPC packages to install (`install-ohpc.yml` only).
`openhpc_enable`: * `control`: whether to enable control host diff --git a/templates/slurm.conf.j2 b/templates/slurm.conf.j2 index 8645321..ad84610 100644 --- a/templates/slurm.conf.j2 +++ b/templates/slurm.conf.j2 @@ -10,7 +10,6 @@ # ClusterName={{ openhpc_cluster_name }} SlurmctldHost={{ openhpc_slurm_control_host }}{% if openhpc_slurm_control_host_address is defined %}({{ openhpc_slurm_control_host_address }}){% endif %} - #SlurmctldHost= # #DisableRootJobs=NO From 23ddc8237cf1a8e633d3eebdf16e9af29f3c1842 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 27 Feb 2024 12:18:45 +0000 Subject: [PATCH 37/40] Fix EPEL not getting installed --- defaults/main.yml | 3 +++ tasks/install-ohpc.yml | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/defaults/main.yml b/defaults/main.yml index 6531601..c3bc31e 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -112,6 +112,9 @@ ohpc_default_extra_repos: gpgcheck: true gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-8" +# Concatenate extra repo definitions here +ohpc_extra_repos: "{{ ohpc_default_extra_repos[ansible_distribution_major_version] + openhpc_extra_repos }}" + openhpc_munge_key: openhpc_login_only_nodes: '' openhpc_module_system_install: true # only works for install-ohpc.yml/main.yml diff --git a/tasks/install-ohpc.yml b/tasks/install-ohpc.yml index 5993429..3cd56cd 100644 --- a/tasks/install-ohpc.yml +++ b/tasks/install-ohpc.yml @@ -10,7 +10,7 @@ - name: Ensure extra repos ansible.builtin.yum_repository: "{{ item }}" # noqa: args[module] - loop: "{{ openhpc_extra_repos }}" + loop: "{{ ohpc_extra_repos }}" # NB this gets required ones for OpenHPC too loop_control: label: "{{ item.name }}" From 59ee7ccf93e8cf771b8a32a73f2b6ace3cb6d662 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 19 Mar 2024 14:30:55 +0000 Subject: [PATCH 38/40] build RL9.3 container images with systemd --- .github/workflows/ci.yml | 45 +++++++++++++++++++++++++++++++++--- molecule/README.md | 19 +++++++-------- molecule/images/Dockerfile | 2 ++ molecule/test1/molecule.yml | 6 ++--- molecule/test10/molecule.yml | 8 +++---- molecule/test11/molecule.yml | 6 ++--- molecule/test12/molecule.yml | 6 ++--- molecule/test13/molecule.yml | 10 ++++---- molecule/test14/molecule.yml | 6 ++--- molecule/test1b/molecule.yml | 4 ++-- molecule/test1c/molecule.yml | 6 ++--- molecule/test2/molecule.yml | 10 ++++---- molecule/test3/molecule.yml | 10 ++++---- molecule/test4/molecule.yml | 6 ++--- molecule/test5/molecule.yml | 6 ++--- molecule/test6/molecule.yml | 2 +- molecule/test7/molecule.yml | 2 +- molecule/test8/molecule.yml | 10 ++++---- molecule/test9/molecule.yml | 10 ++++---- 19 files changed, 108 insertions(+), 66 deletions(-) create mode 100644 molecule/images/Dockerfile diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7ab02fc..2ab7596 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,33 @@ name: CI - master jobs: + build: + name: Build Rocky9 container image + # Upstream rockylinux:9.3 images don't contain systemd, which means /sbin/init fails. + # A workaround of using "/bin/bash -c 'dnf -y install systemd && /sbin/init'" + # as the container command is flaky. + # This job builds an image using the upstream rockylinux:9.3 image which ensures + # that the image used for the molecule workflow is always updated. + runs-on: ubuntu-22.04 + defaults: + run: + working-directory: molecule/images + steps: + - name: Check out the codebase. 
+ uses: actions/checkout@v4 + + - name: Build image + run: podman build -t rocky93systemd:latest . + + - name: Save image + run: podman save --output rocky93systemd.docker rocky93systemd:latest + + - name: Upload rocky9 image + uses: actions/upload-artifact@v4 + with: + name: rocky93systemd + path: molecule/images/rocky93systemd.docker + molecule: name: Molecule # Workaround: systemd/kernel compatibility issue: @@ -18,13 +45,14 @@ jobs: # See: # - https://bugzilla.redhat.com/show_bug.cgi?id=190144 # - https://github.com/systemd/systemd/pull/16424 - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 + needs: build strategy: fail-fast: false matrix: image: - 'rockylinux:8.9' - - 'rockylinux:9.3' + - 'localhost/rocky93systemd' scenario: - test1 - test1b @@ -42,13 +70,24 @@ jobs: - test12 - test13 - test14 - exclude: [] steps: - name: Check out the codebase. uses: actions/checkout@v4 + - name: Download rocky9 container image + uses: actions/download-artifact@v4 + with: + name: rocky93systemd + path: molecule/images/rocky93systemd.docker + if: matrix.image == 'localhost/rocky93systemd' + + - name: Load rocky9 container image + run: podman load --input rocky93systemd.docker/rocky93systemd.docker + working-directory: molecule/images + if: matrix.image == 'localhost/rocky93systemd' + - name: Set up Python 3. uses: actions/setup-python@v4 with: diff --git a/molecule/README.md b/molecule/README.md index 9fed9a6..16126cb 100644 --- a/molecule/README.md +++ b/molecule/README.md @@ -39,17 +39,18 @@ Local installation on a RockyLinux 8.x machine looks like: pip install -r molecule/requirements.txt ansible-galaxy collection install containers.podman:>=1.10.1 -Then to run tests, e.g.:: +Build a RockyLinux 9.3 image with systemd included: - cd ansible-role-openhpc/ - MOLECULE_IMAGE=rockylinux:8.9 molecule test --all - -During development you may want to: + cd ansible-role-openhpc/molecule/images + podman build -t rocky93systemd:latest . -- See some debugging information by prepending: +Run tests, e.g.: - MOLECULE_NO_LOG="false" ... + cd ansible-role-openhpc/ + MOLECULE_NO_LOG="false" MOLECULE_IMAGE=rockylinux:8.9 molecule test --all -- Prevent destroying insstances using: +where the image may be `rockylinux:8.9` or `localhost/rocky93systemd`. - molecule test --destroy never +Other useful options during development: +- Prevent destroying instances by using `molecule test --destroy never` +- Run only a single test using e.g. 
`molecule test --scenario test5` diff --git a/molecule/images/Dockerfile b/molecule/images/Dockerfile new file mode 100644 index 0000000..ba6f78b --- /dev/null +++ b/molecule/images/Dockerfile @@ -0,0 +1,2 @@ +FROM rockylinux:9.3 +RUN dnf install -y systemd && dnf clean all diff --git a/molecule/test1/molecule.yml b/molecule/test1/molecule.yml index 838d009..29d30ca 100644 --- a/molecule/test1/molecule.yml +++ b/molecule/test1/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -19,7 +19,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -31,7 +31,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test10/molecule.yml b/molecule/test10/molecule.yml index 40f305c..9601f63 100644 --- a/molecule/test10/molecule.yml +++ b/molecule/test10/molecule.yml @@ -8,7 +8,7 @@ platforms: groups: - testohpc_login - initial - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -21,7 +21,7 @@ platforms: groups: - testohpc_compute - initial - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -34,7 +34,7 @@ platforms: groups: - testohpc_compute - initial - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -46,7 +46,7 @@ platforms: pre_build_image: true groups: # NB this is NOT in the "testohpc_compute" so that it isn't added to slurm.conf initially - new - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test11/molecule.yml b/molecule/test11/molecule.yml index e6abf9b..de2f7a6 100644 --- a/molecule/test11/molecule.yml +++ b/molecule/test11/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -21,7 +21,7 @@ platforms: - testohpc_compute - testohpc_compute_orig - testohpc_compute_new - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -34,7 +34,7 @@ platforms: groups: - testohpc_compute - testohpc_compute_orig - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test12/molecule.yml b/molecule/test12/molecule.yml index 838d009..29d30ca 100644 --- a/molecule/test12/molecule.yml +++ b/molecule/test12/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -19,7 +19,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -31,7 +31,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - 
command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test13/molecule.yml b/molecule/test13/molecule.yml index 3faf9e5..a64acf2 100644 --- a/molecule/test13/molecule.yml +++ b/molecule/test13/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_control - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -20,7 +20,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -33,7 +33,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -46,7 +46,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -58,7 +58,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test14/molecule.yml b/molecule/test14/molecule.yml index 838d009..29d30ca 100644 --- a/molecule/test14/molecule.yml +++ b/molecule/test14/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -19,7 +19,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -31,7 +31,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test1b/molecule.yml b/molecule/test1b/molecule.yml index d1cbb4a..369a533 100644 --- a/molecule/test1b/molecule.yml +++ b/molecule/test1b/molecule.yml @@ -9,7 +9,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -21,7 +21,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test1c/molecule.yml b/molecule/test1c/molecule.yml index 1e992a3..f6759fc 100644 --- a/molecule/test1c/molecule.yml +++ b/molecule/test1c/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -19,7 +19,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -31,7 +31,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test2/molecule.yml b/molecule/test2/molecule.yml index 
f0d8820..cef6d49 100644 --- a/molecule/test2/molecule.yml +++ b/molecule/test2/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -20,7 +20,7 @@ platforms: groups: - testohpc_compute - testohpc_part1 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -33,7 +33,7 @@ platforms: groups: - testohpc_compute - testohpc_part1 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -46,7 +46,7 @@ platforms: groups: - testohpc_compute - testohpc_part2 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -59,7 +59,7 @@ platforms: groups: - testohpc_compute - testohpc_part2 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test3/molecule.yml b/molecule/test3/molecule.yml index 42d1037..1e2868c 100644 --- a/molecule/test3/molecule.yml +++ b/molecule/test3/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -20,7 +20,7 @@ platforms: groups: - testohpc_compute - testohpc_grp1 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -33,7 +33,7 @@ platforms: groups: - testohpc_compute - testohpc_grp1 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -46,7 +46,7 @@ platforms: groups: - testohpc_compute - testohpc_grp2 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -59,7 +59,7 @@ platforms: groups: - testohpc_compute - testohpc_grp2 - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test4/molecule.yml b/molecule/test4/molecule.yml index 838d009..29d30ca 100644 --- a/molecule/test4/molecule.yml +++ b/molecule/test4/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -19,7 +19,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -31,7 +31,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test5/molecule.yml b/molecule/test5/molecule.yml index 838d009..29d30ca 100644 --- a/molecule/test5/molecule.yml +++ b/molecule/test5/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -19,7 +19,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # 
not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -31,7 +31,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test6/molecule.yml b/molecule/test6/molecule.yml index d80f580..b7c0c50 100644 --- a/molecule/test6/molecule.yml +++ b/molecule/test6/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test7/molecule.yml b/molecule/test7/molecule.yml index 2539406..5b6d4c6 100644 --- a/molecule/test7/molecule.yml +++ b/molecule/test7/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test8/molecule.yml b/molecule/test8/molecule.yml index 3faf9e5..a64acf2 100644 --- a/molecule/test8/molecule.yml +++ b/molecule/test8/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_control - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -20,7 +20,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -33,7 +33,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -46,7 +46,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -58,7 +58,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp diff --git a/molecule/test9/molecule.yml b/molecule/test9/molecule.yml index 77f04bd..15742c0 100644 --- a/molecule/test9/molecule.yml +++ b/molecule/test9/molecule.yml @@ -7,7 +7,7 @@ platforms: pre_build_image: true groups: - testohpc_control - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -20,7 +20,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -33,7 +33,7 @@ platforms: pre_build_image: true groups: - testohpc_login - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -46,7 +46,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp @@ -58,7 +58,7 @@ platforms: pre_build_image: true groups: - testohpc_compute - command: "/bin/bash -c 'dnf -y install systemd && /sbin/init'" # not in RL9 image + command: /sbin/init tmpfs: - /run - /tmp From 513516c85aff1582fd876d75377f0a212ef511af Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 23 Jul 2024 14:25:46 +0000 Subject: [PATCH 39/40] allow use on image 
containing slurm binaries

---
 tasks/install-generic.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml
index 6cb235e..b9b61c7 100644
--- a/tasks/install-generic.yml
+++ b/tasks/install-generic.yml
@@ -71,6 +71,7 @@
     owner: root
     group: root
     mode: u=rwx,go=rx
+    force: true # files may already exist
   loop: "{{ _ohpc_binaries.files }}"
   loop_control:
     label: "{{ item.path }}"

From a34dace42d212c9543e0911e05e3689879af2024 Mon Sep 17 00:00:00 2001
From: Steve Brasier
Date: Wed, 24 Jul 2024 13:46:36 +0000
Subject: [PATCH 40/40] prepend slurm binaries to PATH instead of symlinking

---
 tasks/install-generic.yml | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/tasks/install-generic.yml b/tasks/install-generic.yml
index b9b61c7..a767797 100644
--- a/tasks/install-generic.yml
+++ b/tasks/install-generic.yml
@@ -58,20 +58,15 @@
   command: systemctl daemon-reload # noqa: command-instead-of-module no-changed-when no-handler
   when: _slurm_systemd_units.changed
 
-- name: Find user binaries
-  find:
-    paths: "{{ openhpc_bin_dir }}"
-  register: _ohpc_binaries
-
-- name: Symlink slurm user binaries into $PATH
-  file:
-    src: "{{ item.path }}"
-    state: link
-    dest: "{{ ('/usr/bin', item.path | basename) | path_join }}"
+- name: Prepend $PATH with slurm user binary location
+  lineinfile:
+    path: /etc/environment
+    line: "{{ new_path }}"
+    regexp: "^{{ new_path | regex_escape }}"
     owner: root
     group: root
-    mode: u=rwx,go=rx
+    mode: u=rw,go=r
+  vars:
+    new_path: PATH="{{ openhpc_bin_dir }}:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin"
+
+- meta: reset_connection # to get new environment
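
Taken together, the later patches in this series mean the role can be driven entirely through `tasks_from`: `install-generic.yml` (or `install-ohpc.yml`) for installation, and `runtime.yml` for Slurm/service configuration. As an illustration only, a playbook using the generic install path might look like the sketch below; the `openhpc_control`/`openhpc_compute` inventory group names and the `/opt/slurm/*` paths are assumptions for this example rather than role defaults:

    ---
    - hosts: openhpc
      become: yes
      vars:
        openhpc_cluster_name: openhpc
        openhpc_slurm_control_host: "{{ groups['openhpc_control'] | first }}"
        openhpc_enable:
          control: "{{ inventory_hostname in groups['openhpc_control'] }}"
          batch: "{{ inventory_hostname in groups['openhpc_compute'] }}"
          runtime: true
        openhpc_slurm_partitions:
          - name: compute
        # Assumed locations for a site-built Slurm already present on the image;
        # the role defaults are /usr/bin, /usr/sbin and /usr/lib64/slurm.
        openhpc_bin_dir: /opt/slurm/bin
        openhpc_sbin_dir: /opt/slurm/sbin
        openhpc_lib_dir: /opt/slurm/lib64/slurm
      tasks:
        - name: Install systemd units etc. for user-provided Slurm binaries
          ansible.builtin.include_role:
            name: stackhpc.openhpc
            tasks_from: install-generic.yml

        - name: Apply Slurm and service configuration
          ansible.builtin.include_role:
            name: stackhpc.openhpc
            tasks_from: runtime.yml
    ...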