-
Notifications
You must be signed in to change notification settings - Fork 0
/
rocks_hpc_init.yml
201 lines (168 loc) · 7.83 KB
/
rocks_hpc_init.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# Example playbook (part 1) for deploying a Rocks cluster node. It is meant to
# run just once, after the first reboot from the anaconda installation.
# Rerunning this playbook will fail because CUDA complains that the nvidia
# kernel module already exists (unless it failed to install in the first
# place). Comment out the CUDA roles before rerunning.
# Play: initialise one Rocks cluster node, selected by the `target` variable.
# The play runs over a local connection. pre_tasks gather Rocks attributes and
# probe the hardware, `roles` install drivers/storage support based on those
# probes, and post_tasks append mount/format commands to the boot and
# "nukeit" scripts.
#
# NOTE(review): `vendor` and `model` are used below but not registered in this
# file -- presumably the `common` role provides them; confirm. Likewise
# `local_mdraid_dev_name`, `mdraid_dev` and `jbod_dev` are presumably set by
# the fs_local_mdraid / fs_local_jbod roles; confirm.
- hosts: '{{ target }}'
  vars:
    ansible_connection: local
    ansible_python_interpreter: "{{ansible_playbook_python}}"
    inventory_hostname: '{{ target }}'
  vars_files:
    - group_vars/all
  pre_tasks:
    - include_role:
        name: common

    # --- Rocks attribute lookups -------------------------------------------
    # Each lookup is read-only (changed_when: false) and must not abort the
    # play (ignore_errors: true); the value is consumed via `<var>.stdout`.
    - name: ROCKS getting the cluster name
      shell: /opt/rocks/bin/rocks list host attr {{ inventory_hostname }} | grep "Info_ClusterName" | awk '{print $3}'
      register: rocks_Info_ClusterName
      changed_when: false
      ignore_errors: true
    - name: ROCKS getting the cluster frontend private IP
      shell: /opt/rocks/bin/rocks list host attr {{ inventory_hostname }} | grep "Kickstart_PrivateAddress" | awk '{print $3}'
      register: rocks_kickstart_private_ip
      changed_when: false
      ignore_errors: true
    - name: ROCKS getting the host's appliance type
      shell: /opt/rocks/bin/rocks list host attr {{ inventory_hostname }} | grep "appliance" | awk '{print $3}'
      register: rocks_appliance
      changed_when: false
      ignore_errors: true
    - name: ROCKS getting the host's cluster private IP
      shell: /opt/rocks/bin/rocks list host interface {{ inventory_hostname }} | grep 'private' | awk '{print $4}'
      register: rocks_private_ip
      changed_when: false
      ignore_errors: true
    - name: ROCKS getting disk_raid_lvl config from Rocks attr
      shell: /opt/rocks/bin/rocks list host attr {{ inventory_hostname }} | grep 'disk_raid_lvl' | awk '{print $3}'
      register: disk_raid_lvl
      changed_when: false
      ignore_errors: true

    # --- Hardware detection ------------------------------------------------
    # grep exits non-zero when nothing matches; ignore_errors keeps the play
    # going, and the roles section tests `<var> is success` to decide whether
    # to install the matching driver stack.
    - name: GPGPU detecting any CUDA-Capable GPU
      shell: lspci | grep -i nvidia
      register: nv_gpu_detected
      changed_when: false
      ignore_errors: true
    - name: FPGA detecting Intel A10 PAC FPGA
      # The pattern is quoted so the shell cannot glob-expand the bracket
      # expression against a file name in the working directory.
      shell: lspci | grep -i '09c[45]'
      register: fpga_a10_detected
      changed_when: false
      ignore_errors: true
    - name: MLNX detecting any OFED compatible devices
      shell: lshw -class network | grep -i 'MT27710'
      register: ofed_dev_detected
      changed_when: false
      ignore_errors: true

    # --- Local disk discovery ----------------------------------------------
    # Produces a space-separated list of /dev/<name> entries for every lsblk
    # "disk" line that does not contain "sda".
    - name: LOCAL DISK getting a list of none OS block devices
      shell: lsblk | grep disk | grep -Ev "sda" | awk '{print $1}' | awk '$0="/dev/"$0' | tr '\n' ' '
      register: block_dev
      changed_when: false
      ignore_errors: true
    - name: DEBUG printing disk_raid_lvl value and all non OS block devices
      debug:
        msg: "{{ disk_raid_lvl.stdout }} {{ block_dev.stdout }}"
    - name: LOCAL_DISK recording disk_raid_lvl value to a local file
      copy:
        content: "{{ disk_raid_lvl.stdout }}"
        dest: /root/disk_raid_lvl
    - name: LOCAL DISK filtering block devices according to the blacklist (/root/filtered_disks)
      script: ./init_rocks_cluster/rocks_local_fs_blacklist.sh {{ block_dev.stdout }} > /root/filtered_disks

    # --- Helper scripts ----------------------------------------------------
    # File modes are quoted strings so YAML integer typing cannot alter them.
    - name: LOCAL DISK deploying storage format script (rocks_local_fs_nukeit.sh)
      copy:
        src: ./init_rocks_cluster/rocks_local_fs_nukeit.sh
        dest: /root/rocks_local_fs_nukeit.sh
        owner: root
        group: root
        mode: "0744"
        backup: true
    - name: init.d deploying custom init script (post_boot_script.sh)
      copy:
        src: ./init_rocks_cluster/post_boot_script.sh
        dest: /etc/rc.d/init.d/post_boot_script.sh
        owner: root
        group: root
        mode: "0755"
    # ANSIBLE101 https://stackoverflow.com/questions/24851575/ansible-how-to-pass-multiple-commands
    - name: init.d enabling post_boot_script.sh by chkconfig
      command: "{{ item }}"
      with_items:
        - chkconfig --add post_boot_script.sh
        - chkconfig post_boot_script.sh on

  roles:
    # Dell tooling is applied only when the vendor string is exactly
    # "Dell Inc.".
    - role: hw_dell_dsu
      vars:
        install_dsu: true
        install_omsa: true
        config_ome_snmp_alert: true
        config_om_thrmshutdown: true
      when:
        - vendor.stdout == "Dell Inc."
    - role: nw_mlnx_ofed
      vars:
        install_mlnx_ofed: "{{ 'true' if ofed_dev_detected is success else 'false' }}"
        mlnx_ofed_installer_opts: '--force --with-neohost-backend'
    - role: hw_nvidia_cuda
      vars:
        install_cuda_driver: "{{ 'true' if nv_gpu_detected is success else 'false' }}"  # ANSIBLE101 conditional var assign
    - role: hw_intel_fpga
      vars:
        install_ias: "{{ 'true' if fpga_a10_detected is success else 'false' }}"
    # mdraid is configured when the disk_raid_lvl attribute contains '0' or '1'.
    - role: fs_local_mdraid
      vars:
        config_local_mdraid: "{{ 'true' if disk_raid_lvl.stdout.find('0') != -1 or disk_raid_lvl.stdout.find('1') != -1 else 'false' }}"
        local_mdraid_dev_file: '/root/filtered_disks'
        local_mdraid_level: "{{ disk_raid_lvl.stdout }}"
        #
        # ANSIBLE101 stdout.find() returns -1 when the substring is not found,
        # because the Python str.find method returns -1 when it cannot find the
        # requested substring. Otherwise it returns the index where the
        # substring was found.
        #
        local_mdraid_part_suffix: "{{ 'p1' if model.stdout.find('C6145') == -1 else '1' }}"
        local_mdraid_mk_fs_cmd: "mkfs.xfs -i projid32bit=1 -K -L mkraid_xfs /dev/{{ local_mdraid_dev_name }} -f"
    # HDFS JBOD is configured when the disk_raid_lvl attribute contains 'hdfs'.
    - role: fs_local_jbod
      vars:
        config_local_hdfs: "{{ 'true' if disk_raid_lvl.stdout.find('hdfs') != -1 else 'false' }}"
        local_jbod_dev_file: '/root/filtered_disks'
        local_jbod_hdfs_mnt_prefix: 'd'
        local_jbod_part_suffix: "{{ 'p1' if model.stdout.find('C6145') == -1 else '1' }}"
        local_jbod_ext_label: 'hdfs_ext4'

  post_tasks:
    # `mkdir -p` is used in the generated boot-script blocks so the command
    # does not error when the directory already exists on a later boot.
    - name: LOCAL DISK {{ local_mdraid_dev_name }} adding command for mounting to /scratch_local
      blockinfile:
        path: /etc/rc.d/init.d/post_boot_script.sh
        marker: "# {mark} ANSIBLE MANAGED BLOCK /scratch_local md127"
        block: |
          mkdir -p /scratch_local
          chmod 777 /scratch_local
          mount -o prjquota /dev/{{ local_mdraid_dev_name }} /scratch_local
      when: config_local_mdraid|bool
    # NOTE(review): this task and the HDFS "nukeit" task below both write to
    # /root/rocks_local_fs_nukeit.sh without a `marker`, so they share the
    # default blockinfile marker. If both conditions ever hold on one host the
    # later block would replace the earlier one -- confirm they are exclusive.
    - name: LOCAL DISK {{ local_mdraid_dev_name }} adding formatting command to nukeit script
      blockinfile:
        path: /root/rocks_local_fs_nukeit.sh
        block: |
          bash /root/del_md_raid.sh /scratch_local {{ local_mdraid_dev_name }} {{ local_mdraid_part_suffix }} {{ mdraid_dev.stdout }}
      when: config_local_mdraid|bool
    - name: LOCAL DISK HDFS storage adding mounting command on boot
      blockinfile:
        path: /etc/rc.d/init.d/post_boot_script.sh
        marker: "# {mark} ANSIBLE MANAGED BLOCK /dx HDFS"
        block: |
          bash /root/mnt_hdfs_disk.sh mount {{ local_jbod_hdfs_mnt_prefix }} {{ local_jbod_part_suffix }} {{ jbod_dev.stdout }}
      when: config_local_hdfs|bool
    - name: LOCAL DISK HDFS storage adding formatting command to nukeit script
      blockinfile:
        path: /root/rocks_local_fs_nukeit.sh
        block: |
          bash /root/mnt_hdfs_disk.sh umount {{ local_jbod_hdfs_mnt_prefix }} {{ local_jbod_part_suffix }} {{ jbod_dev.stdout }}
          bash /root/del_hdfs_disk.sh {{ local_jbod_part_suffix }} {{ jbod_dev.stdout }}
      when: config_local_hdfs|bool
    # NOTE(review): the NFS source below contains a double slash ("://opt");
    # it looks like a typo for ":/opt" -- left unchanged, confirm with the
    # frontend's export path.
    - name: MNT /mnt/qmaster_common NFS mount
      blockinfile:
        path: /etc/rc.d/init.d/post_boot_script.sh
        marker: "# {mark} ANSIBLE MANAGED BLOCK /mnt/qmaster_common"
        block: |
          mkdir -p /mnt/qmaster_common
          mount -t nfs {{ rocks_kickstart_private_ip.stdout }}://opt/gridengine/default/common /mnt/qmaster_common
    - name: INFO printing cluster node basic info
      debug:
        msg: "{{ rocks_Info_ClusterName.stdout }} | {{ target }} | {{ rocks_private_ip.stdout }} | {{ rocks_appliance.stdout }} | {{ vendor.stdout }} | {{ model.stdout }}"