# Makefile (forked from karpathy/llm.c)
# Toolchain defaults. The Windows branch further down overrides several of them.
# NOTE(review): GNU Make predefines CC (normally as `cc`), so `?=` may never
# assign clang here unless CC's built-in default is disabled; confirm intent.
CC ?= clang
# Base optimisation and warning-suppression flags for the CPU build.
CFLAGS = -Ofast -Wno-unused-result -Wno-ignored-pragmas -Wno-unknown-attributes
LDFLAGS =
LDLIBS = -lm
INCLUDES =
# Flags that are appended to CFLAGS only if $(CC) accepts them (probed below).
CFLAGS_COND = -march=native
# Deliberately recursive (=): `uname` runs only where SHELL_UNAME is expanded,
# and the Windows branch replaces the value before any such expansion.
SHELL_UNAME = $(shell uname)
REMOVE_FILES = rm -f
# Output arguments. Recursive (=) so that $@ is expanded inside each recipe.
OUTPUT_FILE = -o $@
CUDA_OUTPUT_FILE = -o $@
# NVCC flags
# -t=0 is short for --threads, 0 = number of CPUs on the machine
NVCC_FLAGS = -O3 -t=0 --use_fast_math
NVCC_LDFLAGS = -lcublas -lcublasLt
# Include paths for nvcc; extended by the multi-GPU detection below.
NVCC_INCLUDES =
NVCC_LDLIBS =
ifneq ($(OS), Windows_NT)
# Locate nvcc on PATH; NVCC is left empty when `which` prints nothing.
NVCC := $(shell which nvcc 2>/dev/null)
# Function to test if the compiler accepts a given flag.
# $(1) is the candidate flag. A trivial C program is piped to $(CC) together
# with that flag and the output is discarded; FLAG_SUPPORTED becomes `yes`
# only when the compile succeeds, and only then is the flag appended to CFLAGS.
# The inner $(eval ...) runs while $(call ...) is expanding this body, so
# FLAG_SUPPORTED already holds the fresh result when the `ifeq` line below it
# is expanded.
define check_and_add_flag
$(eval FLAG_SUPPORTED := $(shell printf "int main() { return 0; }\n" | $(CC) $(1) -x c - -o /dev/null 2>/dev/null && echo 'yes'))
ifeq ($(FLAG_SUPPORTED),yes)
CFLAGS += $(1)
endif
endef
# Check each flag and add it if supported
# (one probe compile per word in CFLAGS_COND)
$(foreach flag,$(CFLAGS_COND),$(eval $(call check_and_add_flag,$(flag))))
else
# Windows overrides: MSVC `cl` replaces the Unix toolchain defaults.
# NOTE(review): the /Fd$@.pdb and $@.exe tokens below were reconstructed from
# mangled text; confirm them against upstream llm.c.
REMOVE_FILES = del *.exe,*.obj,*.lib,*.exp,*.pdb && del
SHELL_UNAME := Windows
# `where` prints nothing when nvcc is not on PATH, so compare against a truly
# empty string. Comparing against a literal "" never matches empty output and
# would enable the CUDA targets unconditionally.
ifneq ($(strip $(shell where nvcc 2> nul)),)
  NVCC := nvcc
else
  NVCC :=
endif
CC := cl
CFLAGS = /Idev /Zi /nologo /Wall /WX- /diagnostics:column /sdl /O2 /Oi /Ot /GL /D _DEBUG /D _CONSOLE /D _UNICODE /D UNICODE /Gm- /EHsc /MD /GS /Gy /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /permissive- \
/external:W3 /Gd /TP /wd4996 /Fd$@.pdb /FC /openmp:llvm
# The Unix link settings do not apply to cl.
LDFLAGS :=
LDLIBS :=
INCLUDES :=
NVCC_FLAGS += -I"dev"
# WIN_CI_BUILD=1 skips the extra `copy` step that local builds perform
# between the bare target name and its .exe twin.
ifeq ($(WIN_CI_BUILD),1)
  $(info Windows CI build)
  OUTPUT_FILE = /link /OUT:$@
  CUDA_OUTPUT_FILE = -o $@
else
  $(info Windows local build)
  OUTPUT_FILE = /link /OUT:$@ && copy /Y $@ $@.exe
  CUDA_OUTPUT_FILE = -o $@ && copy /Y $@.exe $@
endif
endif
# OpenMP support (optional, but it makes the CPU code a lot faster).
#   macOS:  brew install libomp
#   Ubuntu: sudo apt-get install libomp-dev
# Later, run the program by prepending the number of threads,
# e.g.: OMP_NUM_THREADS=8 ./gpt2
# Pass NO_OMP=1 to skip detection. Nothing is probed when OS is Windows_NT.
ifeq ($(NO_OMP), 1)
  $(info OpenMP is manually disabled)
else
  ifneq ($(OS), Windows_NT)
    ifeq ($(SHELL_UNAME), Darwin)
      # Homebrew's libomp: take the first prefix whose lib/ directory exists,
      # Apple Silicon location first, then the Intel one.
      # $(wildcard dir/.) matches only an existing directory and forks no shell.
      LIBOMP_PREFIX := $(firstword $(patsubst %/lib/.,%,$(wildcard /opt/homebrew/opt/libomp/lib/. /usr/local/opt/libomp/lib/.)))
      ifneq ($(LIBOMP_PREFIX),)
        CFLAGS += -Xclang -fopenmp -DOMP
        LDFLAGS += -L$(LIBOMP_PREFIX)/lib
        LDLIBS += -lomp
        INCLUDES += -I$(LIBOMP_PREFIX)/include
        $(info OpenMP found, compiling with OpenMP support)
      else
        $(warning OpenMP not found, skipping OpenMP support)
      endif
    else
      # Other systems: enabled when $(CC) exits 0 while preprocessing empty
      # input with -fopenmp.
      ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0)
        CFLAGS += -fopenmp -DOMP
        LDLIBS += -lgomp
        $(info OpenMP found, compiling with OpenMP support)
      else
        $(warning OpenMP not found, skipping OpenMP support)
      endif
    endif
  endif
endif
# Multi-GPU support. Pass NO_MULTI_GPU=1 to skip detection.
# Nothing is probed when OS is Windows_NT, and Darwin is rejected outright.
ifeq ($(NO_MULTI_GPU), 1)
  $(info Multi-GPU (OpenMPI + NCCL) is manually disabled)
else
  ifneq ($(OS), Windows_NT)
    ifeq ($(SHELL_UNAME), Darwin)
      $(warning Multi-GPU on CUDA on Darwin is not supported, skipping OpenMPI + NCCL support)
    else
      # Only the OpenMPI lib/ and include/ directories are checked; both must exist.
      ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists)
        $(info OpenMPI found, adding support)
        NVCC_FLAGS += -DMULTI_GPU
        NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include
        NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/
        NVCC_LDLIBS += -lmpi -lnccl
      else
        $(warning OpenMPI is not found, disabling multi-GPU support)
        $(warning On Linux you can try install OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`)
      endif
    endif
  endif
endif
# Precision selection: BF16 unless overridden, e.g. `make PRECISION=FP32`.
# The resulting PFLAGS define is passed to the nvcc recipes that use $(PFLAGS).
PRECISION ?= BF16
VALID_PRECISIONS := FP32 FP16 BF16
# Abort early on anything outside the supported list.
ifeq ($(filter $(PRECISION),$(VALID_PRECISIONS)),)
  $(error Invalid precision $(PRECISION), valid precisions are $(VALID_PRECISIONS))
endif
# Start from the BF16 define, then swap it out for the two explicit alternatives.
PFLAGS = -DENABLE_BF16
ifeq ($(PRECISION), FP16)
  PFLAGS = -DENABLE_FP16
endif
ifeq ($(PRECISION), FP32)
  PFLAGS = -DENABLE_FP32
endif
# PHONY means these targets will always be executed, even when a file with
# the same name already exists. `clean` is listed too, so that a stray file
# called `clean` can never turn `make clean` into a no-op.
.PHONY: all clean train_gpt2 test_gpt2 train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu profile_gpt2cu
# Targets built by `make all`; the CPU programs are always included.
TARGETS = train_gpt2 test_gpt2
# Conditional inclusion of CUDA targets: only when an nvcc was detected.
ifeq ($(NVCC),)
  $(info nvcc not found, skipping CUDA builds)
else
  $(info nvcc found, including CUDA builds)
  TARGETS += train_gpt2cu test_gpt2cu train_gpt2fp32cu test_gpt2fp32cu
endif
all: $(TARGETS)
# CPU programs: each is compiled and linked from its single C source in one
# step. Libraries follow the source file on the command line.
# Recipe lines must begin with a hard TAB.
train_gpt2: train_gpt2.c
	$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE)

test_gpt2: test_gpt2.c
	$(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $< $(LDLIBS) $(OUTPUT_FILE)
# CUDA programs: each is built from a single .cu source with nvcc.
# $(PFLAGS) carries the precision define and is passed to every target except
# the two *fp32cu ones. $(NVCC_LDFLAGS) is passed once per command.
# Recipe lines must begin with a hard TAB.
train_gpt2cu: train_gpt2.cu
	$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

train_gpt2fp32cu: train_gpt2_fp32.cu
	$(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

test_gpt2cu: test_gpt2.cu
	$(NVCC) $(NVCC_FLAGS) $(PFLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

test_gpt2fp32cu: test_gpt2_fp32.cu
	$(NVCC) $(NVCC_FLAGS) $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)

# Not part of `all`. It receives the same include paths and libraries as the
# other CUDA targets, because NVCC_FLAGS may carry -DMULTI_GPU.
# NOTE(review): presumably profile_gpt2.cu needs the MPI/NCCL headers and
# libraries in that case; confirm against the source.
profile_gpt2cu: profile_gpt2.cu
	$(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $< $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE)
# Remove the binaries listed in $(TARGETS), using the platform's delete command.
# The recipe line must begin with a hard TAB.
clean:
	$(REMOVE_FILES) $(TARGETS)