-
Notifications
You must be signed in to change notification settings - Fork 0
/
hardware.tex
93 lines (83 loc) · 3.16 KB
/
hardware.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
\section{Hardware}
% NOTE: The easiest way to find the hardware specifics of a device
% is to run the deviceQuery executable (cuda-samples)
% git clone https://github.com/NVIDIA/cuda-samples.git
% cd cuda-samples
% make -j 6
% cd bin/x86_64/linux/release
% ./deviceQuery
% NOTE 2:
% Ampere A100:
% https://images.nvidia.com/aem-dam/en-zz/Solutions/data-center/nvidia-ampere-architecture-whitepaper.pdf
% https://developer.nvidia.com/blog/nvidia-ampere-architecture-in-depth/
% Hopper H100:
% https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/
% Blackwell B100:
% https://resources.nvidia.com/en-us-blackwell-architecture
% Will be succeeded by the Rubin generation (R100/R200)
% https://blogs.nvidia.com/blog/computex-2024-jensen-huang/
\subsection{Streaming multiprocessor (SM)}
\begin{frame}
\frametitle{Streaming Multiprocessor (SM)}
\begin{itemize}
\item GPU device connected to the CPU by a PCIe bus.
\item each GPU device contains an array (\textcolor{red}{$\mathbf{x}$}) of Streaming Multiprocessors \textbf{(SM)}.
\item each SM has:
\begin{itemize}
\item a Single-Instruction Multiple-Thread (\texttt{SIMT}) Architecture.
\item contains \textcolor{red}{$\mathbf{y}$} regular cores and [\textcolor{red}{$\mathbf{z}$} tensor cores].
\end{itemize}
\item scalable: newer generations increase \textcolor{red}{$\mathbf{x}$}, \textcolor{red}{$\mathbf{y}$}
and [\textcolor{red}{$\mathbf{z}$}], e.g.:
\begin{itemize}
\item \texttt{NVIDIA A100-PCIE-40GB} (\textit{notch293})
\begin{itemize}
\item global memory: $40$ GB.
\item $108$ SMs, $64$ Cores/SM, $4$ Tensor Cores/SM.
\item GPU Max. Clock Rate: $1.41$ GHz.
\end{itemize}
\item \texttt{NVIDIA H100 SXM5 NVL} (\textit{grn008})
\begin{itemize}
\item global memory: $93$ GB.
\item $132$ SMs, $128$ Cores/SM, $4$ Tensor Cores/SM.
\item GPU Max. Clock Rate: $1.78$ GHz.
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
% NOTE:
\begin{frame}
\frametitle{\texttt{NVIDIA GH100 SM}}
\begin{columns}
\column{0.50\textwidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.80\textwidth]{./img/H100-Streaming-Multiprocessor-SM-1104x1536.png}
\caption{\small{GH100 SM}}
\end{figure}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{\texttt{NVIDIA GH100 Full Device}}
\begin{columns}
\column{0.90\textwidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.90\textwidth]{./img/Full-H100-GPU-with-144-SMs-1536x686.png}
\caption{\small{NVIDIA GH100 Full Device ($144$ SMs).}}
\end{figure}
\end{columns}
\end{frame}
\subsection{Warps}
\begin{frame}
\frametitle{GPU Threads - Warps}
\begin{itemize}
\item Each SM:
\begin{itemize}
\item generates, schedules, and executes threads in batches of $32$ threads.
\item \textbf{WARP}: a batch of $32$ threads.
\end{itemize}
\item each thread in a WARP executes the same instructions but runs its own ``path''.
\item if threads within a WARP diverge (e.g.\ at a branch), the WARP executes each path in turn; threads not on the current path are inactive/disabled.
\end{itemize}
\end{frame}