diff --git a/lectures/2022_04_11.tex b/lectures/2022_04_11.tex index a84001c..45f3aad 100644 --- a/lectures/2022_04_11.tex +++ b/lectures/2022_04_11.tex @@ -561,7 +561,7 @@ \section{4SID procedure (noise-free)} \label{sec:4SID-NF} \] Suppose that $\rank (H_i) = i \quad \forall i \in \{1, \dots, n\}$ \quad and $\rank (H_{n+1}) = n$.\\ -If this happens, it means that we found the first Hankel matrix which in not full rank and it also means that we have estimated (found) the order of the system. +If this happens, it means that we found the first Hankel matrix which is not full rank and it also means that we have estimated (found) the order of the system. \paragraph{Step 2} Take $H_{n+1}$ (\textbf{recall}: $\rank (H_{n+1}) = n$) and factorize it in two rectangular matrices of size $(n+1) \times n$ and $n \times (n+1)$. @@ -623,7 +623,7 @@ \section{4SID procedure (noise-free)} \label{sec:4SID-NF} \[ \left\{\hat{H}, \hat{G}, \hat{F}\right\} = \left\{O_{n+1}(\texttt{1;:}), R_{n+1}(\texttt{:;1}), (O')^{-1}O'' \right\}\] \begin{rem} - If the measurement is noisy all this process is \textbf{useless}. Also, if $n$ is unknown, \nameref{4SID-NF:step1} could never stop. + In practice measurements are always noisy and since this simple process is very sensitive to noise, it is \textbf{useless}. This is because, using a noisy \gls{ir}, \nameref{4SID-NF:step1} could never stop, and, even if $n$ is known in advance, the estimated model would be badly wrong. 
\end{rem} \section{4SID procedure (with Noise)} \label{sec:4SID-N} diff --git a/lectures/2022_04_12.tex b/lectures/2022_04_12.tex index 519fe7d..a8eda15 100644 --- a/lectures/2022_04_12.tex +++ b/lectures/2022_04_12.tex @@ -207,7 +207,7 @@ where $\hat{H}_{qd}$ is the \emph{signal part} and $H_{res,qd}$ is the \emph{noise (or residual) part} of $\tilde{H}_{qd}$.\\ -\textbf{Note} $\hat{S}$ is now a square diagonal matrix of $\sigma_1, \dots, \sigma_q$.\\ +\textbf{Note} $\hat{S}$ is now a square diagonal matrix of $\sigma_1, \dots, \sigma_n$.\\ \textbf{Note} From $\tilde{H}_{qd}$ to $\hat{H}_{qd}$ the rank is hugely reduced.\\ \textbf{Note} $\hat{H}_{qd}$ is the "cleaned" Hankel matrix. diff --git a/lectures/2022_05_02.tex b/lectures/2022_05_02.tex index ebe9b4b..16344e4 100644 --- a/lectures/2022_05_02.tex +++ b/lectures/2022_05_02.tex @@ -296,7 +296,7 @@ \eta(t) = c\eta(t-1) + e(t) \xRightarrow{z} \eta(t+1) &= c\eta(t) + e(t+1) \\ \eta(t+1) &= c\eta(t) + v(t) \end{align*} - where $v(t) = e(t+1), \quad v \sim \WN(0,1) \text{and} \quad v \perp v_2$. + where $v(t) = e(t+1), \quad v \sim \WN(0,1) \quad \text{and} \quad v \perp v_2$. \paragraph{Trick} Extension of the state vector. diff --git a/lectures/2022_05_09.tex b/lectures/2022_05_09.tex index da0dae5..a8927bf 100644 --- a/lectures/2022_05_09.tex +++ b/lectures/2022_05_09.tex @@ -108,7 +108,7 @@ \section{MVC System} \draw[->] (5.5,0) -- (5.5,-1.5) -- (-1,-1.5) -- (cont); \end{tikzpicture} \end{figure} -where $y^0(t)$ os the desired output value, called \emph{reference}. +where $y^0(t)$ is the desired output value, called \emph{reference}. 
Some additional (small) technical \textbf{assumptions}: \begin{itemize} @@ -118,7 +118,7 @@ \section{MVC System} In a more formal way MVC is an optimization control problem that tries to find $u(t)$ that minimize this performance index: \[ - J = E\left[ (y(t) - y^0(t))^2 \right] + J = \EE \left[ (y(t) - y^0(t))^2 \right] \] where $J$ is the variance of the tracking error: that's why it's called Minimum Variance Control. @@ -158,7 +158,7 @@ \section{MVC System} \end{align} \begin{rem}[$k$-step predictor of an \gls{armax} system] - + Assuming the delay of the input $d$ is equal to $k$ we have \[ \hat{y}(t+k|t) = \frac{B(z) E(z)}{C(z)} u(t) + \frac{\tilde{R}(z)}{C(z)} y(t) \] diff --git a/lectures/2022_05_12.tex b/lectures/2022_05_12.tex index 941e312..18ac720 100644 --- a/lectures/2022_05_12.tex +++ b/lectures/2022_05_12.tex @@ -431,6 +431,10 @@ \subsection*{Choice of the sampling time} The critical choice is $\Delta T$ (sampling time). +\begin{thm}[Nyquist–Shannon sampling theorem]\label{th:shannon} + The maximum frequency content $f_{MAX}$ of a signal to be sampled should be such that $f_{MAX} \le f_N$. +\end{thm} + \paragraph{Simple idea} The general intuitive rule is: the smaller $\Delta T$, the better. @@ -442,7 +446,7 @@ \subsection*{Choice of the sampling time} where $f_S$ is the \emph{sampling frequency} and $f_N$ is the \emph{Nyquist frequency}. \paragraph{Spectrum of a discretized signal} - For a discretized signal its spectrum is limited and it ranges over $[0, \omega_N]$; as a result, the fitting between the spectrum of the discretized signal and the one of the original signal is very close at low frequencies and becomes more inaccurate as we approach $\omega_N$. 
+ For a discretized signal its spectrum is limited and it ranges over $[0, \omega_N]$ (\nameref{th:shannon}); as a result, the fitting between the spectrum of the discretized signal and the one of the original signal is very close at low frequencies and becomes more inaccurate as we approach $\omega_N$. \end{rem} If $\Delta T$ is large, $f_S$ is small (and accordingly, $f_N$ too): @@ -592,11 +596,6 @@ \subsection*{Choice of the sampling time} \paragraph{Aliasing problem} Another problem is managing the \emph{aliasing} problem, which is a big and critical problem in the A/D step. -\begin{thm}[Nyquist–Shannon sampling theorem]\label{th:shannon} - The maximum frequency content of a signal $f_{MAX}$ to be sampled should be such that $f_{MAX} \le f_N$. -\end{thm} - - When we want measure a signal $x$ we capture also the measurement noise; indeed, what we really obtain is $\tilde{x}(t) = x(t) + e(t)$, where $e$ is the noise. Hence, the spectrum of the measured signal will also be composed by the spectrum of the original signal $x$ and the spectrum of the noise. @@ -621,7 +620,7 @@ \subsection*{Choice of the sampling time} \end{figure} Therefore, if we want to sample the measured signal, and for example, $f_{MAX} = 2 \text{KHz}$, then we need $f_N \ge 2 \text{KHz} \implies f_S \ge 4 \text{KHz}$. On the other hand, we know that the bandwidth of the original signal $x(t)$ will be much smaller due to the presence of the noise. -For example, suppose that the frequency content of $x(t)$ is contained in the range $[0, 0.5]$ KHz: we can therefore sample with an A/D that samples at $f_S = 1$ KHz. +For example, suppose that the frequency content of $x(t)$ is contained in the range $[0, 0.5]$ KHz $\implies f'_{MAX} = 0.5$ KHz: we can therefore sample with an A/D that samples at $f_S = 2 f'_{MAX} = 1$ KHz. \subparagraph{Analog solution} The classical way to deal with aliasing is to use anti-alias analog filters. 
diff --git a/lectures/2022_05_17.tex b/lectures/2022_05_17.tex index 4178741..9872334 100644 --- a/lectures/2022_05_17.tex +++ b/lectures/2022_05_17.tex @@ -101,7 +101,7 @@ \section{Linear Time Invariant Systems}\label{sec:BB-SW-LTI} \] -\textbf{Note} Once the SW-sensor has been designed (trained), we no longer need samples ofokok $x(t)$. +\textbf{Note} Once the SW-sensor has been designed (trained), we no longer need samples of $x(t)$. \textbf{Note} The above method is a classic \gls{bb} parametric approach (using \gls{tf}s) but the same can also be done using 4-SID algorithm. diff --git a/lectures/2022_05_23.tex b/lectures/2022_05_23.tex index ea09703..c8b0aab 100644 --- a/lectures/2022_05_23.tex +++ b/lectures/2022_05_23.tex @@ -243,7 +243,7 @@ \section{\gls{gb} system identification using Simulation Error Method} \paragraph{Step 2} Define model structure \[ - y(t) = \mathcal{M}(u(t); \bar{\theta}, \theta) + \hat{y}(t) = \mathcal{M}(\tilde{u}(t); \bar{\theta}, \theta) \] Mathematical model (linear or non-linear) usually written from first principle equations. $\bar{\theta}$ is the set of \textbf{known parameters} (mass, resistance, \dots), $\theta$ is the set of \textbf{unknown parameters} (possibly with bounds). 
@@ -284,7 +284,8 @@ \subsection{Comparison of SEM with \gls{pem}} \draw[->] (m) -- (sum) node[pos=0.9] {$+$} node[pos=0.5] {$\hat{y}(t; \theta)$}; \draw[->] (4,3) -- (sum) node[pos=0.9] {$-$}; \draw[->] (sum) -- (J) node[pos=0.5, right] {\footnotesize simulation error}; - \draw[->] (J) -| (m); + \draw[->] (J) -- ++(-4,0) -- ($(m) + (0.5, 1)$) + node[pos=0.9] {$\theta^*$}; \draw[<-] (0,3) -- (-1,3) node[left] {$\tilde{u}(t)$}; \draw[->] (2,3) -- (5,3) node[right] {$\tilde{y}(t)$}; @@ -311,7 +312,7 @@ \subsection{Comparison of SEM with \gls{pem}} Using \gls{pem} \[ - \hat{y}(t|t-1) = -a_1\hat{y}(t-1)-a_2\hat{y}(t-2)+b_0\hat{u}(t-1)+b_1\hat{u}(t-2) + \hat{y}(t|t-1) = -a_1\tilde{y}(t-1)-a_2\tilde{y}(t-2)+b_0\tilde{u}(t-1)+b_1\tilde{u}(t-2) \] \begin{align*} J_N(\theta) &= \frac{1}{N}\sum_{t=1}^N \left( \tilde{y}(t) - \hat{y}(t|t-1, \theta) \right)^2 \\