Commit
Merge pull request #61 from ccai1/main
Project Submission - Cathy Cai
jocelynshen authored Dec 16, 2023
2 parents a19fb14 + 3e0cb1a commit a92e079
Showing 7 changed files with 401 additions and 1 deletion.
2 changes: 1 addition & 1 deletion _posts/2023-11-08-overpameterization.md
@@ -45,7 +45,7 @@ _styles: >

## Project Proposal

In my final project, I want to analyze the role of over-parameterization in the generalization of neural networks. Empirical work has demonstrated that over-parameterized neural networks generalize better to test data, which is counterintuitive because conventional wisdom holds that over-parameterized networks can easily fit random labels to the data. Previous work has sought to explain this phenomenon in MLPs and CNNs. The work of @neyshabur2018towards analyzed the capacity bound of two-layer ReLU networks and demonstrated that it decreases with width. The work of @nichani2020increasing analyzed how the test risk of CNNs behaves as depth increases and showed that it follows a U-shaped curve. In my proposal, I want to analyze why another form of over-parameterized model does well: the Neural Tangent Kernel @cho2009kernel. The NTK approximates an MLP of infinite width and outperforms neural networks on certain tasks, e.g. @radhakrishnan2022simple. I want to analyze NTKs to assess whether the kernel structure offers insight into the generalization capabilities of these extremely over-parameterized networks. The key questions I want to answer include: why do over-parameterized neural networks work so well? Is wider always better? How does generalization capacity differ between types of models (e.g. NN/CNNs, NTK/CNTK)?
In my final project, I want to analyze the role of over-parameterization in the generalization of neural networks. Empirical work has demonstrated that over-parameterized neural networks generalize better to test data, which is counterintuitive because conventional wisdom holds that over-parameterized networks can easily fit random labels to the data. Previous work has sought to explain this phenomenon in MLPs and CNNs. The work of <d-cite key="neyshabur2018towards"></d-cite> analyzed the capacity bound of two-layer ReLU networks and demonstrated that it decreases with width. The work of <d-cite key="nichani2020increasing"></d-cite> analyzed how the test risk of CNNs behaves as depth increases and showed that it follows a U-shaped curve. In my proposal, I want to analyze why another form of over-parameterized model does well: the Neural Tangent Kernel <d-cite key="cho2009kernel"></d-cite>. The NTK approximates an MLP of infinite width and outperforms neural networks on certain tasks, e.g. <d-cite key="radhakrishnan2022simple"></d-cite>. I want to analyze NTKs to assess whether the kernel structure offers insight into the generalization capabilities of these extremely over-parameterized networks. The key questions I want to answer include: why do over-parameterized neural networks work so well? Is wider always better? How does generalization capacity differ between types of models (e.g. NN/CNNs, NTK/CNTK)?
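
To make the infinite-width object concrete, here is a minimal, illustrative sketch of how the NTK of a two-layer ReLU network could be computed and used for kernel regression. It assumes the `neural-tangents` library (with JAX) is available; the data is synthetic and purely for illustration, not part of any planned experiment.

```python
import jax.numpy as jnp
from neural_tangents import stax

# Infinite-width two-layer ReLU MLP: Dense -> ReLU -> Dense.
# The width argument only matters for finite-width initialization;
# the analytic kernel below corresponds to the infinite-width limit.
init_fn, apply_fn, kernel_fn = stax.serial(
    stax.Dense(512), stax.Relu(), stax.Dense(1)
)

# Synthetic 1-D regression data, purely for illustration.
x_train = jnp.linspace(-1.0, 1.0, 20).reshape(-1, 1)
y_train = jnp.sin(jnp.pi * x_train)
x_test = jnp.linspace(-1.0, 1.0, 100).reshape(-1, 1)

# Closed-form NTK evaluations ('nngp' would give the NNGP kernel instead).
k_train_train = kernel_fn(x_train, x_train, 'ntk')
k_test_train = kernel_fn(x_test, x_train, 'ntk')

# Kernel (ridge) regression with the NTK, which corresponds (up to
# initialization effects) to training the infinitely wide network to
# convergence with gradient descent on the squared loss.
ridge = 1e-6
preds = k_test_train @ jnp.linalg.solve(
    k_train_train + ridge * jnp.eye(x_train.shape[0]), y_train
)
```

The same `kernel_fn` interface also exposes the NNGP kernel, which would make it straightforward to compare the two infinite-width limits when studying the questions above.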

### Outline
* Literature Review
325 changes: 325 additions & 0 deletions _posts/2023-12-12-overparameterization.md

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions assets/bibliography/2023-12-12-overparameterization.bib
@@ -0,0 +1,75 @@
@article{neyshabur2018towards,
title={Towards understanding the role of over-parametrization in generalization of neural networks},
author={Neyshabur, Behnam and Li, Zhiyuan and Bhojanapalli, Srinadh and LeCun, Yann and Srebro, Nathan},
journal={arXiv preprint arXiv:1805.12076},
year={2018}
}

@article{nichani2020increasing,
title={Increasing depth leads to U-shaped test risk in over-parameterized convolutional networks},
author={Nichani, Eshaan and Radhakrishnan, Adityanarayanan and Uhler, Caroline},
journal={arXiv preprint arXiv:2010.09610},
year={2020}
}

@article{cho2009kernel,
title={Kernel methods for deep learning},
author={Cho, Youngmin and Saul, Lawrence},
journal={Advances in Neural Information Processing Systems},
volume={22},
year={2009}
}

@article{radhakrishnan2022simple,
title={Simple, fast, and flexible framework for matrix completion with infinite width neural networks},
author={Radhakrishnan, Adityanarayanan and Stefanakis, George and Belkin, Mikhail and Uhler, Caroline},
journal={Proceedings of the National Academy of Sciences},
volume={119},
number={16},
pages={e2115064119},
year={2022},
publisher={National Academy of Sciences}
}

@article{belkin2019reconciling,
title={Reconciling modern machine-learning practice and the classical bias--variance trade-off},
author={Belkin, Mikhail and Hsu, Daniel and Ma, Siyuan and Mandal, Soumik},
journal={Proceedings of the National Academy of Sciences},
volume={116},
number={32},
pages={15849--15854},
year={2019},
publisher={National Academy of Sciences}
}

@article{jacot2018neural,
title={Neural tangent kernel: Convergence and generalization in neural networks},
author={Jacot, Arthur and Gabriel, Franck and Hongler, Cl{\'e}ment},
journal={Advances in Neural Information Processing Systems},
volume={31},
year={2018}
}

@article{cai2023synthetic,
title={Synthetic Lethality Screening with Recursive Feature Machines},
author={Cai, Cathy and Radhakrishnan, Adityanarayanan and Uhler, Caroline},
journal={bioRxiv},
pages={2023--12},
year={2023},
publisher={Cold Spring Harbor Laboratory}
}

@article{lee2017deep,
title={Deep neural networks as gaussian processes},
author={Lee, Jaehoon and Bahri, Yasaman and Novak, Roman and Schoenholz, Samuel S and Pennington, Jeffrey and Sohl-Dickstein, Jascha},
journal={arXiv preprint arXiv:1711.00165},
year={2017}
}

@article{neal1996priors,
title={Priors for infinite networks},
author={Neal, Radford M},
journal={Bayesian learning for neural networks},
pages={29--53},
year={1996},
publisher={Springer}
}