adds wake vision colab as an exercise #298

Merged
merged 3 commits on Jun 26, 2024
31 changes: 15 additions & 16 deletions contents/data_engineering/data_engineering.bib
@@ -17,18 +17,6 @@ @inproceedings{10.1109/ICRA.2017.7989092
month = may,
}

@inproceedings{Data_Cascades_2021,
author = {Sambasivan, Nithya and Kapania, Shivani and Highfill, Hannah and Akrong, Diana and Paritosh, Praveen and Aroyo, Lora M},
title = {{{\textquotedblleft}Everyone} wants to do the model work, not the data work{\textquotedblright}: {Data} Cascades in High-Stakes {AI}},
year = {2021},
doi = {10.1145/3411764.3445518},
source = {Crossref},
url = {https://doi.org/10.1145/3411764.3445518},
booktitle = {Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems},
publisher = {ACM},
month = may,
}

@article{aledhari2020federated,
author = {Aledhari, Mohammed and Razzak, Rehma and Parizi, Reza M. and Saeed, Fahad},
bdsk-url-1 = {https://doi.org/10.1109/access.2020.3013541},
@@ -174,10 +162,10 @@ @article{krishnan2022selfsupervised
}

@inproceedings{mazumder2021multilingual,
title={Multilingual spoken words corpus},
author={Mazumder, Mark and Chitlangia, Sharad and Banbury, Colby and Kang, Yiping and Ciro, Juan Manuel and Achorn, Keith and Galvez, Daniel and Sabini, Mark and Mattson, Peter and Kanter, David and others},
booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
year={2021}
author = {Mazumder, Mark and Chitlangia, Sharad and Banbury, Colby and Kang, Yiping and Ciro, Juan Manuel and Achorn, Keith and Galvez, Daniel and Sabini, Mark and Mattson, Peter and Kanter, David and others},
title = {Multilingual spoken words corpus},
booktitle = {Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
year = {2021},
}

@article{northcutt2021pervasive,
@@ -236,3 +224,14 @@ @article{victor2019machine
issn = {2374-3468, 2159-5399},
month = jul,
}

@article{kuznetsova2020open,
author = {Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and others},
title = {The open images dataset v4: {Unified} image classification, object detection, and visual relationship detection at scale},
journal = {International journal of computer vision},
volume = {128},
number = {7},
pages = {1956--1981},
year = {2020},
publisher = {Springer},
}
18 changes: 18 additions & 0 deletions contents/data_engineering/data_engineering.qmd
@@ -336,6 +336,22 @@ Common annotation approaches include manual labeling, crowdsourcing, and semi-automated labeling.

After deciding on their labels' desired content and format, creators begin the annotation process. To collect large numbers of labels from human annotators, creators frequently rely on dedicated annotation platforms that connect them to teams of annotators. When using these platforms, creators may have limited insight into annotators' backgrounds and experience levels with topics of interest. However, some platforms offer access to annotators with specific expertise (e.g., doctors).

:::{#exr-bl .callout-caution collapse="true"}

### Bootstrapped Labels

Let us explore Wake Vision, a comprehensive dataset designed for TinyML person detection. This dataset is derived from a larger, general-purpose dataset, Open Images [@kuznetsova2020open], and tailored specifically for binary person detection.

The transformation process involves filtering and relabeling the existing labels and bounding boxes in Open Images using an automated pipeline. This method not only conserves time and resources but also ensures the dataset meets the specific requirements of TinyML applications.
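To make this concrete, here is a minimal sketch of one such relabeling step, assuming Open Images-style class names and normalized `(XMin, XMax, YMin, YMax)` boxes; the class set, area threshold, and function names are illustrative and are not the actual Wake Vision pipeline:

```python
# Illustrative sketch only -- not the actual Wake Vision pipeline.
PERSON_CLASSES = {"Person", "Man", "Woman", "Boy", "Girl"}

def box_area(box):
    """Area of a normalized (x_min, x_max, y_min, y_max) bounding box."""
    x_min, x_max, y_min, y_max = box
    return max(0.0, x_max - x_min) * max(0.0, y_max - y_min)

def relabel(annotations, min_area=0.005):
    """Collapse one image's (class_name, box) annotations to a binary
    person/no-person label. Boxes below min_area are ignored, since a
    person covering a tiny fraction of the frame is unlikely to be
    resolvable by a TinyML-scale model."""
    for class_name, box in annotations:
        if class_name in PERSON_CLASSES and box_area(box) >= min_area:
            return 1
    return 0

# A large person box yields label 1; an image with only a tiny dog box yields 0.
print(relabel([("Person", (0.1, 0.6, 0.2, 0.9))]))  # 1
print(relabel([("Dog", (0.0, 0.05, 0.0, 0.05))]))   # 0
```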

Additionally, we generate metadata to benchmark the fairness and robustness of models in challenging scenarios.

Let's get started!

[![](https://colab.research.google.com/assets/colab-badge.png)](https://colab.research.google.com/drive/1HC5lkBblrdRZ4vaT5M5061TKKep0MS-M?usp=sharing)

:::

### Ensuring Label Quality

There is no guarantee that the data labels are actually correct. @fig-hard-labels shows some examples of hard labeling cases: some errors arise from blurred pictures that make them hard to identify (the frog image), and others stem from a lack of domain knowledge (the black stork case). Even when labelers receive clear instructions, they may still mislabel some images [@northcutt2021pervasive]. Strategies like quality checks, annotator training, and collecting multiple labels per datapoint can help ensure label quality. For ambiguous tasks, multiple annotators can help identify controversial datapoints and quantify disagreement levels.
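As a rough sketch of the multiple-annotator strategy, the example below aggregates each datapoint's labels by majority vote and flags high-disagreement items for review; the labels, the 0.3 threshold, and the `aggregate` helper are hypothetical choices for illustration:

```python
from collections import Counter

def aggregate(labels):
    """Return the majority-vote label and a disagreement score in [0, 1]:
    0.0 when all annotators agree, approaching 1.0 as votes fragment."""
    counts = Counter(labels)
    label, votes = counts.most_common(1)[0]
    return label, 1.0 - votes / len(labels)

# Three annotators per datapoint; disputed items go to expert review.
datapoints = {
    "img_001": ["frog", "frog", "toad"],     # blurry image, partial disagreement
    "img_002": ["stork", "stork", "stork"],  # unanimous
}
for name, labels in datapoints.items():
    label, disagreement = aggregate(labels)
    flag = "  <- send for review" if disagreement > 0.3 else ""
    print(f"{name}: {label} (disagreement {disagreement:.2f}){flag}")
```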
@@ -512,6 +528,8 @@ To reinforce the concepts covered in this chapter, we have curated a set of exercises

* @exr-dp

* @exr-bl

:::

:::{.callout-warning collapse="false"}