diff --git a/.all-contributorsrc b/.all-contributorsrc index 1753e43e..9b1efc08 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -6,6 +6,15 @@ "README.md" ], "contributors": [ + { + "login": "Mjrovai", + "name": "Marcelo Rovai", + "avatar_url": "https://avatars.githubusercontent.com/Mjrovai", + "profile": "https://github.com/Mjrovai", + "contributions": [ + "doc" + ] + }, { "login": "sjohri20", "name": "sjohri20", @@ -16,19 +25,37 @@ ] }, { - "login": "uchendui", - "name": "Ikechukwu Uchendu", - "avatar_url": "https://avatars.githubusercontent.com/uchendui", - "profile": "https://github.com/uchendui", + "login": "oishib", + "name": "oishib", + "avatar_url": "https://avatars.githubusercontent.com/oishib", + "profile": "https://github.com/oishib", "contributions": [ "doc" ] }, { - "login": "Naeemkh", - "name": "naeemkh", - "avatar_url": "https://avatars.githubusercontent.com/Naeemkh", - "profile": "https://github.com/Naeemkh", + "login": "BaeHenryS", + "name": "Henry Bae", + "avatar_url": "https://avatars.githubusercontent.com/BaeHenryS", + "profile": "https://github.com/BaeHenryS", + "contributions": [ + "doc" + ] + }, + { + "login": "marcozennaro", + "name": "Marco Zennaro", + "avatar_url": "https://avatars.githubusercontent.com/marcozennaro", + "profile": "https://github.com/marcozennaro", + "contributions": [ + "doc" + ] + }, + { + "login": "DivyaAmirtharaj", + "name": "Divya", + "avatar_url": "https://avatars.githubusercontent.com/DivyaAmirtharaj", + "profile": "https://github.com/DivyaAmirtharaj", "contributions": [ "doc" ] @@ -43,19 +70,19 @@ ] }, { - "login": "mpstewart1", - "name": "Matthew Stewart", - "avatar_url": "https://avatars.githubusercontent.com/mpstewart1", - "profile": "https://github.com/mpstewart1", + "login": "jessicaquaye", + "name": "Jessica Quaye", + "avatar_url": "https://avatars.githubusercontent.com/jessicaquaye", + "profile": "https://github.com/jessicaquaye", "contributions": [ "doc" ] }, { - "login": "Mjrovai", - "name": "Marcelo Rovai", - "avatar_url": "https://avatars.githubusercontent.com/Mjrovai", - "profile": "https://github.com/Mjrovai", + "login": "colbybanbury", + "name": "Colby Banbury", + "avatar_url": "https://avatars.githubusercontent.com/colbybanbury", + "profile": "https://github.com/colbybanbury", "contributions": [ "doc" ] @@ -70,37 +97,55 @@ ] }, { - "login": "profvjreddi", - "name": "Vijay Janapa Reddi", - "avatar_url": "https://avatars.githubusercontent.com/profvjreddi", - "profile": "https://github.com/profvjreddi", + "login": "mmaz", + "name": "Mark Mazumder", + "avatar_url": "https://avatars.githubusercontent.com/mmaz", + "profile": "https://github.com/mmaz", "contributions": [ "doc" ] }, { - "login": "marcozennaro", - "name": "Marco Zennaro", - "avatar_url": "https://avatars.githubusercontent.com/marcozennaro", - "profile": "https://github.com/marcozennaro", + "login": "sophiacho1", + "name": "sophiacho1", + "avatar_url": "https://avatars.githubusercontent.com/sophiacho1", + "profile": "https://github.com/sophiacho1", "contributions": [ "doc" ] }, { - "login": "oishib", - "name": "oishib", - "avatar_url": "https://avatars.githubusercontent.com/oishib", - "profile": "https://github.com/oishib", + "login": "uchendui", + "name": "Ikechukwu Uchendu", + "avatar_url": "https://avatars.githubusercontent.com/uchendui", + "profile": "https://github.com/uchendui", "contributions": [ "doc" ] }, { - "login": "jessicaquaye", - "name": "Jessica Quaye", - "avatar_url": "https://avatars.githubusercontent.com/jessicaquaye", - "profile": 
"https://github.com/jessicaquaye", + "login": "Naeemkh", + "name": "naeemkh", + "avatar_url": "https://avatars.githubusercontent.com/Naeemkh", + "profile": "https://github.com/Naeemkh", + "contributions": [ + "doc" + ] + }, + { + "login": "profvjreddi", + "name": "Vijay Janapa Reddi", + "avatar_url": "https://avatars.githubusercontent.com/profvjreddi", + "profile": "https://github.com/profvjreddi", + "contributions": [ + "doc" + ] + }, + { + "login": "mpstewart1", + "name": "Matthew Stewart", + "avatar_url": "https://avatars.githubusercontent.com/mpstewart1", + "profile": "https://github.com/mpstewart1", "contributions": [ "doc" ] diff --git a/README.md b/README.md index bdd76b17..a7d8bfba 100644 --- a/README.md +++ b/README.md @@ -88,19 +88,26 @@ quarto render + - - + + + + - - + + + + + + + + - - - +
Marcelo Rovai
Marcelo Rovai

📖
sjohri20
sjohri20

📖
Ikechukwu Uchendu
Ikechukwu Uchendu

📖
naeemkh
naeemkh

📖
oishib
oishib

📖
Henry Bae
Henry Bae

📖
Marco Zennaro
Marco Zennaro

📖
Divya
Divya

📖
ishapira
ishapira

📖
Matthew Stewart
Matthew Stewart

📖
Marcelo Rovai
Marcelo Rovai

📖
Jessica Quaye
Jessica Quaye

📖
Colby Banbury
Colby Banbury

📖
Shvetank Prakash
Shvetank Prakash

📖
Mark Mazumder
Mark Mazumder

📖
sophiacho1
sophiacho1

📖
Ikechukwu Uchendu
Ikechukwu Uchendu

📖
naeemkh
naeemkh

📖
Vijay Janapa Reddi
Vijay Janapa Reddi

📖
Marco Zennaro
Marco Zennaro

📖
oishib
oishib

📖
Jessica Quaye
Jessica Quaye

📖
Matthew Stewart
Matthew Stewart

📖
diff --git a/_quarto.yml b/_quarto.yml index 4b23c5ed..ff2b11c3 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -21,7 +21,7 @@ book: search: true pinned: true collapse: true - + sidebar: search: true @@ -76,8 +76,11 @@ book: - part: EXERCISES chapters: - - embedded_sys_exercise.qmd - - embedded_ml_exercise.qmd + - niclav_sys.qmd + - image_classification.qmd + - object_detection_fomo.qmd + - kws_feature_eng.qmd + - kws_nicla.qmd references: references.qmd @@ -121,6 +124,7 @@ format: citations-hover: false fig-width: 8 fig-height: 6 + number-depth: 3 editor: render-on-save: true diff --git a/ai_for_good.qmd b/ai_for_good.qmd index 777364d8..cecde81f 100644 --- a/ai_for_good.qmd +++ b/ai_for_good.qmd @@ -4,7 +4,7 @@ By aligning AI progress with human values, goals, and ethics, the ultimate goal > The "AI for Good" movement plays a critical role in cultivating a future where an AI-empowered society is more just, sustainable, and prosperous for all of humanity. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives diff --git a/benchmarking.qmd b/benchmarking.qmd index 5e1a634b..6f11a146 100644 --- a/benchmarking.qmd +++ b/benchmarking.qmd @@ -1,89 +1,815 @@ # Benchmarking AI -::: {.callout-tip collapse="true"} +![_DALL·E 3 Prompt: Photo of a podium set against a tech-themed backdrop. On each tier of the podium, there are AI chips with intricate designs. The top chip has a gold medal hanging from it, the second one has a silver medal, and the third has a bronze medal. Banners with 'AI Olympics' are displayed prominently in the background._](images/benchmarking/cover_ai_benchmarking.png) + +Benchmarking is a critical part of developing and deploying machine learning systems, especially for tinyML applications. Benchmarks allow developers to measure and compare the performance of different model architectures, training procedures, and deployment strategies. This provides key insights into which approaches work best for the problem at hand and the constraints of the deployment environment. + +This chapter will provide an overview of popular ML benchmarks, best practices for benchmarking, and how to use benchmarks to improve model development and system performance. It aims to provide developers with the right tools and knowledge to effectively benchmark and optimize their systems, especially for tinyML systems. + +::: {.callout-tip} ## Learning Objectives -* coming soon. +* Understand the purpose and goals of benchmarking AI systems, including performance assessment, resource evaluation, validation, and more. + +* Learn about the different types of benchmarks - micro, macro, and end-to-end - and their role in evaluating different aspects of an AI system. + +* Become familiar with the key components of an AI benchmark, including datasets, tasks, metrics, baselines, reproducibility rules, and more. + +* Understand the distinction between training and inference, and how each phase warrants specialized ML systems benchmarking. + +* Learn about system benchmarking concepts like throughput, latency, power, and computational efficiency. + +* Appreciate the evolution of model benchmarking from accuracy to more holistic metrics like fairness, robustness and real-world applicability. + +* Recognize the growing role of data benchmarking in evaluating issues like bias, noise, balance and diversity. + +* Understand the limitations of evaluating models, data, and systems in isolation, and the emerging need for integrated benchmarking. 
::: ## Introduction -Explanation: Introducing the concept and importance of benchmarking sets the stage for the reader to understand why it is crucial in the evaluation and optimization of AI systems, especially in resource-constrained embedded environments where it is even more important! +Benchmarking provides the essential measurements needed to drive progress in machine learning and to truly understand system performance. As the physicist Lord Kelvin famously said, "To measure is to know." Benchmarks give us the ability to know the capabilities of different models, software, and hardware quantitatively. They allow ML developers to measure the inference time, memory usage, power consumption, and other metrics that characterize a system. Moreover, benchmarks create standardized processes for measurement, enabling fair comparisons across different solutions. + +When benchmarks are maintained over time, they become instrumental in capturing progress across generations of algorithms, datasets, and hardware. The models and techniques that set new records on ML benchmarks from one year to the next demonstrate tangible improvements in what's possible for on-device machine learning. By using benchmarks to measure, ML practitioners can know the real-world capabilities of their systems and have confidence that each step reflects genuine progress towards the state-of-the-art. + +Benchmarking has several important goals and objectives that guide its implementation for machine learning systems. + +- **Performance assessment.** This involves evaluating key metrics like the speed, accuracy, and efficiency of a given model. For instance, in a TinyML context, it is crucial to benchmark how quickly a voice assistant can recognize commands, as this evaluates real-time performance. + +- **Resource evaluation.** This means assessing the model's impact on critical system resources including battery life, memory usage, and computational overhead. A relevant example is comparing the battery drain of two different image recognition algorithms running on a wearable device. + +- **Validation and verification.** Benchmarking helps ensure the system functions correctly and meets specified requirements. One way is by checking the accuracy of an algorithm, like a heart rate monitor on a smartwatch, against readings from medical-grade equipment as a form of clinical validation. + +- **Competitive analysis.** This enables comparing solutions against competing offerings in the market. For example, benchmarking a custom object detection model versus common tinyML benchmarks like MobileNet and Tiny-YOLO. + +- **Credibility.** Accurate benchmarks uphold the credibility of AI solutions and the organizations that develop them. They demonstrate a commitment to transparency, honesty, and quality, which is essential in building trust with users and stakeholders. + +- **Regulation and Standardization**. As the AI industry continues to grow, there is an increasing need for regulation and standardization to ensure that AI solutions are safe, ethical, and effective. Accurate and reliable benchmarks are an essential component of this regulatory framework, as they provide the data and evidence needed to assess compliance with industry standards and legal requirements. + +This chapter will cover the 3 types of benchmarks in AI, the standard metrics, tools, and techniques designers use to optimize their systems, and the challenges and trends in benchmarking. 
+
+## Historical Context
+
+### Standard Benchmarks
+
+The evolution of benchmarks in computing vividly illustrates the industry's relentless pursuit of excellence and innovation. In the early days of computing during the 1960s and 1970s, benchmarks were rudimentary and designed for mainframe computers. For example, the [Whetstone benchmark](https://en.wikipedia.org/wiki/Whetstone_(benchmark)), named after the Whetstone ALGOL compiler, was one of the first standardized tests to measure the floating-point arithmetic performance of a CPU. These pioneering benchmarks prompted manufacturers to refine their architectures and algorithms to achieve better benchmark scores.
+
+The 1980s marked a significant shift with the rise of personal computers. As companies like IBM, Apple, and Commodore competed for market share, benchmarks became critical tools for enabling fair competition. The [SPEC CPU benchmarks](https://www.spec.org/cpu/), introduced by the [System Performance Evaluation Cooperative (SPEC)](https://www.spec.org/), established standardized tests allowing objective comparisons between different machines. This standardization created a competitive environment, pushing silicon manufacturers and system creators to enhance their hardware and software offerings continually.
+
+With the 1990s came the era of graphics-intensive applications and video games. The need for benchmarks to evaluate graphics card performance led to the creation of [3DMark](https://www.3dmark.com/) by Futuremark. As gamers and professionals sought high-performance graphics cards, companies like NVIDIA and AMD were driven to rapid innovation, leading to major advancements in GPU technology like programmable shaders.
+
+The 2000s saw a surge in mobile phones and portable devices like tablets. With portability came the challenge of balancing performance and power consumption. Benchmarks like [MobileMark](https://bapco.com/products/mobilemark-2014/) by BAPCo evaluated not just speed but also battery life. This drove companies to develop more energy-efficient Systems-on-Chip (SoCs), leading to the emergence of architectures like ARM that prioritized power efficiency.
+
+The most recent decade's focus has shifted towards cloud computing, big data, and artificial intelligence. Cloud service providers like Amazon Web Services and Google Cloud compete on performance, scalability, and cost-effectiveness. Tailored cloud benchmarks like [CloudSuite](http://cloudsuite.ch/) have become essential, driving providers to optimize their infrastructure for better services.
+
+### Custom Benchmarks
+
+In addition to industry-standard benchmarks, there are custom benchmarks that are specifically designed to meet the unique requirements of a particular application or task. They are tailored to the specific needs of the user or developer, ensuring that the performance metrics are directly relevant to the intended use of the AI model or system. Custom benchmarks can be created by individual organizations, researchers, or developers, and are often used in conjunction with industry-standard benchmarks to provide a comprehensive evaluation of AI performance.
+
+For example, a hospital could develop a benchmark to assess an AI model for predicting patient readmission. This benchmark would incorporate metrics relevant to the hospital's patient population, like demographics, medical history, and social factors.
Similarly, a financial institution's fraud detection benchmark could focus on identifying fraudulent transactions accurately while minimizing false positives. In automotive, an autonomous vehicle benchmark may prioritize performance in diverse conditions, responses to obstacles, and safety. Retailers could benchmark recommendation systems using click-through rate, conversion rate, and customer satisfaction. Manufacturing companies might benchmark quality control systems on defect identification, efficiency, and waste reduction. In each industry, custom benchmarks provide organizations with evaluation criteria tailored to their unique needs and context. This allows for a more meaningful assessment of how well AI systems meet requirements.
+
+The advantage of custom benchmarks lies in their flexibility and relevance. They can be designed to test specific aspects of performance that are critical to the success of the AI solution in its intended application. This allows for a more targeted and accurate assessment of the AI model or system's capabilities. Custom benchmarks also provide valuable insights into the performance of AI solutions in real-world scenarios, which can be crucial for identifying potential issues and areas for improvement.
+
+In AI, benchmarks play a crucial role in driving progress and innovation. While benchmarks have long been used in computing, their application to machine learning is relatively recent. AI-focused benchmarks aim to provide standardized metrics to evaluate and compare the performance of different algorithms, model architectures, and hardware platforms.
+
+### Community Consensus
+
+A key prerequisite for any benchmark to be impactful is that it must reflect the shared priorities and values of the broader research community. Benchmarks designed in isolation risk failing to gain acceptance if they overlook key metrics considered important by leading groups. Through collaborative development with open participation from academic labs, companies, and other stakeholders, benchmarks can incorporate collective input on critical capabilities worth measuring. This helps ensure the benchmarks evaluate aspects the community agrees are essential to advance the field. The process of reaching alignment on tasks and metrics itself supports converging on what matters most.
+
+Furthermore, benchmarks published with broad co-authorship from respected institutions carry authority and validity that convinces the community to adopt them as trusted standards. Benchmarks perceived as biased by particular corporate or institutional interests breed skepticism. Ongoing community engagement through workshops and challenges is also key after initial release, and that is what, for instance, led to the success of ImageNet. As research rapidly progresses, collective participation enables continual refinement and expansion of benchmarks over time.
+
+Finally, community-developed benchmarks released with open access accelerate adoption and consistent implementation. Shared open-source code, documentation, models, and infrastructure lower barriers for groups to benchmark solutions on an equal footing using standardized implementations. This consistency is critical for fair comparisons. Without coordination, labs and companies may implement benchmarks differently, reducing the reproducibility of results.
+
+Community consensus brings benchmarks lasting relevance, while fragmentation causes confusion.
Through collaborative development and transparent operation, benchmarks can become authoritative standards for tracking progress. Several of the benchmarks that we discuss in this chapter were developed and built by the community, for the community, and that is what ultimately led to their success.
+
+## AI Benchmarks: System, Model, and Data
+
+As AI systems grow in complexity and ubiquity, the need for comprehensive benchmarking becomes paramount. Within this context, benchmarks are often classified into three primary categories: system, model, and data. Let's delve into why each of these categories is essential and the significance of evaluating AI from these three distinct dimensions:
+
+### System Benchmarks
+
+AI computations, especially those in deep learning, are resource-intensive. The hardware on which these computations run plays a pivotal role in determining the speed, efficiency, and scalability of AI solutions. Consequently, system benchmarks help evaluate the performance of CPUs, GPUs, TPUs, and other accelerators in the context of AI tasks. By understanding hardware performance, developers can make informed choices about which hardware platforms are best suited for specific AI applications. Furthermore, hardware manufacturers use these benchmarks to identify areas for improvement, driving innovation in AI-specific chip designs.
+
+### Model Benchmarks
+
+The architecture, size, and complexity of AI models vary widely. Different models have different computational demands and offer varying levels of accuracy and efficiency. Model benchmarks help us assess the performance of various AI architectures on standardized tasks. They provide insights into the speed, accuracy, and resource demands of different models. By benchmarking models, researchers can identify best-performing architectures for specific tasks, guiding the AI community towards more efficient and effective solutions. Additionally, these benchmarks aid in tracking the progress of AI research, showcasing advancements in model design and optimization.
+
+### Data Benchmarks
+
+AI, particularly machine learning, is inherently data-driven. The quality, size, and diversity of data influence the training efficacy and generalization capability of AI models. Data benchmarks focus on the datasets used in AI training and evaluation. They provide standardized datasets that the community can use to train and test models, ensuring a level playing field for comparisons. Moreover, these benchmarks highlight challenges in data quality, diversity, and representation, pushing the community to address biases and gaps in AI training data. By understanding data benchmarks, researchers can also gauge how models might perform in real-world scenarios, ensuring robustness and reliability.
+
+In the remainder of the sections, we will go through each of these benchmark types. The focus will be an in-depth exploration of system benchmarks, as these are critical to understanding and advancing machine learning system performance. We will cover model and data benchmarks briefly for a comprehensive perspective, but the emphasis and majority of the content will be devoted to system benchmarks.
+
+## System Benchmarking
+
+### Granularity
+
+Machine learning system benchmarking provides a structured and systematic approach to assess how well a system is performing across various dimensions.
Given the complexity of ML systems, we can dissect their performance through different levels of granularity and obtain a comprehensive view of the system's efficiency, identify potential bottlenecks, and pinpoint areas for improvement. To this end, various types of benchmarks have evolved over the years and continue to be used.
+
+![](images/benchmarking/end2end.png)
+
+#### Micro Benchmarks
+
+Micro-benchmarks in AI are specialized, focusing on the evaluation of distinct components or specific operations within a broader machine learning process. These benchmarks zero in on individual tasks, offering insights into the computational demands of a particular neural network layer, the efficiency of a unique optimization technique, or the throughput of a specific activation function. For instance, practitioners might use micro-benchmarks to measure the computational time required by a convolutional layer in a deep learning model or to evaluate the speed of data preprocessing that feeds data into the model. Such granular assessments are instrumental in fine-tuning and optimizing discrete aspects of AI models, ensuring that each component operates at its peak potential.
+
+These micro-benchmarks zoom into very specific operations or components of the AI pipeline, such as the following:
+
+- Tensor Operations: Libraries like [cuDNN](https://developer.nvidia.com/cudnn) (by NVIDIA) often have benchmarks to measure the performance of individual tensor operations, such as convolutions or matrix multiplications, which are foundational to deep learning computations.
+
+- Activation Functions: Benchmarks that measure the speed and efficiency of various activation functions like ReLU, Sigmoid, or Tanh in isolation.
+
+- Layer Benchmarks: Evaluations of the computational efficiency of distinct neural network layers, such as an LSTM layer or a Transformer block, when operating on standardized input sizes.
+
+Example: [DeepBench](https://github.com/baidu-research/DeepBench), introduced by Baidu, is a good example of such a suite. DeepBench assesses the performance of basic operations in deep learning models, providing insights into how different hardware platforms handle neural network training and inference.
+
+#### Macro Benchmarks
+
+Macro-benchmarks provide a holistic view, assessing the end-to-end performance of entire machine learning models or comprehensive AI systems. Rather than focusing on individual operations, macro-benchmarks evaluate the collective efficacy of models under real-world scenarios or tasks. For example, a macro-benchmark might assess the complete performance of a deep learning model undertaking image classification on a dataset like [ImageNet](https://www.image-net.org/). This includes gauging accuracy, computational speed, and resource consumption. Similarly, one might measure the cumulative time and resources needed to train a natural language processing model on extensive text corpora or evaluate the performance of an entire recommendation system, from data ingestion to final user-specific outputs.
+
+Examples of macro-benchmarks that evaluate AI models include:
+
+- [MLPerf Inference](https://github.com/mlcommons/inference)[@reddi2020mlperf]: An industry-standard set of benchmarks for measuring the performance of machine learning software and hardware.
MLPerf has a suite of dedicated benchmarks for specific scales, such as [MLPerf Mobile](https://github.com/mlcommons/mobile_app_open) for mobile-class devices and [MLPerf Tiny](https://github.com/mlcommons/tiny), which focuses on microcontrollers and other resource-constrained devices.
+
+- [EEMBC's MLMark](https://github.com/eembc/mlmark): A benchmarking suite for evaluating the performance and power efficiency of embedded devices running machine learning workloads. This benchmark provides insights into how different hardware platforms handle tasks like image recognition or audio processing.
+
+- [AI-Benchmark](https://ai-benchmark.com/)[@ignatov2018ai]: A benchmarking tool designed for Android devices that evaluates the performance of AI tasks on mobile devices, encompassing various real-world scenarios like image recognition, face parsing, and optical character recognition.
+
+#### End-to-end Benchmarks
+
+End-to-end benchmarks provide an all-inclusive evaluation that extends beyond the boundaries of the AI model itself. Instead of focusing solely on the computational efficiency or accuracy of a machine learning model, these benchmarks encompass the entire pipeline of an AI system. This includes initial data pre-processing, the core model's performance, post-processing of the model's outputs, and even other integral components like storage and network interactions.
+
+Data pre-processing is the first stage in many AI systems, transforming raw data into a format suitable for model training or inference. The efficiency, scalability, and accuracy of these pre-processing steps are vital for the overall system's performance. End-to-end benchmarks assess this phase, ensuring that data cleaning, normalization, augmentation, or any other transformation process doesn't become a bottleneck.
+
+The post-processing phase also takes center stage. This involves interpreting the model's raw outputs, possibly converting scores into meaningful categories, filtering results, or even integrating with other systems. In real-world applications, this phase is crucial for delivering actionable insights, and end-to-end benchmarks ensure it's both efficient and effective.
+
+Beyond the core AI operations, other system components play a pivotal role in the overall performance and user experience. Storage solutions, be they cloud-based, on-premises, or hybrid, can significantly impact data retrieval and storage times, especially with vast AI datasets. Similarly, network interactions, vital for cloud-based AI solutions or distributed systems, can become performance bottlenecks if not optimized. End-to-end benchmarks holistically evaluate these components, ensuring that the entire system, from data retrieval to final output delivery, operates seamlessly.
+
+To date, there are no public end-to-end benchmarks that account for the roles of data storage, network, and compute performance together. Arguably, MLPerf Training and Inference come closest to the idea of an end-to-end benchmark, but they are exclusively focused on ML model performance and do not represent real-world deployment scenarios of how models are used in the field. Nonetheless, they provide a very useful signal that helps assess AI system performance.
+
+Given the inherent specificity of end-to-end benchmarking, it is typically performed internally at a company by instrumenting real production deployments of AI.
This allows engineers to have a realistic understanding and breakdown of the performance, but given the sensitivity and specificity of the information, it is rarely reported outside of the company. + +#### Understanding the Trade-offs + +Different issues arise at different stages of an AI system. Micro-benchmarks help in fine-tuning individual components, macro-benchmarks aid in refining model architectures or algorithms, and end-to-end benchmarks guide the optimization of the entire workflow. By understanding where a problem lies, developers can apply targeted optimizations. + +Moreover, while individual components of an AI system might perform optimally in isolation, bottlenecks can emerge when they interact. End-to-end benchmarks, in particular, are crucial to ensure that the entire system, when operating collectively, meets desired performance and efficiency standards. + +Finally, by discerning where performance bottlenecks or inefficiencies lie, organizations can make informed decisions on where to allocate resources. For instance, if micro-benchmarks reveal inefficiencies in specific tensor operations, investments can be directed towards specialized hardware accelerators. Conversely, if end-to-end benchmarks indicate data retrieval issues, investments might be channeled towards better storage solutions. + +### Benchmark Components + +At its core, an AI benchmark is more than just a test or a score; it's a comprehensive evaluation framework. To understand this in-depth, let's break down the typical components that go into an AI benchmark. + +#### Standardized Datasets + +Datasets serve as the foundation for most AI benchmarks. They provide a consistent set of data on which models are trained and evaluated, ensuring a level playing field for comparisons. + +Example: ImageNet, a large-scale dataset containing millions of labeled images spanning thousands of categories, is a popular benchmarking standard for image classification tasks. + +#### Pre-defined Tasks + +A benchmark should have a clear objective or task that models aim to achieve. This task defines the problem the AI system is trying to solve. + +Example: For natural language processing benchmarks, tasks might include sentiment analysis, named entity recognition, or machine translation. + +#### Evaluation Metrics + +Once a task is defined, benchmarks require metrics to quantify performance. These metrics offer objective measures to compare different models or systems. + +In classification tasks, metrics like accuracy, precision, recall, and [F1 score](https://en.wikipedia.org/wiki/F-score) are commonly used. For regression tasks, mean squared error or mean absolute error might be employed. + +#### Baseline Models + +Benchmarks often include baseline models or reference implementations. These serve as starting points or minimum performance standards against which new models or techniques can be compared. + +Example: In many benchmark suites, simple models like linear regression or basic neural networks serve as baselines to provide context for more complex model evaluations. + +#### Hardware and Software Specifications + +Given the variability introduced by different hardware and software configurations, benchmarks often specify or document the hardware and software environments in which tests are conducted. + +Example: An AI benchmark might note that evaluations were conducted on an NVIDIA Tesla V100 GPU using TensorFlow v2.4. 
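To make this concrete, the short sketch below shows one way a benchmark harness might record the hardware and software environment alongside its results, so that a reported score can later be tied back to the exact configuration that produced it. It is only an illustrative example rather than part of any particular benchmark suite; the optional `torch` dependency, the placeholder accuracy value, and the `results.json` filename are assumptions made for the sake of the sketch.

```python
import json
import platform
import sys
from datetime import datetime, timezone

def capture_environment() -> dict:
    """Record the hardware/software context of a benchmark run (illustrative only)."""
    env = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "os": platform.platform(),
        "cpu": platform.processor(),
        "python": sys.version.split()[0],
    }
    try:
        import torch  # assumed to be installed; skipped otherwise
        env["framework"] = f"torch {torch.__version__}"
        if torch.cuda.is_available():
            env["gpu"] = torch.cuda.get_device_name(0)
    except ImportError:
        env["framework"] = "unknown"
    return env

if __name__ == "__main__":
    # Attach the environment record to the benchmark results before saving,
    # so every reported metric carries its hardware/software context.
    results = {"benchmark": "image_classification", "top1_accuracy": 0.76}  # placeholder values
    results["environment"] = capture_environment()
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)
```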
+
+#### Environmental Conditions
+
+As external factors can influence benchmark results, it's essential to either control or document conditions like temperature, power source, or system background processes.
+
+Example: Mobile AI benchmarks might specify that tests were conducted at room temperature with devices plugged into a power source to eliminate battery-level variances.
+
+#### Reproducibility Rules
+
+To ensure benchmarks are credible and can be replicated by others in the community, they often include detailed protocols, covering everything from the random seeds used to the exact hyperparameters.
+
+Example: A benchmark for a reinforcement learning task might detail the exact training episodes, exploration-exploitation ratios, and reward structures used.
+
+#### Result Interpretation Guidelines
+
+Beyond raw scores or metrics, benchmarks often provide guidelines or context to interpret results, helping practitioners understand the broader implications.
+
+Example: A benchmark might highlight that while Model A scored higher than Model B in accuracy, Model B offers better real-time performance, making it more suitable for time-sensitive applications.
+
+### Training vs. Inference
+
+The development life cycle of a machine learning model involves two critical phases: training and inference. Training is the process of learning patterns from data to create the model. Inference refers to the model making predictions on new, unlabeled data. Both phases play indispensable yet distinct roles. Consequently, each phase warrants rigorous benchmarking to evaluate performance metrics like speed, accuracy, and computational efficiency.
+
+Benchmarking the training phase provides insights into how different model architectures, hyperparameter values, and optimization algorithms impact the time and resources needed to train the model. For instance, benchmarking shows how neural network depth affects training time on a given dataset. Benchmarking also reveals how hardware accelerators like GPUs and TPUs can speed up training.
+
+On the other hand, benchmarking inference evaluates model performance in real-world conditions after deployment. Key metrics include latency, throughput, memory footprint, and power consumption. Inference benchmarking determines whether a model meets the requirements of its target application regarding response time and device constraints. These constraints are typically the focus of tinyML, but we will discuss inference benchmarking broadly to build a general understanding.
+
+### Training Benchmarks
+
+Training represents the phase where raw data is processed and ingested by the system to adjust and refine its parameters. Therefore, it is not just an algorithmic activity but also involves system-level considerations, including data pipelines, storage, computing resources, and orchestration mechanisms. The goal is to ensure that the ML system can efficiently learn from data, optimizing both the model's performance and the system's resource utilization.
+
+#### Purpose
+
+From an ML systems perspective, training benchmarks evaluate how well the system scales with increasing data volumes and computational demands. It's about understanding the interplay between hardware, software, and the data pipeline in the training process.
+
+Consider a distributed ML system designed to train on vast datasets, like those used in large-scale e-commerce product recommendations.
A training benchmark would assess how efficiently the system scales across multiple nodes, how it manages data sharding, and how it handles failures or node drop-offs during the training process. + +Training benchmarks evaluate CPU, GPU, memory, and network utilization during the training phase, guiding system optimizations. When training a model in a cloud-based ML system, it's crucial to understand how resources are being utilized. Are GPUs being fully leveraged? Is there unnecessary memory overhead? Benchmarks can highlight bottlenecks or inefficiencies in resource utilization, leading to cost savings and performance improvements. + +Training an ML model is contingent on the timely and efficient delivery of data. Benchmarks in this context would also assess the efficiency of data pipelines, data preprocessing speed, and storage retrieval times. For real-time analytics systems, like those used in fraud detection, the speed at which training data is ingested, preprocessed, and fed into the model can be critical. Benchmarks would evaluate the latency of data pipelines, the efficiency of storage systems (like SSDs vs. HDDs), and the speed of data augmentation or transformation tasks. + +#### Metrics + +Training metrics, when viewed from a systems perspective, offer insights that transcend the conventional algorithmic performance indicators. These metrics not only measure the model's learning efficacy but also gauge the efficiency, scalability, and robustness of the entire ML system during the training phase. Let's delve deeper into these metrics and their significance. + +The following metrics are often considered important: + +1. Training Time: The time taken to train a model from scratch until it reaches a satisfactory performance level. It is a direct measure of the computational resources required to train a model. For example, [Google's BERT](https://arxiv.org/abs/1810.04805)[@devlin2018bert] model is a natural language processing model that requires several days to train on a massive corpus of text data using multiple GPUs. The long training time is a significant challenge in terms of resource consumption and cost. + +2. Scalability: How well the training process can handle increases in data size or model complexity. Scalability can be assessed by measuring training time, memory usage, and other resource consumption as data size or model complexity increases. [OpenAI's GPT-3](https://arxiv.org/abs/2005.14165)[@brown2020language] model has 175 billion parameters, making it one of the largest language models in existence. Training GPT-3 required extensive engineering efforts to scale up the training process to handle the massive model size. This involved the use of specialized hardware, distributed training, and other techniques to ensure that the model could be trained efficiently. + +3. Resource Utilization: The extent to which the training process utilizes available computational resources such as CPU, GPU, memory, and disk I/O. High resource utilization can indicate an efficient training process, while low utilization can suggest bottlenecks or inefficiencies. For instance, training a convolutional neural network (CNN) for image classification requires significant GPU resources. Utilizing multi-GPU setups and optimizing the training code for GPU acceleration can greatly improve resource utilization and training efficiency. + +4. Memory Consumption: The amount of memory used by the training process. Memory consumption can be a limiting factor for training large models or datasets. 
As an example, Google researchers faced significant memory consumption challenges when training BERT. The model has hundreds of millions of parameters, which require large amounts of memory to store. The researchers had to develop techniques to reduce memory consumption, such as gradient checkpointing and model parallelism. + +5. Energy Consumption: The amount of energy consumed during the training process. As machine learning models become larger and more complex, energy consumption has become an important consideration. Training large machine learning models can consume significant amounts of energy, leading to a large carbon footprint. For instance, the training of OpenAI's GPT-3 was estimated to have a carbon footprint equivalent to traveling by car for 700,000 kilometers. + +6. Throughput: The number of training samples processed per unit time. Higher throughput generally indicates a more efficient training process. When training a recommendation system for an e-commerce platform, the throughput is an important metric to consider. A high throughput ensures that the model can process large volumes of user interaction data in a timely manner, which is crucial for maintaining the relevance and accuracy of the recommendations. But it's also important to understand how to balance throughput with latency bounds. Therefore, often there is a latency-bounded throughput constraint that's imposed on service-level agreements for datacenter application deployments. + +7. Cost: The cost of training a model, which can include both computational and human resources. Cost is an important factor when considering the practicality and feasibility of training large or complex models. The cost of training large language models like GPT-3 is estimated to be in the range of millions of dollars. This cost includes computational resources, electricity, and human resources required for model development and training. + +8. Fault Tolerance and Robustness: The ability of the training process to handle failures or errors without crashing or producing incorrect results. This is important for ensuring the reliability of the training process. In a real-world scenario, where a machine learning model is being trained on a distributed system, network failures or hardware malfunctions can occur. In recent years, for instance, it has become abundantly clear that faults that arise from silent data corruption have emerged as a major issue. A fault-tolerant and robust training process can recover from such failures without compromising the integrity of the model. + +9. Ease of Use and Flexibility: The ease with which the training process can be set up and used, as well as its flexibility in handling different types of data and models. In companies like Google, efficiency can sometimes be measured in terms of the number of Software Engineer (SWE) years saved since that translates directly to impact. Ease of use and flexibility can reduce the time and effort required to train a model. TensorFlow and PyTorch are popular machine learning frameworks that provide user-friendly interfaces and flexible APIs for building and training machine learning models. These frameworks support a wide range of model architectures and are equipped with tools that simplify the training process. + +10. Reproducibility: The ability to reproduce the results of the training process. Reproducibility is important for verifying the correctness and validity of a model. 
However, variations arising from stochastic network characteristics often make it hard to reproduce the precise behavior of applications being trained, which can present a challenge for benchmarking.
+
+By benchmarking for these types of metrics, we can obtain a comprehensive view of the performance and efficiency of the training process from a systems perspective, which can help identify areas for improvement and ensure that resources are used effectively.
+
+#### Tasks
+
+Selecting a handful of representative tasks for benchmarking machine learning systems is challenging because machine learning is applied to a diverse range of domains, each with its own unique characteristics and requirements. Here are some of the challenges faced in selecting representative tasks:
+
+1. Diversity of Applications: Machine learning is used in numerous fields such as healthcare, finance, natural language processing, computer vision, and many more. Each field has specific tasks that may not be representative of other fields. For example, image classification tasks in computer vision may not be relevant to financial fraud detection.
+
+2. Variability in Data Types and Quality: Different tasks require different types of data, such as text, images, videos, or numerical data. The quality and availability of data can vary greatly between tasks, making it difficult to select tasks that are representative of the general challenges faced in machine learning.
+
+3. Task Complexity and Difficulty: The complexity of tasks varies greatly, with some tasks being relatively straightforward, while others are highly complex and require sophisticated models and techniques. Selecting representative tasks that cover the range of complexities encountered in machine learning is a challenge.
+
+4. Ethical and Privacy Concerns: Some tasks may involve sensitive or private data, such as medical records or personal information. These tasks may have ethical and privacy concerns that need to be addressed, which can make them less suitable as representative tasks for benchmarking.
+
+5. Scalability and Resource Requirements: Different tasks may have different scalability and resource requirements. Some tasks may require extensive computational resources, while others can be performed with minimal resources. Selecting tasks that are representative of the general resource requirements in machine learning is difficult.
+
+6. Evaluation Metrics: The metrics used to evaluate the performance of machine learning models vary between tasks. Some tasks may have well-established evaluation metrics, while others may lack clear or standardized metrics. This can make it challenging to compare performance across different tasks.
+
+7. Generalizability of Results: The results obtained from benchmarking on a specific task may not be generalizable to other tasks. This means that the performance of a machine learning system on a selected task may not be indicative of its performance on other tasks.
+
+It is important to carefully consider these factors when designing benchmarks to ensure that they are meaningful and relevant to the diverse range of tasks encountered in machine learning.
+
+#### Benchmarks
+
+Here are some original works that laid the fundamental groundwork for developing systematic benchmarks for training machine learning systems.
+ +*[MLPerf Training Benchmark](https://github.com/mlcommons/training)* + +MLPerf is a suite of benchmarks designed to measure the performance of machine learning hardware, software, and services. The MLPerf Training benchmark[@mattson2020mlperf] focuses on the time it takes to train models to a target quality metric. It includes a diverse set of workloads, such as image classification, object detection, translation, and reinforcement learning. + +Metrics: + +- Training time to target quality + +- Throughput (examples per second) + +- Resource utilization (CPU, GPU, memory, disk I/O) + +*[DAWNBench](https://dawn.cs.stanford.edu/benchmark/)* + +DAWNBench[@coleman2017dawnbench] is a benchmark suite that focuses on end-to-end deep learning training time and inference performance. It includes common tasks such as image classification and question answering. + +Metrics: + +- Time to train to target accuracy + +- Inference latency + +- Cost (in terms of cloud compute and storage resources) + +*[Fathom](https://github.com/rdadolf/fathom)* + +Fathom[@adolf2016fathom] is a benchmark from Harvard University that includes a diverse set of workloads to evaluate the performance of deep learning models. It includes common tasks such as image classification, speech recognition, and language modeling. + +Metrics: + +- Operations per second (to measure computational efficiency) + +- Time to completion for each workload + +- Memory bandwidth + +*Example Use Case* + +Consider a scenario where we want to benchmark the training of an image classification model on a specific hardware platform. + +1. Task: The task is to train a convolutional neural network (CNN) for image classification on the CIFAR-10 dataset. + +2. Benchmark: We can use the MLPerf Training benchmark for this task. It includes an image classification workload that is relevant to our task. + +3. Metrics: We will measure the following metrics: + +- Training time to reach a target accuracy of 90%. + +- Throughput in terms of images processed per second. + +- GPU and CPU utilization during training. + +By measuring these metrics, we can assess the performance and efficiency of the training process on the selected hardware platform. This information can then be used to identify potential bottlenecks or areas for improvement. + +### Inference Benchmarks + +Inference in machine learning refers to the process of using a trained model to make predictions on new, unseen data. It is the phase where the model applies its learned knowledge to solve the problem it was designed for, such as classifying images, recognizing speech, or translating text. + +#### Purpose + +When we build machine learning models, our ultimate goal is to deploy them in real-world applications where they can provide accurate and reliable predictions on new, unseen data. This process of using a trained model to make predictions is known as inference. The real-world performance of a machine learning model can differ significantly from its performance on training or validation datasets, which makes benchmarking inference a crucial step in the development and deployment of machine learning models. + +Benchmarking inference allows us to evaluate how well a machine learning model performs in real-world scenarios. This evaluation ensures that the model is practical and reliable when deployed in applications, providing a more comprehensive understanding of the model's behavior with real data. 
Additionally, benchmarking can help identify potential bottlenecks or limitations in the model's performance. For example, if a model takes too long to make a prediction, it may be impractical for real-time applications such as autonomous driving or voice assistants. + +Resource efficiency is another critical aspect of inference, as it can be computationally intensive and require significant memory and processing power. Benchmarking helps ensure that the model is efficient in terms of resource usage, which is particularly important for edge devices with limited computational capabilities, such as smartphones or IoT devices. Moreover, benchmarking allows us to compare the performance of our model with competing models or previous versions of the same model. This comparison is essential for making informed decisions about which model to deploy in a specific application. + +Finally, ensuring that the model's predictions are not only accurate but also consistent across different data points is vital. Benchmarking helps verify the model's accuracy and consistency, ensuring that it meets the application's requirements. It also assesses the robustness of the model, ensuring that it can handle real-world data variability and still make accurate predictions. + +#### Metrics + +1. Accuracy: Accuracy is one of the most vital metrics when benchmarking machine learning models, quantifying the proportion of correct predictions made by the model compared to the true values or labels. For example, in the case of a spam detection model that can correctly classify 95 out of 100 email messages as spam or not spam, the accuracy of this model would be calculated as 95%. + +2. Latency: Latency is a performance metric that calculates the time lag or delay occurring between the receipt of an input and the production of the corresponding output by the machine learning system. An example that clearly depicts latency is a real-time translation application; if there exists a half-second delay from the moment a user inputs a sentence to the time the translated text is displayed by the app, then the system's latency is 0.5 seconds. + +3. Latency-Bounded Throughput: Latency-bounded throughput is a valuable metric that combines the aspects of latency and throughput, measuring the maximum throughput of a system while still meeting a specified latency constraint. For example, in a video streaming application that utilizes a machine learning model to automatically generate and display subtitles, latency-bounded throughput would measure how many video frames the system can process per second (throughput) while ensuring that the subtitles are displayed with no more than a 1-second delay (latency). This metric is particularly important in real-time applications where meeting latency requirements is crucial to the user experience. + +4. Throughput: Throughput assesses the system's capacity by measuring the total number of inferences or predictions a machine learning model can handle within a specific unit of time. Consider a speech recognition system that employs a Recurrent Neural Network (RNN) as its underlying model; if this system is capable of processing and understanding 50 different audio clips in a minute, then its throughput rate stands at 50 clips per minute. + +5. Inference Time: Inference time is a crucial metric that measures the duration a machine learning system, such as a Convolutional Neural Network (CNN) used in image recognition tasks, takes to process an input and generate a prediction or output. 
For instance, if a CNN takes approximately 2 milliseconds to accurately identify and label a cat within a given photo, then its inference time is said to be 2 milliseconds.
+
+6. Energy Efficiency: Energy efficiency is a metric that determines the amount of energy consumed by the machine learning model to perform a single inference. A prime example of this would be a natural language processing model built on a Transformer network architecture; if it utilizes 0.1 Joules of energy to translate a sentence from English to French, its energy efficiency is measured at 0.1 Joules per inference.
+
+7. Memory Usage: Memory usage quantifies the volume of RAM needed by a machine learning model to carry out inference tasks. A relevant example to illustrate this would be a face recognition system that is based on a CNN; if such a system requires 150 MB of RAM to process and recognize faces within an image, then its memory usage is 150 MB.
+
+#### Tasks
+
+By and large, the challenges in picking representative tasks for benchmarking inference machine learning systems mirror those described for training. Nevertheless, for completeness, let's discuss them in the context of inference systems.
+
+1. Diversity of Applications: Inference machine learning is employed across numerous domains such as healthcare, finance, entertainment, security, and more. Each domain has its unique tasks, and what's representative in one domain might not be in another. For example, an inference task for predicting stock prices in the financial domain might not be representative of image recognition tasks in the medical domain.
+
+2. Variability in Data Types: Different inference tasks require different types of data -- text, images, videos, numerical data, etc. Ensuring that benchmarks address the wide variety of data types used in real-world applications is challenging. For example, voice recognition systems process audio data, which is vastly different from the visual data processed by facial recognition systems.
+
+3. Task Complexity: The complexity of inference tasks can differ immensely, from basic classification tasks to intricate tasks requiring state-of-the-art models. For example, differentiating between two categories (binary classification) is typically simpler than detecting hundreds of object types in a crowded scene.
+
+4. Real-time Requirements: Some applications demand immediate or real-time responses, while others may allow for some delay. In autonomous driving, real-time object detection and decision-making are paramount, whereas a recommendation engine for a shopping website might tolerate slight delays.
+
+5. Scalability Concerns: Given the varied scale of applications, from edge devices to cloud-based servers, tasks must represent the diverse computational environments where inference occurs. For example, an inference task running on a smartphone's limited resources is quite different from one running on a powerful cloud server.
+
+6. Evaluation Metrics Diversity: Depending on the task, the metrics to evaluate performance can differ significantly. Finding a common ground or universally accepted metric for diverse tasks is a challenge. For example, precision and recall might be vital for a medical diagnosis task, whereas throughput (inferences per second) might be more crucial for video processing tasks.
+
+7.
Ethical and Privacy Concerns: Especially in sensitive areas like facial recognition or personal data processing, there are concerns related to ethics and privacy. These concerns can impact the selection and nature of tasks used for benchmarking. For example, using real-world facial data for benchmarking can raise privacy issues, whereas synthetic data might not replicate real-world challenges. -- Importance of benchmarking in AI -- Objectives of benchmarking +8. Hardware Diversity: With a wide range of devices from GPUs, CPUs, TPUs, to custom ASICs used for inference, ensuring that tasks are representative across varied hardware is challenging. For example, a task optimized for inference on a GPU might perform sub-optimally on an edge device. -## Types of Benchmarks +#### Benchmarks -Explanation: Understanding the different types of benchmarks will help our readers tailor their performance evaluation activities to specific needs, whether they are evaluating low-level operations or entire application performance. +Here are some original works that laid the fundamental groundwork for developing systematic benchmarks for inference machine learning systems. -- System benchmarks - + Micro-benchmarks - + Macro-benchmarks - + Application-specific benchmarks -- Data benchmarks +*[MLPerf Inference Benchmark](https://github.com/mlcommons/inference)* -## Benchmarking Metrics +MLPerf Inference is a comprehensive suite of benchmarks that assess the performance of machine learning models during the inference phase. It encompasses a variety of workloads including image classification, object detection, and natural language processing, aiming to provide standardized and insightful metrics for evaluating different inference systems. -Explanation: Metrics are the yardsticks used to measure performance. This section is vital for understanding what aspects of an AI system's performance are being evaluated, such as accuracy, speed, or resource utilization. +Metrics: -- Accuracy -- Latency -- Throughput -- Power Consumption -- Memory Footprint -- End to end Metrics (User vs. System) +- Inference time -## Benchmarking Tools +- Latency -Explanation: Tools are the practical means to carry out benchmarking. Discussing available software and hardware tools equips readers with the resources they need to perform effective benchmarking. +- Throughput -- Software tools -- Hardware tools +- Accuracy -## Benchmarking Process +- Energy consumption -Explanation: Outlining the step-by-step process of benchmarking provides a structured approach for readers, ensuring that they can conduct benchmarks in a systematic and repeatable manner.e +*[AI Benchmark](https://ai-benchmark.com/)* -- Dataset Limitation/Sources -- Model Selection -- Test Environment Setup -- Running the Benchmarks -- Run Rules +AI Benchmark is a benchmarking tool that evaluates the performance of AI and machine learning models on mobile devices and edge computing platforms. It includes tests for image classification, object detection, and natural language processing tasks, providing a detailed analysis of the inference performance on different hardware platforms. -## Interpreting Results +Metrics: -Explanation: Benchmarking is only as valuable as the insights gained from it. This section teaches readers how to analyze the collected data, identify bottlenecks, and make meaningful comparisons. 
+- Inference time -- Analyzing the Data -- Identifying Bottlenecks -- Making Comparisons +- Latency ## Optimizing Based on Benchmarks -Explanation: The ultimate goal of benchmarking is to improve system performance. This section guides readers on how to use benchmark data for optimization, making it a critical part of the benchmarking lifecycle. +- Energy consumption -- Tweaking Parameters -- Hardware Acceleration -- Software Optimization +- Memory usage -## Challenges and Limitations +- Throughput *[OpenVINO™ toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html)* -Explanation: Every methodology has its limitations, and benchmarking is no exception. Discussing these challenges helps readers set realistic expectations and interpret results with a critical mindset. +OpenVINO™ toolkit provides a benchmark tool to measure the performance of deep learning models for a variety of tasks such as image classification, object detection, and facial recognition on Intel hardware. It offers detailed insights into the inference performance of the models on different hardware configurations. -- Variability in Results -- Benchmarking Ethics +Metrics: -## Emerging Trends in Benchmarking +- Inference time -- Data-centric AI -- DataPerf -- DataComp +- Throughput + +- Latency + +- CPU and GPU utilization + +*Example Use Case* + +Consider a scenario where we want to evaluate the inference performance of an object detection model on a specific edge device. + +Task: The task is to perform real-time object detection on video streams, detecting and identifying objects such as vehicles, pedestrians, and traffic signs. + +Benchmark: We can use the AI Benchmark for this task as it focuses on evaluating inference performance on edge devices, which is suitable for our scenario. + +Metrics: We will measure the following metrics: + +- Inference time to process each video frame + +- Latency to generate the bounding boxes for detected objects + +- Energy consumption during the inference process + +- Throughput in terms of video frames processed per second + +By measuring these metrics, we can assess the performance of the object detection model on the edge device and identify any potential bottlenecks or areas for optimization to enhance real-time processing capabilities. + +### Benchmark Example + +In order to properly illustrate the components of a systems benchmark, we can look at the keyword spotting benchmark in MLPerf Tiny and explain the motivation behind each decision. + +#### Task + +Keyword spotting was selected as a task because it is a common use case in TinyML that has been well established for years. Additionally, the typical hardware used for keyword spotting differs substantially from the offerings of other benchmarks such as MLPerf Inference's speech recognition task. + +#### Dataset + +[Google Speech Commands](https://www.tensorflow.org/datasets/catalog/speech_commands)[@warden2018speech] was selected as the best dataset to represent the task. The dataset is well established in the research community and has permissive licensing which allows it to be easily used in a benchmark. + +#### Model + +The next core component is the model which will act as the primary workload for the benchmark. The model should be well established as a solution to the selected task and not necessarily the state of the art solution. The model selected is a simple depthwise separable convolution model. 
This architecture is not the state of the art solution to the task, but it is well established and not designed for a specific hardware platform like many of the state of the art solutions. The benchmark also establishes a reference training recipe, despite being an inference benchmark, in order to be fully reproducible and transparent. + +#### Metrics + +Latency was selected as the primary metric for the benchmark, as keyword spotting systems need to react quickly to maintain user satisfaction. Additionally, given that TinyML systems are often battery powered, energy consumption is measured to ensure the hardware platform is efficient. The accuracy of the model is also measured to ensure that the optimizations applied by a submitter, such as quantization, don't degrade the accuracy beyond a threshold. + +#### Benchmark Harness + +MLPerf Tiny uses [EEMBC's EnergyRunner™ benchmark harness](https://github.com/eembc/energyrunner) to load the inputs to the model and to isolate and measure the energy consumption of the device. When measuring energy consumption, it's critical to select a harness that is accurate at the expected power levels of the devices under test, and simple enough to not become a burden for participants of the benchmark. + +#### Baseline Submission + +Baseline submissions are critical for contextualizing results and acting as a reference point to help participants get started. The baseline submission should prioritize simplicity and readability over state of the art performance. The keyword spotting baseline uses a standard [STM microcontroller](https://www.st.com/en/microcontrollers-microprocessors.html) as its hardware and [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers)[@david2021tensorflow] as its inference framework. + +### Challenges and Limitations + +While benchmarking provides a structured methodology for performance evaluation in complex domains like artificial intelligence and computing, the process also poses several challenges. If not properly addressed, these challenges can undermine the credibility and accuracy of benchmarking results. Some of the predominant difficulties faced in benchmarking include the following: + +- Incomplete problem coverage - Benchmark tasks may not fully represent the problem space. For instance, common image classification datasets like [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) have limited diversity in image types. Algorithms tuned for such benchmarks may fail to generalize well to real-world datasets. + +- Statistical insignificance - Benchmarks must have enough trials and data samples to produce statistically significant results. For example, benchmarking an OCR model on only a few text scans may not adequately capture its true error rates. A minimal latency-measurement sketch after this list shows how repeated trials are typically collected and summarized. + +- Limited reproducibility - Varying hardware, software versions, codebases and other factors can reduce reproducibility of benchmark results. MLPerf addresses this by providing reference implementations and environment specifications. + +- Misalignment with end goals - Benchmarks focusing only on speed or accuracy metrics may misalign with real-world objectives like cost and power efficiency. Benchmarks must reflect all critical performance axes. + +- Rapid staleness - Due to the fast pace of advancements in AI and computing, benchmarks and their datasets can become outdated quickly. Maintaining up-to-date benchmarks is thus a persistent challenge. 
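To make the statistical-insignificance point concrete, the following sketch shows one minimal way to collect enough timed trials to report stable latency numbers. It is not part of any official harness; the toy matrix-multiply "model", the warm-up count, and the trial count are illustrative assumptions.

```python
import statistics
import time

import numpy as np

def benchmark_latency(model, samples, warmup=10, trials=1000):
    """Run enough timed inferences to report statistically meaningful latency numbers."""
    for i in range(warmup):  # warm-up runs are discarded
        model(samples[i % len(samples)])
    latencies_ms = []
    for i in range(trials):
        start = time.perf_counter()
        model(samples[i % len(samples)])
        latencies_ms.append((time.perf_counter() - start) * 1000.0)
    return {
        "mean_ms": statistics.mean(latencies_ms),
        "p50_ms": float(np.percentile(latencies_ms, 50)),
        "p95_ms": float(np.percentile(latencies_ms, 95)),
        "p99_ms": float(np.percentile(latencies_ms, 99)),
        "stdev_ms": statistics.stdev(latencies_ms),
    }

# Stand-in "model": a single matrix multiply over random inputs.
rng = np.random.default_rng(0)
weights = rng.standard_normal((512, 512))
inputs = [rng.standard_normal((1, 512)) for _ in range(32)]
print(benchmark_latency(lambda x: x @ weights, inputs))
```

Reporting percentiles alongside the mean, as above, also guards against a handful of outlier runs dominating a headline latency figure.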
+ +But of all these, perhaps the most important challenge is dealing with benchmark engineering. + +#### Hardware Lottery + +The ["hardware lottery"](https://arxiv.org/abs/2009.06489) in benchmarking machine learning systems refers to the situation where the success or efficiency of a machine learning model is significantly influenced by the compatibility of the model with the underlying hardware[@chu2021discovering]. In other words, some models perform exceptionally well because they are a good fit for the particular characteristics or capabilities of the hardware on which they are run, rather than because they are intrinsically superior models. Unfortunately, the hardware used is often omitted from papers or given only brief mentions, making reproducing results difficult if not impossible. + +For instance, certain machine learning models may be designed and optimized to take advantage of parallel processing capabilities of specific hardware accelerators, such as Graphics Processing Units (GPUs) or Tensor Processing Units (TPUs). As a result, these models might show superior performance when benchmarked on such hardware, compared to other models that are not optimized for the hardware. + +For example, a 2018 paper introduced a new convolutional neural network architecture for image classification that achieved state-of-the-art accuracy on ImageNet. However, the paper only mentioned that the model was trained on 8 GPUs, without specifying the GPU model, memory size, or other relevant details. A follow-up study tried to reproduce the results but found that training the same model on commonly available GPUs achieved 10% lower accuracy, even after hyperparameter tuning. The original hardware likely had far higher memory bandwidth and compute power. As another example, training times for large language models can vary drastically based on the GPUs used. + +The "hardware lottery" can introduce challenges and biases in benchmarking machine learning systems, as the performance of the model is not solely dependent on the model's architecture or algorithm, but also on the compatibility and synergies with the underlying hardware. This can make it difficult to fairly compare different models and to identify the best model based on its intrinsic merits. It can also lead to a situation where the community converges on models that are a good fit for the popular hardware of the day, potentially overlooking other models that might be superior but are not compatible with the current hardware trends. + +![Hardware Lottery](./images/benchmarking/hardware_lottery.png) + +#### Benchmark Engineering + +The hardware lottery occurs when a machine learning model unintentionally performs exceptionally well or poorly on a specific hardware setup due to unforeseen compatibility or incompatibility. The model is not explicitly designed or optimized for that particular hardware by the developers or engineers; rather, it happens to align or (mis)align with the hardware's capabilities or limitations. In this case, the performance of the model on the hardware is a byproduct of coincidence rather than design. + +In contrast to the accidental hardware lottery, benchmark engineering involves deliberately optimizing or designing a machine learning model to perform exceptionally well on specific hardware, often to win benchmarks or competitions. This intentional optimization might include tweaking the model's architecture, algorithms, or parameters to take full advantage of the hardware's features and capabilities. 
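Both the accidental hardware lottery and deliberate benchmark engineering share a root cause: the numbers a benchmark produces depend heavily on the device they are collected on. The sketch below, which assumes PyTorch is available, times one and the same (arbitrary, illustrative) model on whichever devices are present; the architecture, batch size, and run counts are assumptions chosen only to make the hardware dependence visible.

```python
import time

import torch
import torch.nn as nn

# An arbitrary small convolutional model, used only to illustrate the point.
model = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(64, 10),
).eval()

def mean_latency_ms(device, runs=50):
    """Average per-batch inference latency of the same model on a given device."""
    m = model.to(device)
    x = torch.randn(8, 3, 224, 224, device=device)
    with torch.no_grad():
        for _ in range(5):  # warm-up iterations, discarded
            m(x)
        if device.type == "cuda":
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(runs):
            m(x)
        if device.type == "cuda":
            torch.cuda.synchronize()
    return (time.perf_counter() - start) * 1000.0 / runs

devices = [torch.device("cpu")]
if torch.cuda.is_available():
    devices.append(torch.device("cuda"))

for d in devices:
    print(f"{d}: {mean_latency_ms(d):.2f} ms per batch")
```

Recording and reporting exactly this kind of per-device breakdown, rather than a single headline number, is one simple way to make the hardware dependence explicit.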
+ +#### Problem + +Benchmark engineering refers to the process of tweaking or modifying an AI system to optimize its performance on specific benchmark tests, often at the expense of generalizability or real-world performance. This can include adjusting hyperparameters, training data, or other aspects of the system specifically to achieve high scores on benchmark metrics, without necessarily improving the overall functionality or utility of the system. + +The motivation behind benchmark engineering often stems from the desire to achieve high performance scores for marketing or competitive purposes. High benchmark scores can be used to demonstrate the superiority of an AI system compared to competitors, and can be a key selling point for potential users or investors. This pressure to perform well on benchmarks can sometimes lead to the prioritization of benchmark-specific optimizations over more holistic improvements to the system. + +It can lead to a number of risks and challenges. One of the primary risks is that the AI system may not perform as well in real-world applications as the benchmark scores suggest. This can lead to user dissatisfaction, reputational damage, and potential safety or ethical concerns. Furthermore, benchmark engineering can contribute to a lack of transparency and accountability in the AI community, as it can be difficult to discern how much of an AI system's performance is due to genuine improvements versus benchmark-specific optimizations. + +To mitigate the risks associated with benchmark engineering, it is important for the AI community to prioritize transparency and accountability. This can include clearly disclosing any optimizations or adjustments made specifically for benchmark tests, as well as providing more comprehensive evaluations of AI systems that include real-world performance metrics in addition to benchmark scores. Additionally, it is important for researchers and developers to prioritize holistic improvements to AI systems that improve their generalizability and functionality across a range of applications, rather than focusing solely on benchmark-specific optimizations. + +#### Issues + +One of the primary problems with benchmark engineering is that it can compromise the real-world performance of AI systems. When developers focus on optimizing their systems to achieve high scores on specific benchmark tests, they may neglect other important aspects of system performance that are crucial in real-world applications. For example, an AI system designed for image recognition might be engineered to perform exceptionally well on a benchmark test that includes a specific set of images, but struggle to accurately recognize images that are slightly different from those in the test set. + +Another issue with benchmark engineering is that it can result in AI systems that lack generalizability. In other words, while the system may perform well on the benchmark test, it may not be able to handle a diverse range of inputs or scenarios. For instance, an AI model developed for natural language processing might be engineered to achieve high scores on a benchmark test that includes a specific type of text, but fail to accurately process text that falls outside of that specific type. + +It can also lead to misleading results. When AI systems are engineered to perform well on benchmark tests, the results may not accurately reflect the true capabilities of the system. 
This can be problematic for users or investors who rely on benchmark scores to make informed decisions about which AI systems to use or invest in. For example, an AI system that has been engineered to achieve high scores on a benchmark test for speech recognition might not actually be capable of accurately recognizing speech in real-world situations, leading users or investors to make decisions based on inaccurate information. + +#### Mitigation + +There are several ways to mitigate benchmark engineering. Transparency in the benchmarking process is crucial to maintaining the accuracy and reliability of benchmarks. This involves clearly disclosing the methodologies, data sets, and evaluation criteria used in benchmark tests, as well as any optimizations or adjustments made to the AI system for the purpose of the benchmark. + +One way to achieve transparency is through the use of open-source benchmarks. Open-source benchmarks are made publicly available, allowing researchers, developers, and other stakeholders to review, critique, and contribute to the benchmark, thereby ensuring its accuracy and reliability. This collaborative approach also facilitates the sharing of best practices and the development of more robust and comprehensive benchmarks. + +Another method for achieving transparency is through peer review of benchmarks. This involves having independent experts review and validate the benchmark's methodology, data sets, and results to ensure their credibility and reliability. Peer review can provide a valuable means of verifying the accuracy of benchmark tests and can help to build confidence in the results. + +Standardization of benchmarks is another important solution to mitigate benchmark engineering. Standardized benchmarks provide a common framework for evaluating AI systems, ensuring consistency and comparability across different systems and applications. This can be achieved through the development of industry-wide standards and best practices for benchmarking, as well as through the use of common metrics and evaluation criteria. + +Third-party verification of results can also be a valuable tool in mitigating benchmark engineering. This involves having an independent third party verify the results of a benchmark test to ensure their credibility and reliability. Third-party verification can help to build confidence in the results and can provide a valuable means of validating the performance and capabilities of AI systems. + +Resource: [Benchmarking TinyML Systems: Challenges and Directions](https://arxiv.org/pdf/2003.04821.pdf)[@banbury2020benchmarking] + +![](images/benchmarking/mlperf_tiny.png) + +Figure 1: The modular design of MLPerf Tiny enables both the direct comparison of solutions and the demonstration of an improvement over the reference. The reference implementations are fully implemented solutions that allow individual components to be swapped out. The components in green can be modified in either division, and the orange components can only be modified in the open division. The reference implementations also act as the baseline for the results. + +Source: MLPerf Tiny Benchmark (https://arxiv.org/pdf/2106.07597.pdf) + +## Model Benchmarking + +Benchmarking machine learning models is important for determining the effectiveness and efficiency of various machine learning algorithms in solving specific tasks or problems. 
By analyzing the results obtained from benchmarking, developers and researchers can identify the strengths and weaknesses of their models, leading to more informed decisions on model selection and further optimization. + +The evolution and progress of machine learning models are intrinsically linked to the availability and quality of data sets. In the world of machine learning, data acts as the raw material that powers the algorithms, allowing them to learn, adapt, and ultimately perform tasks that were traditionally the domain of humans. Therefore, it is important to understand this history. + +### Historical Context + +Machine learning datasets have a rich history and have evolved significantly over the years, growing in size, complexity, and diversity to meet the ever-increasing demands of the field. Let's take a closer look at this evolution, starting from one of the earliest and most iconic datasets -- MNIST. + +#### MNIST (1998) + +The [MNIST dataset](https://www.tensorflow.org/datasets/catalog/mnist), created by Yann LeCun, Corinna Cortes, and Christopher J.C. Burges in 1998, can be considered a cornerstone in the history of machine learning datasets. It consists of 70,000 labeled 28x28 pixel grayscale images of handwritten digits (0-9). MNIST has been widely used for benchmarking algorithms in image processing and machine learning, serving as a starting point for many researchers and practitioners in the field. + +![](images/benchmarking/mnist.png) + +Source: https://en.wikipedia.org/wiki/File:MnistExamplesModified.png + +#### ImageNet (2009) + +Fast forward to 2009, and we see the introduction of the [ImageNet dataset](https://www.tensorflow.org/datasets/catalog/imagenet2012), which marked a significant leap in the scale and complexity of datasets. ImageNet consists of over 14 million labeled images spanning more than 20,000 categories. It was developed by Fei-Fei Li and her team with the goal of advancing research in object recognition and computer vision. The dataset became synonymous with the ImageNet Large Scale Visual Recognition Challenge (ILSVRC), an annual competition that played a crucial role in the development of deep learning models, including the famous AlexNet in 2012. + +![](images/benchmarking/imagenet.png) + +Source: https://cv.gluon.ai/_images/imagenet_banner.jpeg + +#### COCO (2014) + +The [Common Objects in Context (COCO) dataset](https://cocodataset.org/)[@lin2014microsoft], released in 2014, further expanded the landscape of machine learning datasets by introducing a richer set of annotations. COCO consists of images containing complex scenes with multiple objects, and each image is annotated with object bounding boxes, segmentation masks, and captions. This dataset has been instrumental in advancing research in object detection, segmentation, and image captioning. + +![](images/benchmarking/coco.png) + +Source: https://cocodataset.org/images/coco-examples.jpg + +#### GPT-3 (2020) + +While the above examples primarily focus on image datasets, there have been significant developments in text datasets as well. One notable example is GPT-3[@brown2020language], developed by OpenAI. GPT-3 is a language model trained on a diverse range of internet text. Although the dataset used to train GPT-3 is not publicly available, the model itself, consisting of 175 billion parameters, is a testament to the scale and complexity of modern machine learning datasets and models. 
+ +#### Present and Future + +Today, we have a plethora of datasets spanning various domains, including healthcare, finance, social sciences, and more. The following characteristics describe how we can taxonomize the space and growth of machine learning datasets that fuel model development. + +1. Diversity of Data Sets: The variety of data sets available to researchers and engineers has expanded dramatically over the years, covering a wide range of fields, including natural language processing, image recognition, and more. This diversity has fueled the development of specialized machine learning models tailored to specific tasks, such as translation, speech recognition, and facial recognition. + +2. Volume of Data: The sheer volume of data that has become available in the digital age has also played a crucial role in advancing machine learning models. Large data sets enable models to capture the complexity and nuances of real-world phenomena, leading to more accurate and reliable predictions. + +3. Quality and Cleanliness of Data: The quality of data is another critical factor that influences the performance of machine learning models. Clean, well-labeled, and unbiased data sets are essential for training models that are robust and fair. + +4. Open Access to Data: The availability of open-access data sets has also contributed significantly to the progress in machine learning. Open data allows researchers from around the world to collaborate, share insights, and build upon each other's work, leading to faster innovation and development of more advanced models. + +5. Ethics and Privacy Concerns: As data sets continue to grow in size and complexity, ethical considerations and privacy concerns become increasingly important. There is an ongoing debate about the balance between leveraging data for machine learning advancements and protecting individuals' privacy rights. + +The development of machine learning models is heavily reliant on the availability of diverse, large, high-quality, and open-access data sets. As we move forward, addressing the ethical considerations and privacy concerns associated with the use of large data sets is crucial to ensure that machine learning technologies benefit society as a whole. There is a growing awareness that data acts as the rocket fuel for machine learning, driving the development of machine learning models. Consequently, an increasing amount of focus is being placed on the development of the data sets themselves. We will explore this in further detail in the data benchmarking section. + +### Model Metrics + +The evolution of machine learning model evaluation has witnessed a transition from a narrow focus on accuracy to a more comprehensive approach that takes into account a range of factors, from ethical considerations and real-world applicability to practical constraints like model size and efficiency. This shift reflects the maturation of the field as machine learning models are increasingly applied in diverse and complex real-world scenarios. + +#### Accuracy + +Accuracy is one of the most intuitive and commonly used metrics for evaluating machine learning models. At its core, accuracy measures the proportion of correct predictions made by the model out of all predictions. As an example, imagine we have developed a machine learning model to classify images as either containing a cat or not. If we test this model on a dataset of 100 images, and it correctly identifies 90 of them, we would calculate its accuracy as 90%. 
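Accuracy itself is trivial to compute, but as the following paragraphs discuss, it can be deceptive on its own. The sketch below assumes scikit-learn and uses entirely made-up numbers: a screening task with a 5% positive rate and a useless "model" that always predicts negative. It shows how a seemingly strong accuracy can coexist with zero precision and recall.

```python
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Hypothetical screening task: 1,000 cases, only about 5% are actually positive.
rng = np.random.default_rng(42)
y_true = (rng.random(1000) < 0.05).astype(int)

# A useless "model" that predicts the majority (negative) class every time.
y_pred = np.zeros_like(y_true)

print(f"accuracy:  {accuracy_score(y_true, y_pred):.2f}")                    # roughly 0.95
print(f"precision: {precision_score(y_true, y_pred, zero_division=0):.2f}")  # 0.00
print(f"recall:    {recall_score(y_true, y_pred, zero_division=0):.2f}")     # 0.00
```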
+ +In the initial stages of machine learning, accuracy was often the primary, if not the only, metric considered when evaluating model performance. This is perhaps understandable, given its straightforward nature and ease of interpretation. However, as the field has progressed, the limitations of relying solely on accuracy have become more apparent. + +Consider the example of a medical diagnosis model that has an accuracy of 95%. While at first glance this may seem impressive, we must delve deeper to fully assess the model's performance. If the model fails to accurately diagnose severe conditions that, while rare, can have severe consequences, its high accuracy may not be as meaningful. A pertinent example of this is [Google's retinopathy machine learning model](https://about.google/intl/ALL_us/stories/seeingpotential/), which was designed to diagnose diabetic retinopathy and diabetic macular edema from retinal photographs. + +The Google model demonstrated impressive accuracy levels in lab settings, but when deployed in real-world clinical environments in Thailand, [it faced significant challenges](https://www.technologyreview.com/2020/04/27/1000658/google-medical-ai-accurate-lab-real-life-clinic-covid-diabetes-retina-disease/). In the real-world setting, the model encountered diverse patient populations, varying image quality, and a range of different medical conditions that it had not been exposed to during its training. Consequently, its performance was compromised, and it struggled to maintain the same levels of accuracy that had been observed in lab settings. This example serves as a clear reminder that while high accuracy is an important and desirable attribute for a medical diagnosis model, it must be evaluated in conjunction with other factors, such as the model's ability to generalize to different populations and handle diverse and unpredictable real-world conditions, to truly understand its value and potential impact on patient care. + +Similarly, if the model performs well on average but exhibits significant disparities in performance across different demographic groups, this too would be a cause for concern. + +The evolution of machine learning has thus seen a shift towards a more holistic approach to model evaluation, taking into account not just accuracy, but also other crucial factors such as fairness, transparency, and real-world applicability. A prime example of this is the [Gender Shades project](http://gendershades.org/) at MIT Media Lab, led by Joy Buolamwini, which highlighted significant racial and gender biases in commercial facial recognition systems. The project evaluated the performance of three facial recognition technologies developed by IBM, Microsoft, and Face++ and found that they all exhibited biases, performing better on lighter-skinned and male faces compared to darker-skinned and female faces. + +While accuracy remains a fundamental and valuable metric for evaluating machine learning models, it is clear that a more comprehensive approach is required to fully assess a model's performance. This means considering additional metrics that account for fairness, transparency, and real-world applicability, as well as conducting rigorous testing across diverse datasets to uncover and mitigate any potential biases. The move towards a more holistic approach to model evaluation reflects the maturation of the field and its increasing recognition of the real-world implications and ethical considerations associated with deploying machine learning models. 
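The kind of disaggregated evaluation performed by audits such as Gender Shades can be expressed in a few lines. The sketch below uses entirely synthetic numbers (the group labels and counts are invented for illustration) to show how a model can look respectable on aggregate accuracy while underperforming badly on one subgroup.

```python
import pandas as pd

# Entirely synthetic evaluation results: one row per test example, with the
# model's correctness and a demographic group label.
results = pd.DataFrame({
    "group":   ["A"] * 500 + ["B"] * 100,
    "correct": [1] * 470 + [0] * 30 + [1] * 70 + [0] * 30,
})

print(f"overall accuracy: {results['correct'].mean():.1%}")  # 90.0% on aggregate
print(results.groupby("group")["correct"].mean())            # group A: 0.94, group B: 0.70
```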
+ +#### Fairness + +Fairness in machine learning models is a multifaceted and critical aspect that requires careful attention, particularly in high-stakes applications that significantly affect people's lives, such as in loan approval processes, hiring, and criminal justice. It refers to the equitable treatment of all individuals, irrespective of their demographic or social attributes such as race, gender, age, or socioeconomic status. + +When evaluating models, simply relying on accuracy can be insufficient and potentially misleading. For instance, consider a loan approval model that boasts a 95% accuracy rate. While this figure may appear impressive at first glance, it does not reveal how the model performs across different demographic groups. If this model consistently discriminates against a particular group, its accuracy is less commendable, and its fairness comes into question. + +Discrimination can manifest in various forms, such as direct discrimination, where a model explicitly uses sensitive attributes like race or gender in its decision-making process, or indirect discrimination, where seemingly neutral variables correlate with sensitive attributes, indirectly influencing the model's outcomes. An infamous example of the latter is the COMPAS tool used in the US criminal justice system, which exhibited racial biases in predicting recidivism rates, despite not explicitly using race as a variable. + +Addressing fairness involves careful examination of the model's performance across diverse groups, identification of potential biases, and rectification of disparities through corrective measures such as re-balancing datasets, adjusting model parameters, and implementing fairness-aware algorithms. Researchers and practitioners are continuously developing metrics and methodologies tailored to specific use cases to evaluate fairness in real-world scenarios. For example, disparate impact analysis, demographic parity, and equal opportunity are some of the metrics employed to assess fairness. + +Additionally, transparency and interpretability of models are fundamental to achieving fairness. Understanding how a model makes decisions can reveal potential biases and enable stakeholders to hold developers accountable. Open-source tools like [AI Fairness 360](https://ai-fairness-360.org/) by IBM and [Fairness Indicators](https://www.tensorflow.org/tfx/guide/fairness_indicators) by TensorFlow are being developed to facilitate fairness assessments and mitigation of biases in machine learning models. + +Ensuring fairness in machine learning models is essential, particularly in applications that significantly impact people's lives. It requires rigorous evaluation of the model's performance across diverse groups, careful identification and mitigation of biases, and implementation of transparency and interpretability measures. By addressing fairness in a comprehensive manner, we can work towards developing machine learning models that are equitable, just, and beneficial for society as a whole. + +#### Complexity + +##### Parameters + +In the initial stages of machine learning, model benchmarking often relied on parameter counts as a proxy for model complexity. The rationale was that more parameters typically lead to a more complex model, which should, in turn, deliver better performance. However, this approach has proven to be inadequate as it doesn't account for the computational cost associated with processing a large number of parameters. 
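Part of why parameter count became the default proxy is that it is mechanically easy to compute. The sketch below shows the usual PyTorch idiom; the two toy architectures are arbitrary examples, and as the following paragraphs argue, the raw count alone says little about FLOPs, memory traffic, or latency.

```python
import torch.nn as nn

def count_parameters(model: nn.Module) -> int:
    """Total number of trainable parameters in a model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Two arbitrary classifiers over 1,000-dimensional inputs: one wide, one narrow.
wide = nn.Sequential(nn.Linear(1000, 4096), nn.ReLU(), nn.Linear(4096, 10))
narrow = nn.Sequential(nn.Linear(1000, 128), nn.ReLU(), nn.Linear(128, 10))

print(f"wide:   {count_parameters(wide):,} parameters")    # ~4.1 million
print(f"narrow: {count_parameters(narrow):,} parameters")  # ~0.13 million
```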
+ +For example, GPT-3, developed by OpenAI, is a language model that boasts an astounding 175 billion parameters. While it achieves state-of-the-art performance on a variety of natural language processing tasks, its size and the computational resources required to run it make it impractical for deployment in many real-world scenarios, especially those with limited computational capabilities. + +The reliance on parameter counts as a proxy for model complexity also fails to consider the efficiency of the model. A model with fewer parameters might be just as effective, if not more so, than a model with a higher parameter count if it is optimized for efficiency. For instance, MobileNets, developed by Google, are a family of models designed specifically for mobile and edge devices. They utilize depth-wise separable convolutions to reduce the number of parameters and computational cost, while still achieving competitive performance. + +In light of these limitations, the field has moved towards a more holistic approach to model benchmarking that considers not just parameter counts, but also other crucial factors such as floating-point operations per second (FLOPs), memory consumption, and latency. FLOPs, in particular, have emerged as an important metric as they provide a more accurate representation of the computational load a model imposes. This shift towards a more comprehensive approach to model benchmarking reflects a recognition of the need to balance performance with practicality, ensuring that models are not just effective, but also efficient and deployable in real-world scenarios. + +##### FLOPS + +The size of a machine learning model is an essential aspect that directly impacts its usability in practical scenarios, especially when computational resources are limited. Traditionally, the number of parameters in a model was often used as a proxy for its size, with the underlying assumption being that more parameters would translate to better performance. However, this simplistic view does not consider the computational cost associated with processing these parameters. This is where the concept of floating-point operations per second (FLOPs) comes into play, providing a more accurate representation of the computational load a model imposes. + +FLOPs measure the number of floating-point operations a model performs to generate a prediction. For example, a model with a high number of FLOPs requires substantial computational resources to process the vast number of operations, which may render it impractical for certain applications. Conversely, a model with a lower FLOP count is more lightweight and can be easily deployed in scenarios where computational resources are limited. + +Let's consider an example. BERT (Bidirectional Encoder Representations from Transformers)[@devlin2018bert], a popular natural language processing model, has over 340 million parameters, making it a large model with high accuracy and impressive performance across a range of tasks. However, the sheer size of BERT, coupled with its high FLOP count, makes it a computationally intensive model that may not be suitable for real-time applications or deployment on edge devices with limited computational capabilities. + +In light of this, there has been a growing interest in developing smaller models that can achieve similar performance levels as their larger counterparts while being more efficient in terms of computational load. 
DistilBERT, for instance, is a smaller version of BERT that retains 97% of its performance while being 40% smaller in terms of parameter count. The reduction in size also translates to a lower FLOP count, making DistilBERT a more practical choice for resource-constrained scenarios. + +To sum up, while parameter count provides a useful indication of model size, it is not a comprehensive metric as it does not consider the computational cost associated with processing these parameters. FLOPs, on the other hand, offer a more accurate representation of a model's computational load and are thus an essential consideration when deploying machine learning models in real-world scenarios, particularly when computational resources are limited. The evolution from relying solely on parameter count to also considering FLOPs signifies a maturation in the field, reflecting a greater awareness of the practical constraints and challenges associated with deploying machine learning models in diverse settings. + +##### Efficiency + +Efficiency metrics, such as memory consumption and latency/throughput, have also gained prominence. These metrics are particularly crucial when deploying models on edge devices or in real-time applications, as they measure how quickly a model can process data and how much memory it requires. In this context, Pareto curves are often used to visualize the trade-off between different metrics, helping stakeholders make informed decisions about which model is best suited to their needs. + +### Lessons Learned + +Model benchmarking has offered us several valuable insights that can be leveraged to drive innovation in system benchmarks. The progression of machine learning models has been profoundly influenced by the advent of leaderboards and the open-source availability of models and datasets. These elements have served as significant catalysts, propelling innovation and accelerating the integration of cutting-edge models into production environments. However, these are not the only contributors to the development of machine learning benchmarks, as we will explore further. + +Leaderboards play a vital role in providing an objective and transparent method for researchers and practitioners to evaluate the efficacy of different models, ranking them based on their performance in benchmarks. This system fosters a competitive environment, encouraging the development of models that are not only accurate but also efficient. The ImageNet Large Scale Visual Recognition Challenge (ILSVRC) is a prime example of this, with its annual leaderboard significantly contributing to the development of groundbreaking models such as AlexNet. + +Open-source access to state-of-the-art models and datasets further democratizes the field of machine learning, facilitating collaboration among researchers and practitioners worldwide. This open access accelerates the process of testing, validation, and deployment of new models in production environments, as evidenced by the widespread adoption of models like BERT and GPT-3 in various applications, from natural language processing to more complex, multi-modal tasks. + +Community collaboration platforms like Kaggle have revolutionized the field by hosting competitions that unite data scientists from across the globe to solve intricate problems, with specific benchmarks serving as the goalposts for innovation and model development. + +Moreover, the availability of diverse and high-quality datasets is paramount in training and testing machine learning models. 
Datasets such as ImageNet have played an instrumental role in the evolution of image recognition models, while extensive text datasets have facilitated advancements in natural language processing models. + +Lastly, the contributions of academic and research institutions cannot be overstated. Their role in publishing research papers, sharing findings at conferences, and fostering collaboration between various institutions has significantly contributed to the advancement of machine learning models and benchmarks. + +#### Emerging Trends + +As machine learning models become more sophisticated, so do the benchmarks required to accurately assess them. There are several emerging benchmarks and datasets that are gaining popularity due to their ability to evaluate models in more complex and realistic scenarios: + +**Multimodal Datasets:** These datasets contain multiple types of data, such as text, images, and audio, to better represent real-world situations. An example is the VQA (Visual Question Answering) dataset[@antol2015vqa], where models are tested on their ability to answer text-based questions about images. + +**Fairness and Bias Evaluation:** There is an increasing focus on creating benchmarks that assess the fairness and bias of machine learning models. Examples include the [AI Fairness 360](https://ai-fairness-360.org/) toolkit, which offers a comprehensive set of metrics and datasets for evaluating bias in models. + +**Out-of-Distribution Generalization**: Testing how well models perform on data that is different from the original training distribution. This evaluates the model's ability to generalize to new, unseen data. Example benchmarks are Wilds[@koh2021wilds], RxRx, and ANC-Bench. + +**Adversarial Robustness:** Evaluating model performance under adversarial attacks or perturbations to the input data. This tests the model's robustness. Example benchmarks are ImageNet-A[@hendrycks2021natural], ImageNet-C[@xie2020adversarial], and CIFAR-10.1. + +**Real-World Performance:** Testing models on real-world datasets that closely match end tasks, rather than just canned benchmark datasets. Examples are medical imaging datasets for healthcare tasks or actual customer support chat logs for dialogue systems. + +**Energy and Compute Efficiency:** Benchmarks that measure the computational resources required to achieve a particular accuracy. This evaluates the model's efficiency. Examples are MLPerf and Greenbench, and these were already discussed in the Systems benchmarking section. + +**Interpretability and Explainability:** Benchmarks that assess how easy it is to understand and explain a model's internal logic and predictions. Example metrics are faithfulness to input gradients and coherence of explanations. + +### Limitations and Challenges + +While model benchmarks are an essential tool in the assessment of machine learning models, there are several limitations and challenges that should be addressed to ensure that they accurately reflect a model's performance in real-world scenarios. + +**Dataset does not Correspond to Real-World Scenarios:** Often, the data used in model benchmarks is cleaned and preprocessed to such an extent that it may not accurately represent the data that a model would encounter in real-world applications. This idealized version of the data can lead to overestimations of a model's performance. 
In the case of the ImageNet dataset, the images are well-labeled and categorized, but in a real-world scenario, a model may need to deal with images that are blurry, poorly lit, or taken from awkward angles. This discrepancy can significantly affect the model's performance. + +**Sim2Real Gap:** The Sim2Real gap refers to the difference in performance of a model when transitioning from a simulated environment to a real-world environment. This gap is often observed in robotics, where a robot trained in a simulated environment struggles to perform tasks in the real world due to the complexity and unpredictability of real-world environments. A robot trained to pick up objects in a simulated environment may struggle to perform the same task in the real world because the simulated environment does not accurately represent the complexities of real-world physics, lighting, and object variability. + +**Challenges in Creating Datasets:** Creating a dataset for model benchmarking is a challenging task that requires careful consideration of various factors such as data quality, diversity, and representation. As discussed in the data engineering section, ensuring that the data is clean, unbiased, and representative of the real-world scenario is crucial for the accuracy and reliability of the benchmark. For example, when creating a dataset for a healthcare-related task, it is important to ensure that the data is representative of the entire population and not biased towards a particular demographic. This ensures that the model performs well across diverse patient populations. + +Model benchmarks are essential in measuring the capability of a model architecture in solving a fixed task, but it is important to address the limitations and challenges associated with them. This includes ensuring that the dataset accurately represents real-world scenarios, addressing the Sim2Real gap, and overcoming the challenges associated with creating unbiased and representative datasets. By addressing these challenges, and many others, we can ensure that model benchmarks provide a more accurate and reliable assessment of a model's performance in real-world applications. + +The [Speech Commands dataset](https://arxiv.org/pdf/1804.03209.pdf), and its successor [MSWC](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/file/fe131d7f5a6b38b23cc967316c13dae2-Paper-round2.pdf), are common benchmarks for one of the quintessential TinyML applications, keyword spotting. Speech Commands establish streaming error metrics beyond the standard top-1 classification accuracy that are more relevant to the keyword spotting use case. Use case relevant metrics are what elevates a dataset to a model benchmark. + +## Data Benchmarking + +For the past several years, the field of AI has been focused on developing increasingly sophisticated machine learning models like large language models. The goal has been to create models capable of human-level or superhuman performance on a wide range of tasks by training them on massive datasets. This model-centric approach produced rapid progress, with models attaining state-of-the-art results on many established benchmarks. + +However, there are growing concerns about issues like bias, safety, and robustness that persist even in models that achieve high accuracy on standard benchmarks. Additionally, some popular datasets used for evaluating models are beginning to saturate, with models reaching near perfect performance on existing test splits [@kiela2021dynabench]. 
As a simple example, there are test images in the classic MNIST handwritten digit dataset which may look indecipherable to most human evaluators, but nonetheless were assigned a label when the dataset was created - models which happen to agree with those labels may appear to exhibit superhuman performance but instead may only be capturing idiosyncrasies of the labeling and acquisition process from the dataset's creation in 1994. In the same spirit, computer vision researchers now ask "Are we done with ImageNet?" [@beyer2020we]. This highlights limitations in the conventional model-centric approach of optimizing accuracy on fixed datasets through architectural innovations. + +An alternative paradigm is emerging called data-centric AI. Rather than treating data as static and focusing narrowly on model performance, this approach recognizes that models are only as good as their training data. So the emphasis shifts to curating high-quality datasets that better reflect real-world complexity, developing more informative evaluation benchmarks, and carefully considering how data is sampled, preprocessed, and augmented. The goal is to optimize model behavior by improving the data, rather than just optimizing metrics on flawed datasets. Data-centric AI critically examines and enhances the data itself to produce beneficial AI. This reflects an important evolution in mindset as the field addresses the shortcomings of narrow benchmarking. + +In this section, we will explore the key differences between model-centric and data-centric approaches to AI. This distinction has important implications for how we benchmark AI systems. Specifically, we will see how a focus on data quality and efficiency can directly improve machine learning performance, as an alternative to solely optimizing model architectures. The data-centric approach recognizes that models are only as good as their training data. So enhancing data curation, evaluation benchmarks, and data handling processes can produce AI systems that are safer, fairer, and more robust. Rethinking benchmarking to prioritize data alongside models represents an important evolution as the field aims to deliver trustworthy real-world impact. + +![](images/benchmarking/dynabench.png) + +### Limitations of Model-Centric AI + +In the model-centric AI era, a prominent characteristic was the development of complex model architectures. Researchers and practitioners dedicated substantial effort to devise sophisticated and intricate models in the quest for superior performance. This frequently involved the incorporation of additional layers and the fine-tuning of a multitude of hyperparameters to achieve incremental improvements in accuracy. Concurrently, there was a significant emphasis on leveraging advanced algorithms. These algorithms, often at the forefront of the latest research, were employed to enhance the performance of AI models. The primary aim of these algorithms was to optimize the learning process of models, thereby extracting maximal information from the training data. + +While the model-centric approach has been central to many advancements in AI, it has several shortcomings. First, the development of complex model architectures can often lead to overfitting. This is where the model performs well on the training data but fails to generalize to new, unseen data. The additional layers and complexity can capture noise in the training data as if it were a real pattern, which harms the model's performance on new data. 
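The training/test gap described above is easy to demonstrate on synthetic data. As a minimal sketch (assuming scikit-learn; the dataset parameters and tree depths are arbitrary choices), the snippet below fits a constrained and an unconstrained decision tree to deliberately noisy data; the unconstrained tree memorizes the training set almost perfectly, while its test score tells a different story.

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Synthetic, deliberately noisy data -- purely illustrative.
X, y = make_classification(n_samples=2000, n_features=20, n_informative=5,
                           flip_y=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

for depth in (3, None):  # a constrained model versus an unconstrained one
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0).fit(X_train, y_train)
    print(f"max_depth={depth}: train={tree.score(X_train, y_train):.2f}, "
          f"test={tree.score(X_test, y_test):.2f}")
```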
+ +Second, the reliance on advanced algorithms can sometimes obscure the real understanding of a model's functioning. These algorithms often act as a black box, making it difficult to interpret how the model is making decisions. This lack of transparency can be a significant hurdle, especially in critical applications such as healthcare and finance, where understanding the model's decision-making process is crucial. + +Third, the emphasis on achieving state-of-the-art results on benchmark datasets can sometimes be misleading. These datasets are often not fully representative of the complexities and variability found in real-world data. A model that performs well on a benchmark dataset may not necessarily generalize well to new, unseen data in a real-world application. This discrepancy can lead to a false sense of confidence in the model's capabilities and hinder its practical applicability. + +Lastly, the model-centric approach often relies on large labeled datasets for training. However, in many real-world scenarios, obtaining such datasets is difficult and costly. This reliance on large datasets also limits the applicability of AI in domains where data is scarce or expensive to label. + +As a result of the above reasons, and many more, the AI community is shifting to a more data-centric approach. Rather than focusing just on model architecture, researchers are now prioritizing curating high-quality datasets, developing better evaluation benchmarks, and considering how data is sampled and preprocessed. The key idea is that models are only as good as their training data. So focusing on getting the right data will allow us to develop AI systems that are more fair, safe, and aligned with human values. This data-centric shift represents an important change in mindset as AI progresses. + +### The Shift Toward Data-centric AI + +Data-centric AI is a paradigm that emphasizes the importance of high-quality, well-labeled, and diverse datasets in the development of AI models. In contrast to the model-centric approach, which focuses on refining and iterating on the model architecture and algorithm to improve performance, data-centric AI prioritizes the quality of the input data as the primary driver of improved model performance. High-quality data is [clean, well-labeled](https://landing.ai/blog/tips-for-a-data-centric-ai-approach/), and representative of the real-world scenarios the model will encounter. In contrast, low-quality data can lead to poor model performance, regardless of the complexity or sophistication of the model architecture. + +Data-centric AI puts a strong emphasis on the cleaning and labeling of data. Cleaning involves the removal of outliers, handling missing values, and addressing other data inconsistencies. Labeling, on the other hand, involves assigning meaningful and accurate labels to the data. Both these processes are crucial in ensuring that the AI model is trained on accurate and relevant data. Another important aspect of the data-centric approach is data augmentation. This involves artificially increasing the size and diversity of the dataset by applying various transformations to the data, such as rotation, scaling, and flipping training images. Data augmentation helps in improving the model's robustness and generalization capabilities. + +There are several benefits to adopting a data-centric approach to AI development. First and foremost, it leads to improved model performance and generalization capabilities. 
By ensuring that the model is trained on high-quality, diverse data, the model is better able to generalize to new, unseen data [@gaviria2022dollar]. + +Additionally, a data-centric approach can often lead to simpler models that are easier to interpret and maintain. This is because the emphasis is on the data, rather than the model architecture, meaning that simpler models can achieve high performance when trained on high-quality data. + +The shift towards data-centric AI represents a significant paradigm shift. By prioritizing the quality of the input data, this approach aims to improve model performance and generalization capabilities, ultimately leading to more robust and reliable AI systems. As we continue to advance in our understanding and application of AI, the data-centric approach is likely to play a pivotal role in shaping the future of this field. + +### Benchmarking Data + +Data benchmarking aims to evaluate common issues in datasets, such as identifying label errors, noisy features, representation imbalance (for example, out of the 1000 classes in Imagenet-1K, there are over 100 categories which are just types of dogs), class imbalance (where some classes have many more samples than others), whether models trained on a given dataset can generalize to out-of-distribution features, or what types of biases might exist in a given dataset[@gaviria2022dollar]. In its simplest form, data benchmarking aims to improve accuracy on a test set by removing noisy or mislabeled training samples while keeping the model architecture fixed. Recent competitions in data benchmarking have invited participants to submit novel augmentation strategies and active learning techniques. + +Data-centric techniques continue to gain attention in benchmarking, especially as foundation models are increasingly trained on self-supervised objectives. Compared to smaller datasets like Imagenet-1K, massive datasets commonly used in self-supervised learning such as Common Crawl, OpenImages, and LAION-5B contain an order of magnitude higher amounts of noise, duplicates, bias, and potentially offensive data. + +[DataComp](https://www.datacomp.ai/) is a recently-launched dataset competition which targets evaluation of large corpora. DataComp focuses on language-image pairs used to train CLIP models. The introductory whitepaper finds that, when the total compute budget for training is held constant, the best-performing CLIP models on downstream tasks such as ImageNet classification are trained on just 30% of the available training sample pool. This suggests that proper filtering of large corpora is critical to improving the accuracy of foundation models. Similarly, Demystifying CLIP Data [@xu2023demystifying] asks whether the success of CLIP is attributable to the architecture or the dataset. + +[DataPerf](https://www.dataperf.org/) is another recent effort which focuses on benchmarking data in a wide range of modalities. DataPerf provides rounds of online competition to spur improvement in datasets. The inaugural offering launched with challenges in vision, speech, acquisition, debugging, and text prompting for image generation. + +### Data Efficiency + +As machine learning models grow larger and more complex and compute resources more scarce in the face of rising demand, it becomes challenging to meet the requirements for computation even with the largest machine learning fleets. 
To overcome these challenges and ensure machine learning system scalability, it is necessary to explore novel opportunities that augment conventional approaches to resource scaling. + +Improving data quality can be a useful method to significantly impact machine learning system performance. One of the primary benefits of enhancing data quality is the potential to reduce the size of the training dataset while still maintaining, or even improving, model performance. This reduction in data size has a direct relationship to the amount of training time required, thereby allowing models to converge more quickly and efficiently. But achieving this balance between data quality and dataset size is a challenging task that requires the development of sophisticated methods, algorithms, and techniques. + +There are several approaches that can be taken to improve data quality. These methods include and are not limited to the following: + +- Data Cleaning: This involves handling missing values, correcting errors, and removing outliers. Clean data ensures that the model is not learning from noise or inaccuracies. + +- Data Interpretability and Explainability: Common techniques include LIME [@ribeiro2016should] which provides insight into the decision boundaries of classifiers, and Shapley values [@lundberg2017unified] which estimate the importance of individual samples in contributing to a model's predictions. + +- Feature Engineering: Transforming or creating new features can significantly improve model performance by providing more relevant information for learning. + +- Data Augmentation: Augmenting data by creating new samples through various transformations can help improve model robustness and generalization. + +- Active Learning: This is a semi-supervised learning approach where the model actively queries a human oracle to label the most informative samples [@coleman2022similarity]. This ensures that the model is trained on the most relevant data. + +- Dimensionality Reduction: Techniques like PCA can be used to reduce the number of features in a dataset, thereby reducing complexity and training time. + +There are many other methods in the wild. But the goal is the same. By refining the dataset and ensuring it is of the highest quality, we can directly reduce the training time required for models to converge. However, achieving this requires the development and implementation of sophisticated methods, algorithms, and techniques that can clean, preprocess, and augment data while retaining the most informative samples. This is an ongoing challenge that will require continued research and innovation in the field of machine learning. + +## The Trifecta + +While system, model, and data benchmarks have traditionally been studied in isolation, there is a growing recognition that to fully understand and advance AI we must take a more holistic view. By iterating between benchmarking systems, models, and datasets together, novel insights may emerge that are not apparent when these components are analyzed separately. System performance impacts model accuracy, model capabilities drive data needs, and data characteristics shape system requirements. + +Benchmarking the triad of system, model, and data in an integrated fashion will likely lead to new discoveries about the co-design of AI systems, the generalization properties of models, and the role of data curation and quality in enabling performance. 
Rather than narrow benchmarks of individual components, the future of AI requires benchmarks that evaluate the symbiotic relationship between computing platforms, algorithms, and training data. This systems-level perspective will be critical to overcoming current limitations and unlocking the next level of AI capabilities. + +![](images/benchmarking/trifecta.png) + +The figure illustrates the many potential ways to interweave data benchmarking, model benchmarking, and system infrastructure benchmarking. By exploring these intricate interactions, we are likely to uncover new optimization opportunities and capabilities for enhancement. The triad of data, model, and system benchmarks offers a rich space for co-design and co-optimization. + +While this integrated perspective represents an emerging trend, the field has much more to discover about the synergies and trade-offs between these different components. As we iteratively benchmark combinations of data, models, and systems, entirely new insights will emerge that remain hidden when these elements are studied in isolation. This multi-faceted benchmarking approach, charting the intersections of data, algorithms, and hardware, promises to be a fruitful avenue for major progress in AI, even though it is still in its early stages. ## Conclusion -Explanation: Summarizing the key takeaways and looking at future trends provides closure to the chapter and gives readers a sense of the evolving landscape of AI benchmarking. +What gets measured gets improved. This chapter has explored the multifaceted nature of benchmarking, spanning systems, models, and data. Benchmarking is important to advancing AI because it provides the essential measurements needed to track progress. + +ML system benchmarks enable optimization across metrics like speed, efficiency, and scalability. Model benchmarks drive innovation through standardized tasks and metrics beyond just accuracy. And data benchmarks highlight issues of quality, balance, and representation. + +Importantly, evaluating these components in isolation has limitations. The future will likely see more integrated benchmarking that explores the interplay between system benchmarks, model benchmarks, and data benchmarks. This view promises new insights into the co-design of data, algorithms, and infrastructure. + +As AI grows more complex, comprehensive benchmarking becomes even more critical. Standards must continuously evolve to measure new capabilities and reveal limitations. Close collaboration among industry, academia, and national labs is essential to develop benchmarks that are rigorous, transparent, and socially beneficial. + +Benchmarking provides the compass to guide progress in AI. By persistently measuring and openly sharing results, we can navigate towards systems that are performant, robust, and trustworthy. If AI is to properly serve societal and human needs, it must be benchmarked with humanity's best interests in mind. To this end, there are emerging areas, such as benchmarking the safety of AI systems, that we leave for another day and will revisit when we discuss Generative AI. -- Summary -- Future Trends in AI Benchmarking \ No newline at end of file +Benchmarking is a continuously evolving topic.
The article [The Olympics of AI: Benchmarking Machine Learning Systems](https://towardsdatascience.com/the-olympics-of-ai-benchmarking-machine-learning-systems-c4b2051fbd2b) covers several emerging subfields in AI benchmarking, including robotics, extended reality, and neuromorphic computing that we encourage the reader to pursue. \ No newline at end of file diff --git a/case_studies.qmd b/case_studies.qmd index 7f5c276b..9523d102 100644 --- a/case_studies.qmd +++ b/case_studies.qmd @@ -1,6 +1,6 @@ # Case Studies -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * coming soon. diff --git a/community.qmd b/community.qmd index a326911a..a8228fb5 100644 --- a/community.qmd +++ b/community.qmd @@ -17,11 +17,14 @@ Welcome to our dedicated hub for TinyML enthusiasts. Whether you are a seasoned 1. **TinyML Foundation** Website: [TinyML Foundation](https://tinyml.org/) Description: The official website offers a wealth of information including research, news, and events. - + 2. **Edge Impulse Blog** Website: [Blog](https://www.edgeimpulse.com/blog) Description: Contains several articles, tutorials, and resources on TinyML. - + +3. **Tiny Machine Learning Open Education Initiative (TinyMLedu)** + Website: [TinyML Open Education Initiative](https://tinymledu.org/) + Description: The website offers links to educational materials on TinyML, training events and research papers. ## Social Media Groups 1. **LinkedIn Groups** diff --git a/contributors.qmd b/contributors.qmd index 71c8c2a4..63ddc35c 100644 --- a/contributors.qmd +++ b/contributors.qmd @@ -8,19 +8,26 @@ We extend our sincere thanks to the diverse group of individuals who have genero + - - + + + + - - + + + + + + + + - - - +
Marcelo Rovai
Marcelo Rovai

📖
sjohri20
sjohri20

📖
Ikechukwu Uchendu
Ikechukwu Uchendu

📖
naeemkh
naeemkh

📖
oishib
oishib

📖
Henry Bae
Henry Bae

📖
Marco Zennaro
Marco Zennaro

📖
Divya
Divya

📖
ishapira
ishapira

📖
Matthew Stewart
Matthew Stewart

📖
Marcelo Rovai
Marcelo Rovai

📖
Jessica Quaye
Jessica Quaye

📖
Colby Banbury
Colby Banbury

📖
Shvetank Prakash
Shvetank Prakash

📖
Mark Mazumder
Mark Mazumder

📖
sophiacho1
sophiacho1

📖
Ikechukwu Uchendu
Ikechukwu Uchendu

📖
naeemkh
naeemkh

📖
Vijay Janapa Reddi
Vijay Janapa Reddi

📖
Marco Zennaro
Marco Zennaro

📖
oishib
oishib

📖
Jessica Quaye
Jessica Quaye

📖
Matthew Stewart
Matthew Stewart

📖
diff --git a/data_engineering.qmd b/data_engineering.qmd index f252306e..47d987fe 100644 --- a/data_engineering.qmd +++ b/data_engineering.qmd @@ -1,8 +1,10 @@ # Data Engineering +![_DALL·E 3 Prompt: Illustration in a rectangular format with a cool blue color palette visualizing the Data Engineering process. Starting on the left with icons of raw data sources, they connect to a central hub symbolized by swirling gears and pipelines in shades of blue. This represents the transformation, cleaning, and storage processes. On the right, datasets in refined formats are symbolized by sleek database icons and a machine learning model. Flow lines in varying blue tones connect each element, emphasizing the transition and importance of each data engineering stage._](./images/cover_data_engineering.png) + Data is the lifeblood of AI systems. Without good data, even the most advanced machine learning algorithms will fail. In this section, we will dive into the intricacies of building high-quality datasets to fuel our AI models. Data engineering encompasses the processes of collecting, storing, processing, and managing data for training machine learning models. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * Understand the importance of clearly defining the problem statement and objectives when embarking on a ML project. diff --git a/dl_primer.qmd b/dl_primer.qmd index 3ee20490..400ff778 100644 --- a/dl_primer.qmd +++ b/dl_primer.qmd @@ -1,8 +1,10 @@ # Deep Learning Primer +![_DALL·E 3 Prompt: Photo of a classic classroom with a large blackboard dominating one wall. Chalk drawings showcase a detailed deep neural network with several hidden layers, and each node and connection is precisely labeled with white chalk. The rustic wooden floor and brick walls provide a contrast to the modern concepts. Surrounding the room, posters mounted on frames emphasize deep learning themes: convolutional networks, transformers, neurons, activation functions, and more._](./images/cover_dl_primer.png) + This section offers a brief introduction to deep learning, starting with an overview of its history, applications, and relevance to embedded AI systems. It examines the core concepts like neural networks, highlighting key components like perceptrons, multilayer perceptrons, activation functions, and computational graphs. The primer also briefly explores major deep learning architecture, contrasting their applications and uses. Additionally, it compares deep learning to traditional machine learning to equip readers with the general conceptual building blocks to make informed choices between deep learning and traditional ML techniques based on problem constraints, setting the stage for more advanced techniques and applications that will follow in subsequent chapters. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * Understand the basic concepts and definitions of deep neural networks. diff --git a/efficient_ai.qmd b/efficient_ai.qmd index 84dcfee6..8b5b9d83 100644 --- a/efficient_ai.qmd +++ b/efficient_ai.qmd @@ -1,6 +1,8 @@ # Efficient AI -::: {.callout-tip collapse="true"} +Efficiency in artificial intelligence (AI) is not simply a luxury; it is a necessity. In this chapter, we dive into the key concepts that underpin efficiency in AI systems. The computational demands placed on neural networks can be daunting, even for the most minimal of systems. 
For AI to be seamlessly integrated into everyday devices and essential systems, it must perform optimally within the constraints of limited resources, all while maintaining its efficacy. The pursuit of efficiency ensures that AI models are streamlined, rapid, and sustainable, thereby widening their applicability across a diverse array of platforms and scenarios. + +::: {.callout-tip} ## Learning Objectives * coming soon. @@ -9,8 +11,6 @@ ## Introduction -In this chapter, we dive into the concepts that govern efficiency in AI systems. It is of paramount importance, especially in the context of embedded TinyML systems. The computational demands of neural networks can be overwhelming, even in the smallest of systems. Efficiency in AI isn't just a luxury---it's a necessity. For AI to truly be integrated into everyday devices and critical systems, it must operate within the constraints of limited resources without compromising its effectiveness. The drive for efficiency ensures that AI models are lean, fast, and sustainable, making their application viable across a broader range of platforms and scenarios. - Training models can consume a significant amount of energy, sometimes equivalent to the carbon footprint of sizable industrial processes. We will cover some of these sustainability details in the [AI Sustainability](./sustainable_ai.qmd) chapter. On the deployment side, if these models are not optimized for efficiency, they can quickly drain device batteries, demand excessive memory, or fall short of real-time processing needs. Through this introduction, our objective is to elucidate the nuances of efficiency, setting the groundwork for a comprehensive exploration in the subsequent chapters. ## The Need for Efficient AI @@ -150,4 +150,4 @@ We saw that efficient model architectures can be useful for optimizations. Model Together, these form a holistic framework for efficient AI. But the journey doesn't end here. Achieving optimally efficient intelligence requires continued research and innovation. As models become more sophisticated, datasets grow larger, and applications diversify into specialized domains, efficiency must evolve in lockstep. Measuring real-world impact would need nuanced benchmarks and standardized metrics beyond simplistic accuracy figures. -Moreover, efficient AI expands beyond technological optimization but also encompasses costs, environmental impact, and ethical considerations for the broader societal good. As AI permeates across industries and daily lives, a comprehensive outlook on efficiency underpins its sustainable and responsible progress. The subsequent chapters will build upon these foundational concepts, providing actionable insights and hands-on best practices for developing and deploying efficient AI solutions. \ No newline at end of file +Moreover, efficient AI extends beyond technological optimization to also encompass costs, environmental impact, and ethical considerations for the broader societal good. As AI permeates industries and daily life, a comprehensive outlook on efficiency underpins its sustainable and responsible progress. The subsequent chapters will build upon these foundational concepts, providing actionable insights and hands-on best practices for developing and deploying efficient AI solutions.
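As a small preview of the hands-on material to come, the sketch below shows one common deployment-side optimization of the kind motivated above: converting a trained Keras model to TensorFlow Lite with default post-training optimizations (including quantization) so that it demands less memory on-device. The tiny two-layer model here is only a stand-in for a real trained network, and the exact size savings depend on the model and the converter settings.

```python
import tensorflow as tf

# Tiny stand-in model; in practice this would be a fully trained network.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(8,)),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1),
])

# Convert to TensorFlow Lite with default post-training optimizations,
# which quantize weights to shrink the model for on-device deployment.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open("model.tflite", "wb") as f:
    f.write(tflite_model)

print(f"TFLite model size: {len(tflite_model)} bytes")
```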
diff --git a/embedded_ml.qmd b/embedded_ml.qmd index 70ad6b02..79d30f79 100644 --- a/embedded_ml.qmd +++ b/embedded_ml.qmd @@ -1,8 +1,10 @@ # Embedded AI +![_DALL·E 3 Prompt: Illustration in a rectangular format depicting the merger of embedded systems with Embedded AI. The left half of the image portrays traditional embedded systems, including microcontrollers and processors, detailed and precise. The right half showcases the world of artificial intelligence, with abstract representations of machine learning models, neurons, and data flow. The two halves are distinctly separated, emphasizing the individual significance of embedded tech and AI, but they come together in harmony at the center._](./images/cover_embedded_ai.png) + Before delving into the intricacies of TinyML, it's crucial to grasp the distinctions among Cloud ML, Edge ML, and TinyML. In this chapter, we'll explore each of these facets individually before comparing and contrasting them. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * Compare Cloud ML, Edge ML, and TinyML in terms of processing location, latency, privacy, computational power, etc. @@ -253,11 +255,11 @@ The embedded ML landscape is in a state of rapid evolution, poised to enable int Now would be a great time for you to try out a small computer vision model out of the box. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Nicla Vision If you want to play with an embedded system, try out the Nicla Vision -[Computer Vision](./embedded_ml_exercise.qmd) +[Computer Vision](./image_classification.qmd) ::: \ No newline at end of file diff --git a/embedded_ml_exercise.qmd b/embedded_ml_exercise.qmd deleted file mode 100644 index dec56a5b..00000000 --- a/embedded_ml_exercise.qmd +++ /dev/null @@ -1,727 +0,0 @@ -# CV on Nicla Vision {.unnumbered} - -As we initiate our studies into embedded machine learning or tinyML, -it\'s impossible to overlook the transformative impact of Computer -Vision (CV) and Artificial Intelligence (AI) in our lives. These two -intertwined disciplines redefine what machines can perceive and -accomplish, from autonomous vehicles and robotics to healthcare and -surveillance. - -More and more, we are facing an artificial intelligence (AI) revolution -where, as stated by Gartner, **Edge AI** has a very high impact -potential, and **it is for now**! - -![](images_4/media/image2.jpg){width="4.729166666666667in" -height="4.895833333333333in"} - -In the \"bull-eye\" of emerging technologies, radar is the *Edge -Computer Vision*, and when we talk about Machine Learning (ML) applied -to vision, the first thing that comes to mind is **Image -Classification**, a kind of ML \"Hello World\"! - -This exercise will explore a computer vision project utilizing -Convolutional Neural Networks (CNNs) for real-time image classification. -Leveraging TensorFlow\'s robust ecosystem, we\'ll implement a -pre-trained MobileNet model and adapt it for edge deployment. The focus -will be optimizing the model to run efficiently on resource-constrained -hardware without sacrificing accuracy. - -We\'ll employ techniques like quantization and pruning to reduce the -computational load. By the end of this tutorial, you\'ll have a working -prototype capable of classifying images in real time, all running on a -low-power embedded system based on the Arduino Nicla Vision board. 
- -## Computer Vision - -At its core, computer vision aims to enable machines to interpret and -make decisions based on visual data from the world---essentially -mimicking the capability of the human optical system. Conversely, AI is -a broader field encompassing machine learning, natural language -processing, and robotics, among other technologies. When you bring AI -algorithms into computer vision projects, you supercharge the system\'s -ability to understand, interpret, and react to visual stimuli. - -When discussing Computer Vision projects applied to embedded devices, -the most common applications that come to mind are *Image -Classification* and *Object Detection*. - -![](images_4/media/image15.jpg){width="6.5in" -height="2.8333333333333335in"} - -Both models can be implemented on tiny devices like the Arduino Nicla -Vision and used on real projects. Let\'s start with the first one. - -## Image Classification Project - -The first step in any ML project is to define our goal. In this case, it -is to detect and classify two specific objects present in one image. For -this project, we will use two small toys: a *robot* and a small -Brazilian parrot (named *Periquito*). Also, we will collect images of a -*background* where those two objects are absent. - -![](images_4/media/image36.jpg){width="6.5in" -height="3.638888888888889in"} - -## Data Collection - -Once you have defined your Machine Learning project goal, the next and -most crucial step is the dataset collection. You can use the Edge -Impulse Studio, the OpenMV IDE we installed, or even your phone for the -image capture. Here, we will use the OpenMV IDE for that. - -**Collecting Dataset with OpenMV IDE** - -First, create in your computer a folder where your data will be saved, -for example, \"data.\" Next, on the OpenMV IDE, go to Tools \> Dataset -Editor and select New Dataset to start the dataset collection: - -![](images_4/media/image29.png){width="6.291666666666667in" -height="4.010416666666667in"} - -The IDE will ask you to open the file where your data will be saved and -choose the \"data\" folder that was created. Note that new icons will -appear on the Left panel. - -![](images_4/media/image46.png){width="0.9583333333333334in" -height="1.5208333333333333in"} - -Using the upper icon (1), enter with the first class name, for example, -\"periquito\": - -![](images_4/media/image22.png){width="3.25in" -height="2.65625in"} - -Run the dataset_capture_script.py, and clicking on the bottom icon (2), -will start capturing images: - -![](images_4/media/image43.png){width="6.5in" -height="4.041666666666667in"} - -Repeat the same procedure with the other classes - -![](images_4/media/image6.jpg){width="6.5in" -height="3.0972222222222223in"} - -> *We suggest around 60 images from each category. Try to capture -> different angles, backgrounds, and light conditions.* - -The stored images use a QVGA frame size 320x240 and RGB565 (color pixel -format). - -After capturing your dataset, close the Dataset Editor Tool on the Tools -\> Dataset Editor. - -On your computer, you will end with a dataset that contains three -classes: periquito, robot, and background. - -![](images_4/media/image20.png){width="6.5in" -height="2.2083333333333335in"} - -You should return to Edge Impulse Studio and upload the dataset to your -project. - -## Training the model with Edge Impulse Studio - -We will use the Edge Impulse Studio for training our model. 
Enter your -account credentials at Edge Impulse and create a new project: - -![](images_4/media/image45.png){width="6.5in" -height="4.263888888888889in"} - -> *Here, you can clone a similar project:* -> *[NICLA-Vision_Image_Classification](https://studio.edgeimpulse.com/public/273858/latest).* - -## Dataset - -Using the EI Studio (or *Studio*), we will pass over four main steps to -have our model ready for use on the Nicla Vision board: Dataset, -Impulse, Tests, and Deploy (on the Edge Device, in this case, the -NiclaV). - -![](images_4/media/image41.jpg){width="6.5in" -height="4.194444444444445in"} - -Regarding the Dataset, it is essential to point out that our Original -Dataset, captured with the OpenMV IDE, will be split into three parts: -Training, Validation, and Test. The Test Set will be divided from the -beginning and left a part to be used only in the Test phase after -training. The Validation Set will be used during training. - -![](images_4/media/image7.jpg){width="6.5in" -height="4.763888888888889in"} - -On Studio, go to the Data acquisition tab, and on the UPLOAD DATA -section, upload from your computer the files from chosen categories: - -![](images_4/media/image39.png){width="6.5in" -height="4.263888888888889in"} - -Left to the Studio to automatically split the original dataset into -training and test and choose the label related to that specific data: - -![](images_4/media/image30.png){width="6.5in" -height="4.263888888888889in"} - -Repeat the procedure for all three classes. At the end, you should see -your \"raw data in the Studio: - -![](images_4/media/image11.png){width="6.5in" -height="4.263888888888889in"} - -The Studio allows you to explore your data, showing a complete view of -all the data in your project. You can clear, inspect, or change labels -by clicking on individual data items. In our case, a simple project, the -data seems OK. - -![](images_4/media/image44.png){width="6.5in" -height="4.263888888888889in"} - -## The Impulse Design - -In this phase, we should define how to: - -- Pre-process our data, which consists of resizing the individual - > images and determining the color depth to use (RGB or Grayscale) - > and - -- Design a Model that will be \"Transfer Learning (Images)\" to - > fine-tune a pre-trained MobileNet V2 image classification model on - > our data. This method performs well even with relatively small - > image datasets (around 150 images in our case). - -![](images_4/media/image23.jpg){width="6.5in" -height="4.0in"} - -Transfer Learning with MobileNet offers a streamlined approach to model -training, which is especially beneficial for resource-constrained -environments and projects with limited labeled data. MobileNet, known -for its lightweight architecture, is a pre-trained model that has -already learned valuable features from a large dataset (ImageNet). - -![](images_4/media/image9.jpg){width="6.5in" -height="1.9305555555555556in"} - -By leveraging these learned features, you can train a new model for your -specific task with fewer data and computational resources yet achieve -competitive accuracy. - -![](images_4/media/image32.jpg){width="6.5in" -height="2.3055555555555554in"} - -This approach significantly reduces training time and computational -cost, making it ideal for quick prototyping and deployment on embedded -devices where efficiency is paramount. - -Go to the Impulse Design Tab and create the *impulse*, defining an image -size of 96x96 and squashing them (squared form, without crop). Select -Image and Transfer Learning blocks. 
Save the Impulse. - -![](images_4/media/image16.png){width="6.5in" -height="4.263888888888889in"} - -### **Image Pre-Processing** - -All input QVGA/RGB565 images will be converted to 27,640 features -(96x96x3). - -![](images_4/media/image17.png){width="6.5in" -height="4.319444444444445in"} - -Press \[Save parameters\] and Generate all features: - -![](images_4/media/image5.png){width="6.5in" -height="4.263888888888889in"} - -## Model Design - -In 2007, Google introduced -[[MobileNetV1]{.underline}](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html), -a family of general-purpose computer vision neural networks designed -with mobile devices in mind to support classification, detection, and -more. MobileNets are small, low-latency, low-power models parameterized -to meet the resource constraints of various use cases. in 2018, Google -launched [MobileNetV2: Inverted Residuals and Linear -Bottlenecks](https://arxiv.org/abs/1801.04381). - -MobileNet V1 and MobileNet V2 aim for mobile efficiency and embedded -vision applications but differ in architectural complexity and -performance. While both use depthwise separable convolutions to reduce -the computational cost, MobileNet V2 introduces Inverted Residual Blocks -and Linear Bottlenecks to enhance performance. These new features allow -V2 to capture more complex features using fewer parameters, making it -computationally more efficient and generally more accurate than its -predecessor. Additionally, V2 employs a non-linear activation in the -intermediate expansion layer. Still, it uses a linear activation for the -bottleneck layer, a design choice found to preserve important -information through the network better. MobileNet V2 offers a more -optimized architecture for higher accuracy and efficiency and will be -used in this project. - -Although the base MobileNet architecture is already tiny and has low -latency, many times, a specific use case or application may require the -model to be smaller and faster. MobileNets introduces a straightforward -parameter α (alpha) called width multiplier to construct these smaller, -less computationally expensive models. The role of the width multiplier -α is to thin a network uniformly at each layer. - -Edge Impulse Studio has available MobileNetV1 (96x96 images) and V2 -(96x96 and 160x160 images), with several different **α** values (from -0.05 to 1.0). For example, you will get the highest accuracy with V2, -160x160 images, and α=1.0. Of course, there is a trade-off. The higher -the accuracy, the more memory (around 1.3M RAM and 2.6M ROM) will be -needed to run the model, implying more latency. The smaller footprint -will be obtained at another extreme with MobileNetV1 and α=0.10 (around -53.2K RAM and 101K ROM). - -![](images_4/media/image27.jpg){width="6.5in" -height="3.5277777777777777in"} - -For this project, we will use **MobileNetV2 96x96 0.1**, which estimates -a memory cost of 265.3 KB in RAM. This model should be OK for the Nicla -Vision with 1MB of SRAM. On the Transfer Learning Tab, select this -model: - -![](images_4/media/image24.png){width="6.5in" -height="4.263888888888889in"} - -Another necessary technique to be used with Deep Learning is **Data -Augmentation**. Data augmentation is a method that can help improve the -accuracy of machine learning models, creating additional artificial -data. A data augmentation system makes small, random changes to your -training data during the training process (such as flipping, cropping, -or rotating the images). 
- -Under the rood, here you can see how Edge Impulse implements a data -Augmentation policy on your data: - -```python -# Implements the data augmentation policy -def augment_image(image, label): - # Flips the image randomly - image = tf.image.random_flip_left_right(image) - - # Increase the image size, then randomly crop it down to - # the original dimensions - resize_factor = random.uniform(1, 1.2) - new_height = math.floor(resize_factor * INPUT_SHAPE[0]) - new_width = math.floor(resize_factor * INPUT_SHAPE[1]) - image = tf.image.resize_with_crop_or_pad(image, new_height, new_width) - image = tf.image.random_crop(image, size=INPUT_SHAPE) - - # Vary the brightness of the image - image = tf.image.random_brightness(image, max_delta=0.2) - - return image, label - -``` -Exposure to these variations during training can help prevent your model -from taking shortcuts by \"memorizing\" superficial clues in your -training data, meaning it may better reflect the deep underlying -patterns in your dataset. - -The final layer of our model will have 12 neurons with a 15% dropout for -overfitting prevention. Here is the Training result: - -![](images_4/media/image31.jpg){width="6.5in" -height="3.5in"} - -The result is excellent, with 77ms of latency, which should result in -13fps (frames per second) during inference. - -## Model Testing - -![](images_4/media/image10.jpg){width="6.5in" -height="3.8472222222222223in"} - -Now, you should take the data put apart at the start of the project and -run the trained model having them as input: - -![](images_4/media/image34.png){width="3.1041666666666665in" -height="1.7083333333333333in"} - -The result was, again, excellent. - -![](images_4/media/image12.png){width="6.5in" -height="4.263888888888889in"} - -## Deploying the model - -At this point, we can deploy the trained model as.tflite and use the -OpenMV IDE to run it using MicroPython, or we can deploy it as a C/C++ -or an Arduino library. - -![](images_4/media/image28.jpg){width="6.5in" -height="3.763888888888889in"} - -**Arduino Library** - -First, Let\'s deploy it as an Arduino Library: - -![](images_4/media/image48.png){width="6.5in" -height="4.263888888888889in"} - -You should install the library as.zip on the Arduino IDE and run the -sketch nicla_vision_camera.ino available in Examples under your library -name. - -> *Note that Arduino Nicla Vision has, by default, 512KB of RAM -> allocated for the M7 core and an additional 244KB on the M4 address -> space. In the code, this allocation was changed to 288 kB to guarantee -> that the model will run on the device -> (malloc_addblock((void\*)0x30000000, 288 \* 1024);).* - -The result was good, with 86ms of measured latency. - -![](images_4/media/image25.jpg){width="6.5in" -height="3.4444444444444446in"} - -Here is a short video showing the inference results: -[[https://youtu.be/bZPZZJblU-o]{.underline}](https://youtu.be/bZPZZJblU-o) - -**OpenMV** - -It is possible to deploy the trained model to be used with OpenMV in two -ways: as a library and as a firmware. - -Three files are generated as a library: the.tflite model, a list with -the labels, and a simple MicroPython script that can make inferences -using the model. - -![](images_4/media/image26.png){width="6.5in" -height="1.0in"} - -Running this model as a.tflite directly in the Nicla was impossible. So, -we can sacrifice the accuracy using a smaller model or deploy the model -as an OpenMV Firmware (FW). 
As an FW, the Edge Impulse Studio generates -optimized models, libraries, and frameworks needed to make the -inference. Let\'s explore this last one. - -Select OpenMV Firmware on the Deploy Tab and press \[Build\]. - -![](images_4/media/image3.png){width="6.5in" -height="4.263888888888889in"} - -On your computer, you will find a ZIP file. Open it: - -![](images_4/media/image33.png){width="6.5in" height="2.625in"} - -Use the Bootloader tool on the OpenMV IDE to load the FW on your board: - -![](images_4/media/image35.jpg){width="6.5in" height="3.625in"} - -Select the appropriate file (.bin for Nicla-Vision): - -![](images_4/media/image8.png){width="6.5in" height="1.9722222222222223in"} - -After the download is finished, press OK: - -![DFU firmware update complete!.png](images_4/media/image40.png){width="3.875in" height="5.708333333333333in"} - -If a message says that the FW is outdated, DO NOT UPGRADE. Select -\[NO\]. - -![](images_4/media/image42.png){width="4.572916666666667in" -height="2.875in"} - -Now, open the script **ei_image_classification.py** that was downloaded -from the Studio and the.bin file for the Nicla. - -![](images_4/media/image14.png){width="6.5in" -height="4.0in"} - -And run it. Pointing the camera to the objects we want to classify, the -inference result will be displayed on the Serial Terminal. - -![](images_4/media/image37.png){width="6.5in" -height="3.736111111111111in"} - -**Changing Code to add labels:** - -The code provided by Edge Impulse can be modified so that we can see, -for test reasons, the inference result directly on the image displayed -on the OpenMV IDE. - -[[Upload the code from -GitHub,]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/nicla_image_classification.py) -or modify it as below: - -```python -# Marcelo Rovai - NICLA Vision - Image Classification -# Adapted from Edge Impulse - OpenMV Image Classification Example -# @24Aug23 - -import sensor, image, time, os, tf, uos, gc - -sensor.reset() # Reset and initialize the sensor. -sensor.set_pixformat(sensor.RGB565) # Set pxl fmt to RGB565 (or GRAYSCALE) -sensor.set_framesize(sensor.QVGA) # Set frame size to QVGA (320x240) -sensor.set_windowing((240, 240)) # Set 240x240 window. -sensor.skip_frames(time=2000) # Let the camera adjust. - -net = None -labels = None - -try: - # Load built in model - labels, net = tf.load_builtin_model('trained') -except Exception as e: - raise Exception(e) - -clock = time.clock() -while(True): - clock.tick() # Starts tracking elapsed time. 
- - img = sensor.snapshot() - - # default settings just do one detection - for obj in net.classify(img, - min_scale=1.0, - scale_mul=0.8, - x_overlap=0.5, - y_overlap=0.5): - fps = clock.fps() - lat = clock.avg() - - print("**********\nPrediction:") - img.draw_rectangle(obj.rect()) - # This combines the labels and confidence values into a list of tuples - predictions_list = list(zip(labels, obj.output())) - - max_val = predictions_list[0][1] - max_lbl = 'background' - for i in range(len(predictions_list)): - val = predictions_list[i][1] - lbl = predictions_list[i][0] - - if val > max_val: - max_val = val - max_lbl = lbl - - # Print label with the highest probability - if max_val < 0.5: - max_lbl = 'uncertain' - print("{} with a prob of {:.2f}".format(max_lbl, max_val)) - print("FPS: {:.2f} fps ==> latency: {:.0f} ms".format(fps, lat)) - - # Draw label with highest probability to image viewer - img.draw_string( - 10, 10, - max_lbl + "\n{:.2f}".format(max_val), - mono_space = False, - scale=2 - ) - -``` - -Here you can see the result: - -![](images_4/media/image47.jpg){width="6.5in" -height="2.9444444444444446in"} - -Note that the latency (136 ms) is almost double what we got directly -with the Arduino IDE. This is because we are using the IDE as an -interface and the time to wait for the camera to be ready. If we start -the clock just before the inference: - -![](images_4/media/image13.jpg){width="6.5in" -height="2.0972222222222223in"} - -The latency will drop to only 71 ms. - -![](images_4/media/image1.jpg){width="3.5520833333333335in" -height="1.53125in"} - -> *The NiclaV runs about half as fast when connected to the IDE. The FPS should increase once disconnected.* - -### **Post-Processing with LEDs** - -When working with embedded machine learning, we are looking for devices -that can continually proceed with the inference and result, taking some -action directly on the physical world and not displaying the result on a -connected computer. To simulate this, we will define one LED to light up -for each one of the possible inference results. - -![](images_4/media/image38.jpg){width="6.5in" -height="3.236111111111111in"} - -For that, we should [[upload the code from -GitHub]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/nicla_image_classification_LED.py) -or change the last code to include the LEDs: - -```python -# Marcelo Rovai - NICLA Vision - Image Classification with LEDs -# Adapted from Edge Impulse - OpenMV Image Classification Example -# @24Aug23 - -import sensor, image, time, os, tf, uos, gc, pyb - -ledRed = pyb.LED(1) -ledGre = pyb.LED(2) -ledBlu = pyb.LED(3) - -sensor.reset() # Reset and initialize the sensor. -sensor.set_pixformat(sensor.RGB565) # Set pixl fmt to RGB565 (or GRAYSCALE) -sensor.set_framesize(sensor.QVGA) # Set frame size to QVGA (320x240) -sensor.set_windowing((240, 240)) # Set 240x240 window. -sensor.skip_frames(time=2000) # Let the camera adjust. 
- -net = None -labels = None - -ledRed.off() -ledGre.off() -ledBlu.off() - -try: - # Load built in model - labels, net = tf.load_builtin_model('trained') -except Exception as e: - raise Exception(e) - -clock = time.clock() - - -def setLEDs(max_lbl): - - if max_lbl == 'uncertain': - ledRed.on() - ledGre.off() - ledBlu.off() - - if max_lbl == 'periquito': - ledRed.off() - ledGre.on() - ledBlu.off() - - if max_lbl == 'robot': - ledRed.off() - ledGre.off() - ledBlu.on() - - if max_lbl == 'background': - ledRed.off() - ledGre.off() - ledBlu.off() - - -while(True): - img = sensor.snapshot() - clock.tick() # Starts tracking elapsed time. - - # default settings just do one detection. - for obj in net.classify(img, - min_scale=1.0, - scale_mul=0.8, - x_overlap=0.5, - y_overlap=0.5): - fps = clock.fps() - lat = clock.avg() - - print("**********\nPrediction:") - img.draw_rectangle(obj.rect()) - # This combines the labels and confidence values into a list of tuples - predictions_list = list(zip(labels, obj.output())) - - max_val = predictions_list[0][1] - max_lbl = 'background' - for i in range(len(predictions_list)): - val = predictions_list[i][1] - lbl = predictions_list[i][0] - - if val > max_val: - max_val = val - max_lbl = lbl - - # Print label and turn on LED with the highest probability - if max_val < 0.8: - max_lbl = 'uncertain' - - setLEDs(max_lbl) - - print("{} with a prob of {:.2f}".format(max_lbl, max_val)) - print("FPS: {:.2f} fps ==> latency: {:.0f} ms".format(fps, lat)) - - # Draw label with highest probability to image viewer - img.draw_string( - 10, 10, - max_lbl + "\n{:.2f}".format(max_val), - mono_space = False, - scale=2 - ) - -``` - -Now, each time that a class gets a result superior of 0.8, the -correspondent LED will be light on as below: - -- Led Red 0n: Uncertain (no one class is over 0.8) - -- Led Green 0n: Periquito \> 0.8 - -- Led Blue 0n: Robot \> 0.8 - -- All LEDs Off: Background \> 0.8 - -Here is the result: - -![](images_4/media/image18.jpg){width="6.5in" -height="3.6527777777777777in"} - -In more detail - -![](images_4/media/image21.jpg){width="6.5in" -height="2.0972222222222223in"} - -### **Image Classification (non-official) Benchmark** - -Several development boards can be used for embedded machine learning -(tinyML), and the most common ones for Computer Vision applications -(with low energy), are the ESP32 CAM, the Seeed XIAO ESP32S3 Sense, the -Arduinos Nicla Vison, and Portenta. - -![](images_4/media/image19.jpg){width="6.5in" -height="4.194444444444445in"} - -Using the opportunity, the same trained model was deployed on the -ESP-CAM, the XIAO, and Portenta (in this one, the model was trained -again, using grayscaled images to be compatible with its camera. Here is -the result, deploying the models as Arduino\'s Library: - -![](images_4/media/image4.jpg){width="6.5in" -height="3.4444444444444446in"} - -## Conclusion - -Before we finish, consider that Computer Vision is more than just image -classification. For example, you can develop Edge Machine Learning -projects around vision in several areas, such as: - -- **Autonomous Vehicles**: Use sensor fusion, lidar data, and computer - > vision algorithms to navigate and make decisions. - -- **Healthcare**: Automated diagnosis of diseases through MRI, X-ray, - > and CT scan image analysis - -- **Retail**: Automated checkout systems that identify products as - > they pass through a scanner. - -- **Security and Surveillance**: Facial recognition, anomaly - > detection, and object tracking in real-time video feeds. 
- -- **Augmented Reality**: Object detection and classification to - > overlay digital information in the real world. - -- **Industrial Automation**: Visual inspection of products, predictive - > maintenance, and robot and drone guidance. - -- **Agriculture**: Drone-based crop monitoring and automated - > harvesting. - -- **Natural Language Processing**: Image captioning and visual - > question answering. - -- **Gesture Recognition**: For gaming, sign language translation, and - > human-machine interaction. - -- **Content Recommendation**: Image-based recommendation systems in - > e-commerce. diff --git a/embedded_sys.qmd b/embedded_sys.qmd index da6632a4..f0eae41b 100644 --- a/embedded_sys.qmd +++ b/embedded_sys.qmd @@ -1,10 +1,12 @@ -## Embedded Systems +# Embedded Systems + +![_DALL·E 3 Prompt: Illustration of a modern smart device, like a wearable watch or smart thermostat, opened up to reveal its inner components. Within the device, there are tiny robots analyzing and tweaking the circuits. On the device's display, a machine learning model is being trained, showing data points and accuracy metrics, representing the convergence of embedded systems and AI._](./images/cover_embedded_sys.png) In the domain of TinyML, embedded systems serve as the bedrock, providing a robust platform where intelligent algorithms can function both efficiently and effectively. Defined by their specialized roles and real-time computational capabilities, these systems act as the convergence point where data and computation intersect on a micro-scale. Tailored to meet the demands of specific tasks, they excel in optimizing performance, energy usage, and spatial efficiency—key considerations in the successful implementation of TinyML solutions. As we journey further into this chapter, we will demystify the intricate yet captivating realm of embedded systems, gaining insights into their structural design, operational features, and the crucial part they play in enabling TinyML applications. From an introduction to the fundamentals of microcontroller units to a deep dive into the interfaces and peripherals that amplify their capabilities, this chapter aims to be a comprehensive guide for understanding the nuanced aspects of embedded systems within the TinyML landscape. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * Understand the definition, characteristics, history, and importance of embedded systems, especially in relation to tinyML. @@ -382,11 +384,11 @@ As we gaze into the future, it's clear that the realm of embedded systems stands Now would be a great time for you to get your hands on a real embedded device, and get it setup. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Nicla Vision If you want to play with an embedded system, try out the Nicla Vision -[Setup Nicla Vision](./embedded_sys_exercise.qmd) +[Setup Nicla Vision](./niclav_sys.qmd) ::: \ No newline at end of file diff --git a/embedded_sys_exercise.qmd b/embedded_sys_exercise.qmd deleted file mode 100644 index 2fc19cab..00000000 --- a/embedded_sys_exercise.qmd +++ /dev/null @@ -1,482 +0,0 @@ -# Setup Nicla Vision {.unnumbered} - -The [Arduino Nicla -Vision](https://docs.arduino.cc/hardware/nicla-vision) (sometimes called -*NiclaV*) is a development board that includes two processors that can -run tasks in parallel. 
It is part of a family of development boards with -the same form factor but designed for specific tasks, such as the [Nicla -Sense -ME](https://www.bosch-sensortec.com/software-tools/tools/arduino-nicla-sense-me/) -and the [Nicla -Voice](https://store-usa.arduino.cc/products/nicla-voice?_gl=1*l3abc6*_ga*MTQ3NzE4Mjk4Mi4xNjQwMDIwOTk5*_ga_NEXN8H46L5*MTY5NjM0Mzk1My4xMDIuMS4xNjk2MzQ0MjQ1LjAuMC4w). -The *Niclas* can efficiently run processes created with TensorFlow™ -Lite. For example, one of the cores of the NiclaV computing a computer -vision algorithm on the fly (inference), while the other leads with -low-level operations like controlling a motor and communicating or -acting as a user interface. - -> *The onboard wireless module allows the management of WiFi and -> Bluetooth Low Energy (BLE) connectivity simultaneously.* - -![](images_2/media/image29.jpg){width="6.5in" -height="3.861111111111111in"} - -## Two Parallel Cores - -The central processor is the dual-core -[STM32H747,](https://content.arduino.cc/assets/Arduino-Portenta-H7_Datasheet_stm32h747xi.pdf?_gl=1*6quciu*_ga*MTQ3NzE4Mjk4Mi4xNjQwMDIwOTk5*_ga_NEXN8H46L5*MTY0NzQ0NTg1My4xMS4xLjE2NDc0NDYzMzkuMA..) -including a Cortex® M7 at 480 MHz and a Cortex® M4 at 240 MHz. The two -cores communicate via a Remote Procedure Call mechanism that seamlessly -allows calling functions on the other processor. Both processors share -all the on-chip peripherals and can run: - -- Arduino sketches on top of the Arm® Mbed™ OS - -- Native Mbed™ applications - -- MicroPython / JavaScript via an interpreter - -- TensorFlow™ Lite - -![](images_2/media/image22.jpg){width="5.78125in" -height="5.78125in"} - -## Memory - -Memory is crucial for embedded machine learning projects. The NiclaV -board can host up to 16 MB of QSPI Flash for storage. However, it is -essential to consider that the MCU SRAM is the one to be used with -machine learning inferences; the STM32H747 is only 1MB, shared by both -processors. This MCU also has incorporated 2MB of FLASH, mainly for code -storage. - -## Sensors - -- **Camera**: A GC2145 2 MP Color CMOS Camera. - -- **Microphone**: A - > [MP34DT05,](https://content.arduino.cc/assets/Nano_BLE_Sense_mp34dt05-a.pdf?_gl=1*12fxus9*_ga*MTQ3NzE4Mjk4Mi4xNjQwMDIwOTk5*_ga_NEXN8H46L5*MTY0NzQ0NTg1My4xMS4xLjE2NDc0NDc3NzMuMA..) - > an ultra-compact, low-power, omnidirectional, digital MEMS - > microphone built with a capacitive sensing element and an IC - > interface. - -- **6-Axis IMU**: 3D gyroscope and 3D accelerometer data from the - > LSM6DSOX 6-axis IMU. - -- **Time of Flight Sensor**: The VL53L1CBV0FY Time-of-Flight sensor - > adds accurate and low power-ranging capabilities to the Nicla - > Vision. The invisible near-infrared VCSEL laser (including the - > analog driver) is encapsulated with receiving optics in an - > all-in-one small module below the camera. - -### **HW Installation (Arduino IDE)** - -Start connecting the board (USB-C) to your computer : - -![](images_2/media/image14.jpg){width="6.5in" -height="3.0833333333333335in"} - -Install the Mbed OS core for Nicla boards in the Arduino IDE. Having the -IDE open, navigate to Tools \> Board \> Board Manager, look for Arduino -Nicla Vision on the search window, and install the board. - -![](images_2/media/image2.jpg){width="6.5in" -height="2.7083333333333335in"} - -Next, go to Tools \> Board \> Arduino Mbed OS Nicla Boards and select -Arduino Nicla Vision. Having your board connected to the USB, you should -see the Nicla on Port and select it. 
- -> *Open the Blink sketch on Examples/Basic and run it using the IDE -> Upload button. You should see the Built-in LED (green RGB) blinking, -> which means the Nicla board is correctly installed and functional!* - -## Testing the Microphone - -On Arduino IDE, go to Examples \> PDM \> PDMSerialPlotter, open and run -the sketch. Open the Plotter and see the audio representation from the -microphone: - -![](images_2/media/image9.png){width="6.5in" -height="4.361111111111111in"} - -> *Vary the frequency of the sound you generate and confirm that the mic -> is working correctly.* - -## Testing the IMU - -Before testing the IMU, it will be necessary to install the LSM6DSOX -library. For that, go to Library Manager and look for LSM6DSOX. Install -the library provided by Arduino: - -![](images_2/media/image19.jpg){width="6.5in" -height="2.4027777777777777in"} - -Next, go to Examples \> Arduino_LSM6DSOX \> SimpleAccelerometer and run -the accelerometer test (you can also run Gyro and board temperature): - -![](images_2/media/image28.png){width="6.5in" -height="4.361111111111111in"} - -### **Testing the ToF (Time of Flight) Sensor** - -As we did with IMU, installing the ToF library, the VL53L1X is -necessary. For that, go to Library Manager and look for VL53L1X. Install -the library provided by Pololu: - -![](images_2/media/image15.jpg){width="6.5in" -height="2.4583333333333335in"} - -Next, run the sketch -[proximity_detection.ino](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/distance_image_meter.py): - -![](images_2/media/image12.png){width="4.947916666666667in" -height="4.635416666666667in"} - -On the Serial Monitor, you will see the distance from the camera and an -object in front of it (max of 4m). - -![](images_2/media/image13.jpg){width="6.5in" -height="4.847222222222222in"} - -## Testing the Camera - -We can also test the camera using, for example, the code provided on -Examples \> Camera \> CameraCaptureRawBytes. We can not see the image -directly, but it is possible to get the raw image data generated by the -camera. - -Anyway, the best test with the camera is to see a live image. For that, -we will use another IDE, the OpenMV. - -## Installing the OpenMV IDE - -OpenMV IDE is the premier integrated development environment for use -with OpenMV Cameras and the one on the Portenta. It features a powerful -text editor, debug terminal, and frame buffer viewer with a histogram -display. We will use MicroPython to program the camera. - -Go to the [OpenMV IDE page](https://openmv.io/pages/download), download -the correct version for your Operating System, and follow the -instructions for its installation on your computer. - -![](images_2/media/image21.png){width="6.5in" -height="4.791666666666667in"} - -The IDE should open, defaulting the helloworld_1.py code on its Code -Area. If not, you can open it from Files \> Examples \> HelloWord \> -helloword.py - -![](images_2/media/image7.png){width="6.5in" -height="4.444444444444445in"} - -Any messages sent through a serial connection (using print() or error -messages) will be displayed on the **Serial Terminal** during run time. -The image captured by a camera will be displayed in the **Camera -Viewer** Area (or Frame Buffer) and in the Histogram area, immediately -below the Camera Viewer. - -OpenMV IDE is the premier integrated development environment with OpenMV -Cameras and the Arduino Pro boards. It features a powerful text editor, -debug terminal, and frame buffer viewer with a histogram display. 
We -will use MicroPython to program the Nicla Vision. - -> *Before connecting the Nicla to the OpenMV IDE, ensure you have the -> latest bootloader version. To that, go to your Arduino IDE, select the -> Nicla board, and open the sketch on Examples \> STM_32H747_System -> STM_32H747_updateBootloader. Upload the code to your board. The Serial -> Monitor will guide you.* - -After updating the bootloader, put the Nicla Vision in bootloader mode -by double-pressing the reset button on the board. The built-in green LED -will start fading in and out. Now return to the OpenMV IDE and click on -the connect icon (Left ToolBar): - -![](images_2/media/image23.jpg){width="4.010416666666667in" -height="1.0520833333333333in"} - -A pop-up will tell you that a board in DFU mode was detected and ask you -how you would like to proceed. First, select \"Install the latest -release firmware.\" This action will install the latest OpenMV firmware -on the Nicla Vision. - -![](images_2/media/image10.png){width="6.5in" -height="2.6805555555555554in"} - -You can leave the option of erasing the internal file system unselected -and click \[OK\]. - -Nicla\'s green LED will start flashing while the OpenMV firmware is -uploaded to the board, and a terminal window will then open, showing the -flashing progress. - -![](images_2/media/image5.png){width="4.854166666666667in" -height="3.5416666666666665in"} - -Wait until the green LED stops flashing and fading. When the process -ends, you will see a message saying, \"DFU firmware update complete!\". -Press \[OK\]. - -![](images_2/media/image1.png){width="3.875in" -height="5.708333333333333in"} - -A green play button appears when the Nicla Vison connects to the Tool -Bar. - -![](images_2/media/image18.jpg){width="4.791666666666667in" -height="1.4791666666666667in"} - -Also, note that a drive named "NO NAME" will appear on your computer.: - -![](images_2/media/image3.png){width="6.447916666666667in" -height="2.4166666666666665in"} - -Every time you press the \[RESET\] button on the board, it automatically -executes the main.py script stored on it. You can load the -[main.py](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/main.py) -code on the IDE (File \> Open File\...). - -![](images_2/media/image16.png){width="4.239583333333333in" -height="3.8229166666666665in"} - -> *This code is the \"Blink\" code, confirming that the HW is OK.* - -For testing the camera, let\'s run helloword_1.py. For that, select the -script on File \> Examples \> HelloWorld \> helloword.py, - -When clicking the green play button, the MicroPython script -(hellowolrd.py) on the Code Area will be uploaded and run on the Nicla -Vision. On-Camera Viewer, you will start to see the video streaming. The -Serial Monitor will show us the FPS (Frames per second), which should be -around 14fps. - -![](images_2/media/image6.png){width="6.5in" -height="3.9722222222222223in"} - -Let\'s go through the [helloworld.py](http://helloworld.py/) script: - -```python -# Hello World Example 2 -# -# Welcome to the OpenMV IDE! Click on the green run arrow button below to run the script! - -import sensor, image, time - -sensor.reset() # Reset and initialize the sensor. -sensor.set_pixformat(sensor.RGB565) # Set pixel format to RGB565 (or GRAYSCALE) -sensor.set_framesize(sensor.QVGA) # Set frame size to QVGA (320x240) -sensor.skip_frames(time = 2000) # Wait for settings take effect. -clock = time.clock() # Create a clock object to track the FPS. - -while(True): - clock.tick() # Update the FPS clock. 
- img = sensor.snapshot() # Take a picture and return the image. - print(clock.fps()) -``` - - -In GitHub, you can find the Python scripts used here. - -The code can be split into two parts: - -- **Setup**: Where the libraries are imported and initialized, and the - > variables are defined and initiated. - -- **Loop**: (while loop) part of the code that runs continually. The - > image (img variable) is captured (a frame). Each of those frames - > can be used for inference in Machine Learning Applications. - -To interrupt the program execution, press the red \[X\] button. - -> *Note: OpenMV Cam runs about half as fast when connected to the IDE. -> The FPS should increase once disconnected.* - -In [[the GitHub, You can find other Python -scripts]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main/Micropython). -Try to test the onboard sensors. - -## Connecting the Nicla Vision to Edge Impulse Studio - -We will use the Edge Impulse Studio later in other exercises. [Edge -Impulse I](https://www.edgeimpulse.com/)s a leading development platform -for machine learning on edge devices. - -Edge Impulse officially supports the Nicla Vision. So, for starting, -please create a new project on the Studio and connect the Nicla to it. -For that, follow the steps: - -- Download the [last EI - > Firmware](https://cdn.edgeimpulse.com/firmware/arduino-nicla-vision.zip) - > and unzip it. - -- Open the zip file on your computer and select the uploader related - > to your OS: - -![](images_2/media/image17.png){width="4.416666666666667in" -height="1.5520833333333333in"} - -- Put the Nicla-Vision on Boot Mode, pressing the reset button twice. - -- Execute the specific batch code for your OS for uploading the binary - > (arduino-nicla-vision.bin) to your board. - -Go to your project on the Studio, and on the Data Acquisition tab, -select WebUSB (1). A window will appear; choose the option that shows -that the Nicla is pared (2) and press \[Connect\] (3). - -![](images_2/media/image27.png){width="6.5in" -height="4.319444444444445in"} - -In the Collect Data section on the Data Acquisition tab, you can choose -what sensor data you will pick. - -![](images_2/media/image25.png){width="6.5in" -height="4.319444444444445in"} - -For example. IMU data: - -![](images_2/media/image8.png){width="6.5in" -height="4.319444444444445in"} - -Or Image: - -![](images_2/media/image4.png){width="6.5in" -height="4.319444444444445in"} - -And so on. You can also test an external sensor connected to the Nicla -ADC (pin 0) and the other onboard sensors, such as the microphone and -the ToF. - -### **Expanding the Nicla Vision Board (optional)** - -A last item to be explored is that sometimes, during prototyping, it is -essential to experiment with external sensors and devices, and an -excellent expansion to the Nicla is the [Arduino MKR Connector Carrier -(Grove -compatible)](https://store-usa.arduino.cc/products/arduino-mkr-connector-carrier-grove-compatible). - -The shield has 14 Grove connectors: five single analog inputs, one -single analog input, five single digital I/Os, one double digital I/O, -one I2C, and one UART. All connectors are 5V compatible. - -> *Note that besides all 17 Nicla Vision pins that will be connected to -> the Shield Groves, some Grove connections are disconnected.* - -![](images_2/media/image20.jpg){width="6.5in" -height="4.875in"} - -This shield is MKR compatible and can be used with the Nicla Vision and -the Portenta. 
- -![](images_2/media/image26.jpg){width="4.34375in" -height="5.78125in"} - -For example, suppose that on a TinyML project, you want to send -inference results using a LoRaWan device and add information about local -luminosity. Besides, with offline operations, a local low-power display -as an OLED display is advised. This setup can be seen here: - -![](images_2/media/image11.jpg){width="6.5in" -height="4.708333333333333in"} - -The [Grove Light -Sensor](https://wiki.seeedstudio.com/Grove-Light_Sensor/) would be -connected to one of the single Analog pins (A0/PC4), the [LoRaWan -device](https://wiki.seeedstudio.com/Grove_LoRa_E5_New_Version/) to the -UART, and the [OLED](https://arduino.cl/producto/display-oled-grove/) to -the I2C connector. - -The Nicla Pins 3 (Tx) and 4 (Rx) are connected with the Shield Serial -connector. The UART communication is used with the LoRaWan device. Here -is a simple code to use the UART.: - -```python -# UART Test - By: marcelo_rovai - Sat Sep 23 2023 - -import time -from pyb import UART -from pyb import LED - -redLED = LED(1) # built-in red LED - -# Init UART object. -# Nicla Vision's UART (TX/RX pins) is on "LP1" -uart = UART("LP1", 9600) - -while(True): - uart.write("Hello World!\r\n") - redLED.toggle() - time.sleep_ms(1000) - -``` - -To verify if the UART is working, you should, for example, connect -another device as an [Arduino -UNO](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Arduino-IDE/teste_uart_UNO/teste_uart_UNO.ino), -displaying the Hello Word. - -![](images_2/media/image24.gif){width="2.8125in" -height="3.75in"} - -Here is a Hello World code to be used with the I2C OLED. The MicroPython -SSD1306 OLED driver (ssd1306.py), created by Adafruit, should also be -uploaded to the Nicla (the -[[ssd1306.py]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/ssd1306.py) -can be found in GitHub). - - -```python -# Nicla_OLED_Hello_World - By: marcelo_rovai - Sat Sep 30 2023 - -#Save on device: MicroPython SSD1306 OLED driver, I2C and SPI interfaces created by Adafruit -import ssd1306 - -from machine import I2C -i2c = I2C(1) - -oled_width = 128 -oled_height = 64 -oled = ssd1306.SSD1306_I2C(oled_width, oled_height, i2c) - -oled.text('Hello, World', 10, 10) -oled.show() -``` - -Finally, here is a simple script to read the ADC value on pin \"PC4\" -(Nicla pin A0): -```python - -# Light Sensor (A0) - By: marcelo_rovai - Wed Oct 4 2023 - -import pyb -from time import sleep - -adc = pyb.ADC(pyb.Pin("PC4")) # create an analog object from a pin -val = adc.read() # read an analog value - -while (True): - - val = adc.read() - print ("Light={}".format (val)) - sleep (1) -``` - -The ADC can be used for other valuable sensors, such as -[Temperature](https://wiki.seeedstudio.com/Grove-Temperature_Sensor_V1.2/). - -> *Note that the above scripts ([[downloaded from -> Github]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main/Micropython)) -> only introduce how to connect external devices with the Nicla Vision -> board using MicroPython.* - -## Conclusion - -The Arduino Nicla Vision is an excellent *tiny device* for industrial -and professional uses! However, it is powerful, trustworthy, low power, -and has suitable sensors for the most common embedded machine learning -applications such as vision, movement, sensor fusion, and sound. 
- -> *On the* *[GitHub -> repository,](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main) -> you will find the last version of all the codes used or commented on -> in this exercise.* diff --git a/ethics.qmd b/ethics.qmd index ea57f163..7935f9f9 100644 --- a/ethics.qmd +++ b/ethics.qmd @@ -1,6 +1,6 @@ # Ethical AI -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * coming soon. diff --git a/frameworks.qmd b/frameworks.qmd index 85f2ab9b..b99429df 100644 --- a/frameworks.qmd +++ b/frameworks.qmd @@ -1,119 +1,1731 @@ # AI Frameworks -::: {.callout-tip collapse="true"} +![_DALL·E 3 Prompt: Illustration in a rectangular format, designed for a professional textbook, where the content spans the entire width. The vibrant chart represents training and inference frameworks for ML. Icons for TensorFlow, Keras, PyTorch, ONNX, and TensorRT are spread out, filling the entire horizontal space, and aligned vertically. Each icon is accompanied by brief annotations detailing their features. The lively colors like blues, greens, and oranges highlight the icons and sections against a soft gradient background. The distinction between training and inference frameworks is accentuated through color-coded sections, with clean lines and modern typography maintaining clarity and focus._](./images/cover_ml_frameworks.png) + +In this chapter, we explore the landscape of AI frameworks that serve as the foundation for developing machine learning systems. AI frameworks provide the essential tools, libraries, and environments necessary to design, train, and deploy machine learning models. We delve into the evolutionary trajectory of these frameworks, dissect the workings of TensorFlow, and provide insights into the core components and advanced features that define these frameworks. + +Furthermore, we investigate the specialization of frameworks tailored to specific needs, the emergence of frameworks specifically designed for embedded AI, and the criteria for selecting the most suitable framework for your project. This exploration will be rounded off by a glimpse into the future trends that are expected to shape the landscape of ML frameworks in the coming years. + +::: {.callout-tip} ## Learning Objectives -* coming soon. +* Understand the evolution and capabilities of major machine learning frameworks. This includes graph execution models, programming paradigms, hardware acceleration support, and how they have expanded over time. + +* Learn the core components and functionality of frameworks like computational graphs, data pipelines, optimization algorithms, training loops, etc. that enable efficient model building. + +* Compare frameworks across different environments like cloud, edge, and tinyML. Learn how frameworks specialize based on computational constraints and hardware. + +* Dive deeper into embedded and tinyML focused frameworks like TensorFlow Lite Micro, CMSIS-NN, TinyEngine etc. and how they optimize for microcontrollers. + +* Explore model conversion and deployment considerations when choosing a framework, including aspects like latency, memory usage, and hardware support. + +* Evaluate key factors in selecting the right framework like performance, hardware compatibility, community support, ease of use, etc. based on the specific project needs and constraints. + +* Understand the limitations of current frameworks and potential future trends like using ML to improve frameworks, decomposed ML systems, and high performance compilers. 
::: ## Introduction -Explanation: Discuss what ML frameworks are and why they are important. Also, elaborate on the aspects involved in understanding how an ML framework is developed and deployed. +Machine learning frameworks provide the tools and infrastructure to +efficiently build, train, and deploy machine learning models. In this +chapter, we will explore the evolution and key capabilities of major +frameworks like [TensorFlow (TF)](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/), and specialized frameworks for +embedded devices. We will dive into the components like computational +graphs, optimization algorithms, hardware acceleration, and more that +enable developers to quickly construct performant models. Understanding +these frameworks is essential to leverage the power of deep learning +across the spectrum from cloud to edge devices. -- Definition of ML Frameworks -- What is an ML framework? -- Why are ML frameworks important? -- Go over the design and implementation -- Examples of ML frameworks -- Challenges of embedded systems +ML frameworks handle much of the complexity of model development through +high-level APIs and domain-specific languages that allow practitioners +to quickly construct models by combining pre-made components and +abstractions. For example, frameworks like TensorFlow and PyTorch +provide Python APIs to define neural network architectures using layers, +optimizers, datasets, and more. This enables rapid iteration compared to +coding every model detail from scratch. -## Evolution of AI Frameworks +A key capability offered by frameworks is distributed training engines +that can scale model training across clusters of GPUs and TPUs. This +makes it feasible to train state-of-the-art models with billions or +trillions of parameters on vast datasets. Frameworks also integrate with +specialized hardware like NVIDIA GPUs to further accelerate training via +optimizations like parallelization and efficient matrix operations. -- High-level vs. low-level frameworks -- Static vs. dynamic computation graph frameworks -- Plot showing number of different frameworks and shrinking +In addition, frameworks simplify deploying finished models into +production through tools like [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving) for scalable model +serving and [TensorFlow Lite](https://www.tensorflow.org/lite) for optimization on mobile and edge devices. +Other valuable capabilities include visualization, model optimization +techniques like quantization and pruning, and monitoring metrics during +training. -## Types of AI Frameworks +Leading open source frameworks like TensorFlow, PyTorch, and [MXNet](https://mxnet.apache.org/versions/1.9.1/) power +much of AI research and development today. 
Commercial offerings like +[Amazon SageMaker](https://aws.amazon.com/pm/sagemaker/?trk=b6c2fafb-22b1-4a97-a2f7-7e4ab2c7aa28&sc_channel=ps&ef_id=CjwKCAjws9ipBhB1EiwAccEi1JpbBz6j4t7sRUoAiKFDc0mi59faZYge5MuFecAU6zGDQYTFz9NnaBoCV-wQAvD_BwE:G:s&s_kwcid=AL!4422!3!651751060692!e!!g!!amazon%20sagemaker!19852662230!145019225977) and [Microsoft Azure Machine Learning](https://azure.microsoft.com/en-us/free/machine-learning/search/?ef_id=_k_CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE_k_&gad=1&gclid=CjwKCAjws9ipBhB1EiwAccEi1JVOThls797Sj3Li96_GYjoJQDx_EWaXNsDaEWeFbIaRkESUCkq64xoCSmwQAvD_BwE) integrate these +open source frameworks with proprietary capabilities and enterprise +tools. -- Cloud-based AI frameworks -- Edge AI frameworks -- TinyML frameworks +Machine learning engineers and practitioners leverage these robust +frameworks to focus on high-value tasks like model architecture, feature +engineering, and hyperparameter tuning instead of infrastructure. The +goal is to efficiently build and deploy performant models that solve +real-world problems. -## Popular AI Frameworks +In this chapter, we will explore today\'s leading cloud frameworks and +how they have adapted models and tools specifically for embedded and +edge deployment. We will compare programming models, supported hardware, +optimization capabilities, and more to fully understand how frameworks +enable scalable machine learning from the cloud to the edge. -Explanation: Discuss the most common types of ML frameworks available and provide a high-level overview, so that we can set into motion what makes embedded ML frameworks unique. +## Framework Evolution -- TensorFlow, PyTorch, Keras, ONNX Runtime, Scikit-learn -- Key Features and Advantages -- API and Programming Paradigms -- Table comparing the different frameworks +Machine learning frameworks have evolved significantly over time to meet +the diverse needs of machine learning practitioners and advancements in +AI techniques. A few decades ago, building and training machine learning +models required extensive low-level coding and infrastructure. Machine +learning frameworks have evolved considerably over the past decade to +meet the expanding needs of practitioners and rapid advances in deep +learning techniques. Early neural network research was constrained by +insufficient data and compute power. Building and training machine +learning models required extensive low-level coding and infrastructure. +But the release of large datasets like [ImageNet](https://www.image-net.org/) [@deng2009imagenet] and advancements +in parallel GPU computing unlocked the potential for far deeper neural +networks. -## Basic Components +The first ML frameworks, [Theano](https://pypi.org/project/Theano/#:~:text=Theano%20is%20a%20Python%20library,a%20similar%20interface%20to%20NumPy's.) by @al2016theano and [Caffe](https://caffe.berkeleyvision.org/) by @jia2014caffe, were developed +by academic institutions (Montreal Institute for Learning Algorithms, +Berkeley Vision and Learning Center). 
Amid a growing interest in deep +learning due to state-of-the-art performance of AlexNet @krizhevsky2012imagenet on the +ImageNet dataset, private companies and individuals began developing ML +frameworks, resulting in frameworks such as [Keras](https://keras.io/) by @chollet2018keras, [Chainer](https://chainer.org/) by @tokui2015chainer, +TensorFlow from Google [@abadi2016tensorflow], [CNTK](https://learn.microsoft.com/en-us/cognitive-toolkit/) by Microsoft [@seide2016cntk], and PyTorch by +Facebook [@paszke2019pytorch]. -- Computational graphs -- Tensor data structures -- Distributed training -- Model optimizations -- Code generation -- Differentiable programming -- Hardware acceleration support (GPUs, TPUs) +Many of these ML frameworks can be divided into categories, namely +high-level vs. low-level frameworks and static vs. dynamic computational +graph frameworks. High-level frameworks provide a higher level of +abstraction than low-level frameworks. That is, high-level frameworks +have pre-built functions and modules for common ML tasks, such as +creating, training, and evaluating common ML models as well as +preprocessing data, engineering features, and visualizing data, which +low-level frameworks do not have. Thus, high-level frameworks may be +easier to use, but are not as customizable as low-level frameworks (i.e. +users of low-level frameworks can define custom layers, loss functions, +optimization algorithms, etc.). Examples of high-level frameworks +include TensorFlow/Keras and PyTorch. Examples of low-level ML +frameworks include TensorFlow with low-level APIs, Theano, Caffe, +Chainer, and CNTK. -## Advanced Features +Frameworks like Theano and Caffe used static computational graphs which +required rigidly defining the full model architecture upfront. Static +graphs require upfront declaration and limit flexibility. Dynamic graphs +construct on-the-fly for more iterative development. But around 2016, +frameworks began adopting dynamic graphs like PyTorch and TensorFlow 2.0 +which can construct graphs on-the-fly. This provides greater flexibility +for model development. We will discuss these concepts and details later +on in the AI Training section. -- AutoML, No-Code/Low-Code ML -- Transfer learning -- Federated learning -- Model conversion -- Distributed training -- End-to-End ML Platforms +The development of these frameworks facilitated an explosion in model +size and complexity over time---from early multilayer perceptrons and +convolutional networks to modern transformers with billions or trillions +of parameters. In 2016, ResNet models by @he2016deep achieved record ImageNet accuracy +with over 150 layers and 25 million parameters. Then in 2020, the GPT-3 +language model from OpenAI [@brown2020language] pushed parameters to an astonishing 175 billion using +model parallelism in frameworks to train across thousands of GPUs and +TPUs. -## Embedded AI Constraints +Each generation of frameworks unlocked new capabilities that powered +advancement: -Explanation: Describe the constraints of embedded systems, referring to the previous chapters, and remind readers about the challenges and why we need to consider creating lean and efficient solutions. +- Theano and TensorFlow (2015) introduced computational graphs and automatic differentiation to simplify model building. -### Hardware +- CNTK (2016) pioneered efficient distributed training by combining model and data parallelism. 
-- Memory Usage -- Processing Power -- Energy Efficiency -- Storage Limitations -- Hardware Diversity +- PyTorch (2016) provided imperative programming and dynamic graphs for flexible experimentation. -### Software +- TensorFlow 2.0 (2019) made eager execution default for intuitiveness and debugging. + +- TensorFlow Graphics (2020) added 3D data structures to handle point clouds and meshes. + +In recent years, there has been a convergence on the frameworks. +TensorFlow and PyTorch have become the overwhelmingly dominant ML +frameworks, representing more than 95% of ML frameworks used in research +and production. Keras was integrated into TensorFlow in 2019; Preferred +Networks transitioned Chainer to PyTorch in 2019; and Microsoft stopped +actively developing CNTK in 2022 in favor of supporting PyTorch on +Windows. + +![Popularity of ML frameworks in the United States as measured by Google +web searches](images_ml_frameworks/image6.png){width="3.821385608048994in" +height="2.5558081802274715in"} + +However, a one-size-fits-all approach does not work well across the +spectrum from cloud to tiny edge devices. Different frameworks represent +various philosophies around graph execution, declarative versus +imperative APIs, and more. Declarative defines what the program should +do while imperative focuses on how it should do it step-by-step. For +instance, TensorFlow uses graph execution and declarative-style modeling +while PyTorch adopts eager execution and imperative modeling for more +Pythonic flexibility. Each approach carries tradeoffs that we will +discuss later in the Basic Components section. + +Today\'s advanced frameworks enable practitioners to develop and deploy +increasingly complex models - a key driver of innovation in the AI +field. But they continue to evolve and expand their capabilities for the +next generation of machine learning. To understand how these systems +continue to evolve, we will dive deeper into TensorFlow as an example of +how the framework grew in complexity over time. + +## DeepDive into TensorFlow + +TensorFlow was developed by the Google Brain team and was released as an +open-source software library on November 9, 2015. It was designed for +numerical computation using data flow graphs and has since become +popular for a wide range of machine learning and deep learning +applications. + +TensorFlow is both a training and inference framework and provides +built-in functionality to handle everything from model creation and +training, to deployment. Since its initial development, the TensorFlow +ecosystem has grown to include many different "varieties" of TensorFlow +that are each intended to allow users to support ML on different +platforms. In this section, we will mainly discuss only the core +package. + +### TF Ecosystem + +1. [TensorFlow Core](https://www.tensorflow.org/tutorials): primary package that most developers engage with. It provides a comprehensive, flexible platform for defining, training, and deploying machine learning models. It includes tf.keras as its high-level API. + +2. [TensorFlow Lite](https://www.tensorflow.org/lite): designed for deploying lightweight models on mobile, embedded, and edge devices. It offers tools to convert TensorFlow models to a more compact format suitable for limited-resource devices and provides optimized pre-trained models for mobile. + +3. [TensorFlow.js](https://www.tensorflow.org/js): JavaScript library that allows training and deployment of machine learning models directly in the browser or on Node.js. 
It also provides tools for porting pre-trained TensorFlow models to the browser-friendly format. + +4. [TensorFlow on Edge Devices (Coral)](https://developers.googleblog.com/2019/03/introducing-coral-our-platform-for.html): platform of hardware components and software tools from Google that allows the execution of TensorFlow models on edge devices, leveraging Edge TPUs for acceleration. + +5. [TensorFlow Federated (TFF)](https://www.tensorflow.org/federated): framework for machine learning and other computations on decentralized data. TFF facilitates federated learning, allowing model training across many devices without centralizing the data. + +6. [TensorFlow Graphics](https://www.tensorflow.org/graphics): library for using TensorFlow to carry out graphics-related tasks, including 3D shapes and point clouds processing, using deep learning. + +7. [TensorFlow Hub](https://www.tensorflow.org/hub): repository of reusable machine learning model components to allow developers to reuse pre-trained model components, facilitating transfer learning and model composition + +8. [TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving): framework designed for serving and deploying machine learning models for inference in production environments. It provides tools for versioning and dynamically updating deployed models without service interruption. + +9. [TensorFlow Extended (TFX)](https://www.tensorflow.org/tfx): end-to-end platform designed to deploy and manage machine learning pipelines in production settings. TFX encompasses components for data validation, preprocessing, model training, validation, and serving. + +TensorFlow was developed to address the limitations of DistBelief [@abadi2016tensorflow]---the +framework in use at Google from 2011 to 2015---by providing flexibility +along three axes: 1) defining new layers, 2) refining training +algorithms, and 3) defining new training algorithms. To understand what +limitations in DistBelief led to the development of TensorFlow, we will +first give a brief overview of the Parameter Server Architecture that +DistBelief employed [@dean2012large]. + +The Parameter Server (PS) architecture is a popular design for +distributing the training of machine learning models, especially deep +neural networks, across multiple machines. The fundamental idea is to +separate the storage and management of model parameters from the +computation used to update these parameters: + +**Storage**: The storage and management of model parameters were handled +by the stateful parameter server processes. Given the large scale of +models and the distributed nature of the system, these parameters were +sharded across multiple parameter servers. Each server maintained a +portion of the model parameters, making it \"stateful\" as it had to +maintain and manage this state across the training process. + +**Computation**: The worker processes, which could be run in parallel, +were stateless and purely computational, processing data and computing +gradients without maintaining any state or long-term memory [@li2014communication]. + +DistBelief and its architecture defined above were crucial in enabling +distributed deep learning at Google but also introduced limitations that +motivated the development of TensorFlow: + +### Static Computation Graph + +In the parameter server architecture, model parameters are distributed +across various parameter servers. 
Since DistBelief was primarily +designed for the neural network paradigm, parameters corresponded to a +fixed structure of the neural network. If the computation graph were +dynamic, the distribution and coordination of parameters would become +significantly more complicated. For example, a change in the graph might +require the initialization of new parameters or the removal of existing +ones, complicating the management and synchronization tasks of the +parameter servers. This made it harder to implement models outside the +neural framework or models that required dynamic computation graphs. + +TensorFlow was designed to be a more general computation framework where +the computation is expressed as a data flow graph. This allows for a +wider variety of machine learning models and algorithms outside of just +neural networks, and provides flexibility in refining models. + +### Usability & Deployment + +The parameter server model involves a clear delineation of roles (worker +nodes and parameter servers), and is optimized for data center +deployments which might not be optimal for all use cases. For instance, +on edge devices or in other non-data center environments, this division +introduces overheads or complexities. + +TensorFlow was built to run on multiple platforms, from mobile devices +and edge devices, to cloud infrastructure. It also aimed to provide ease +of use between local and distributed training, and to be more +lightweight, and developer friendly. + +### Architecture Design + +Rather than using the parameter server architecture, TensorFlow instead +deploys tasks across a cluster. These tasks are named processes that can +communicate over a network, and each can execute TensorFlow\'s core +construct: the dataflow graph, and interface with various computing +devices (like CPUs or GPUs). This graph is a directed representation +where nodes symbolize computational operations, and edges depict the +tensors (data) flowing between these operations. + +Despite the absence of traditional parameter servers, some tasks, called +"PS tasks", still perform the role of storing and managing parameters, +reminiscent of parameter servers in other systems. The remaining tasks, +which usually handle computation, data processing, and gradient +calculations, are referred to as \"worker tasks.\" TensorFlow\'s PS +tasks can execute any computation representable by the dataflow graph, +meaning they aren\'t just limited to parameter storage, and the +computation can be distributed. This capability makes them significantly +more versatile and gives users the power to program the PS tasks using +the standard TensorFlow interface, the same one they\'d use to define +their models. As mentioned above, dataflow graphs' structure also makes +it inherently good for parallelism allowing for processing of large +datasets. + +### Built-in Functionality & Keras + +TensorFlow includes libraries to help users develop and deploy more +use-case specific models, and since this framework is open-source, this +list continues to grow. These libraries address the entire ML +development life-cycle: data preparation, model building, deployment, as +well as responsible AI. + +Additionally, one of TensorFlow's biggest advantages is its integration +with Keras, though as we will cover in the next section, Pytorch recently also added a Keras integration. Keras is another ML framework that was built to be extremely +user-friendly and as a result has a high level of abstraction. 
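As a brief illustration of that level of abstraction, a small classifier can be declared, compiled, and trained in just a few lines with the standard `tf.keras` API. This is only a minimal sketch; the layer sizes and the random stand-in data are arbitrary placeholders:

```python
import numpy as np
import tensorflow as tf

# Toy stand-in data: 1,000 examples with 20 features and 3 classes.
x = np.random.rand(1000, 20).astype("float32")
y = np.random.randint(0, 3, size=(1000,))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(20,)),
    tf.keras.layers.Dense(3, activation="softmax"),
])

# The framework wires together the loss, optimizer, and training loop.
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.fit(x, y, epochs=3, batch_size=32)
```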
We will +cover Keras in more depth later in this chapter, but when discussing its +integration with TensorFlow, the most important thing to note is that it +was originally built to be backend agnostic. This means users could +abstract away these complexities, offering a cleaner, more intuitive way +to define and train models without worrying about compatibility issues +with different backends. TensorFlow users had some complaints about the +usability and readability of TensorFlow's API, so as TF gained +prominence it integrated Keras as its high-level API. This integration +offered major benefits to TensorFlow users since it introduced more +intuitive readability, and portability of models while still taking +advantage of powerful backend features, Google support, and +infrastructure to deploy models on various platforms. + +### Limitations and Challenges + +TensorFlow is one of the most popular deep learning frameworks but does +have criticisms and weaknesses-- mostly focusing on usability, and +resource usage. The rapid pace of updates through its support from +Google, while advantageous, has sometimes led to issues of backward +compatibility, deprecated functions, and shifting documentation. +Additionally, even with the Keras implementation, the syntax and +learning curve of TensorFlow can be difficult for new users. One major +critique of TensorFlow is its high overhead and memory consumption due +to the range of built in libraries and support. Some of these concerns +can be addressed by using pared down versions, but can still be limiting +in resource-constrained environments. + +### PyTorch vs. TensorFlow + +PyTorch and TensorFlow have established themselves as frontrunners in +the industry. Both frameworks offer robust functionalities, but they +differ in terms of their design philosophies, ease of use, ecosystem, +and deployment capabilities. + +**Design Philosophy and Programming Paradigm:** PyTorch uses a dynamic +computational graph, termed as eager execution. This makes it intuitive +and facilitates debugging since operations are executed immediately and +can be inspected on-the-fly. In comparison, earlier versions of +TensorFlow were centered around a static computational graph, which +required the graph\'s complete definition before execution. However, +TensorFlow 2.0 introduced eager execution by default, making it more +aligned with PyTorch in this regard. PyTorch\'s dynamic nature and +Python based approach has enabled its simplicity and flexibility, +particularly for rapid prototyping. TensorFlow\'s static graph approach +in its earlier versions had a steeper learning curve; the introduction +of TensorFlow 2.0, with its Keras integration as the high-level API, has +significantly simplified the development process. + +**Deployment:** PyTorch is heavily favored in research environments, +deploying PyTorch models in production settings was traditionally +challenging. However, with the introduction of TorchScript and the +TorchServe tool, deployment has become more feasible. One of +TensorFlow\'s strengths lies in its scalability and deployment +capabilities, especially on embedded and mobile platforms with +TensorFlow Lite. TensorFlow Serving and TensorFlow.js further facilitate +deployment in various environments, thus giving it a broader reach in +the ecosystem. + +**Performance:** Both frameworks offer efficient hardware acceleration +for their operations. 
However, TensorFlow has a slightly more robust +optimization workflow, such as the XLA (Accelerated Linear Algebra) +compiler, which can further boost performance. Its static computational +graph, in the early versions, was also advantageous for certain +optimizations. + +**Ecosystem:** PyTorch has a growing ecosystem with tools like +TorchServe for serving models and libraries like TorchVision, TorchText, +and TorchAudio for specific domains. As we mentioned earlier, TensorFlow +has a broad and mature ecosystem. TensorFlow Extended (TFX) provides an +end-to-end platform for deploying production machine learning pipelines. +Other tools and libraries include TensorFlow Lite, TensorFlow.js, +TensorFlow Hub, and TensorFlow Serving. + +Here's a summarizing comparative analysis: + +| Feature/Aspect | PyTorch | TensorFlow | +|-----------------------------|------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------| +| Design Philosophy | Dynamic computational graph (eager execution) | Static computational graph (early versions); Eager execution in TensorFlow 2.0 | +| Deployment | Traditionally challenging; Improved with TorchScript & TorchServe | Scalable, especially on embedded platforms with TensorFlow Lite | +| Performance & Optimization | Efficient GPU acceleration | Robust optimization with XLA compiler | +| Ecosystem | TorchServe, TorchVision, TorchText, TorchAudio | TensorFlow Extended (TFX), TensorFlow Lite, TensorFlow.js, TensorFlow Hub, TensorFlow Serving | +| Ease of Use | Preferred for its Pythonic approach and rapid prototyping | Initially steep learning curve; Simplified with Keras in TensorFlow 2.0 | + + +## Basic Framework Components + +### Tensor data structures + +To understand tensors, let us start from the familiar concepts in linear +algebra. Vectors can be represented as a stack of numbers in a +1-dimensional array. Matrices follow the same idea, and one can think of +them as many vectors being stacked on each other, making it 2 +dimensional. Higher dimensional tensors work the same way. A +3-dimensional tensor is simply a set of matrices stacked on top of each +other in another direction. The figure below demonstrates this step. +Therefore, vectors and matrices can be considered special cases of +tensors, with 1D and 2D dimensions respectively. + +![Visualization of Tensor Data Structure](images_ml_frameworks/image2.png){width="3.9791666666666665in" height="1.9672287839020122in" caption="Visualization of Tensor Data Structure" align="center"} + +Defining formally, in machine learning, tensors are a multi-dimensional +array of numbers. The number of dimensions defines the rank of the +tensor. As a generalization of linear algebra, the study of tensors is +called multilinear algebra. There are noticeable similarities between +matrices and higher ranked tensors. First, it is possible to extend the +definitions given in linear algebra to tensors, such as with +eigenvalues, eigenvectors, and rank (in the linear algebra sense) . +Furthermore, with the way that we have defined tensors, it is possible +to turn higher dimensional tensors into matrices. This turns out to be +very critical in practice, as multiplication of abstract representations +of higher dimensional tensors are often completed by first converting +them into matrices for multiplication. + +Tensors offer a flexible data structure with its ability to represent +data in higher dimensions. 
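In code, higher-rank tensors are typically built by stacking lower-rank ones. A small sketch using TensorFlow's tensor API, with shapes chosen arbitrarily:

```python
import tensorflow as tf

v = tf.constant([1.0, 2.0, 3.0])   # rank-1 tensor (vector), shape (3,)
m = tf.stack([v, v, v])            # rank-2 tensor (matrix), shape (3, 3)
t = tf.stack([m, m])               # rank-3 tensor, shape (2, 3, 3)

print(v.shape, m.shape, t.shape)   # (3,) (3, 3) (2, 3, 3)
print(tf.rank(t).numpy())          # 3
```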
For example, to represent color image data, +for each of the pixel values (in 2 dimensions), one needs the color +values for red, green and blue. With tensors, it is easy to contain +image data in a single 3-dimensional tensor with each of the numbers +within it representing a certain color value in the certain location of +the image. Extending even further, if we wanted to store a series of +images, we can simply extend the dimensions such that the new dimension +(to create a 4-dimensional tensor) represents the different images that +we have. This is exactly what the famous [MNIST](https://www.tensorflow.org/datasets/catalog/mnist) dataset does, +loading a single 4-dimensional tensor when one calls to load the +dataset, allowing a compact representation of all the data in one place. + +### Computational graphs + +#### Graph Definition + +Computational graphs are a key component of deep learning frameworks +like TensorFlow and PyTorch. They allow us to express complex neural +network architectures in a way that can be efficiently executed and +differentiated. A computational graph consists of a directed acyclic +graph (DAG) where each node represents an operation or variable, and +edges represent data dependencies between them. + +For example, a node might represent a matrix multiplication operation, +taking two input matrices (or tensors) and producing an output matrix +(or tensor). To visualize this, consider the simple example below. The +directed acyclic graph above computes $z = x \times y$, where each of +the variables are just numbers. + +![Basic Example of Computational Graph](images_ml_frameworks/image1.png){width="50%" height="auto" align="center" caption="Basic Example of Computational Graph"} + +Underneath the hood, the computational graphs represent abstractions for +common layers like convolutional, pooling, recurrent, and dense layers, +with data including activations, weights, biases, are represented in +tensors. Convolutional layers form the backbone of CNN models for +computer vision. They detect spatial patterns in input data through +learned filters. Recurrent layers like LSTMs and GRUs enable processing +sequential data for tasks like language translation. Attention layers +are used in transformers to draw global context from the entire input. + +Broadly speaking, layers are higher level abstractions that define +computations on top of those tensors. For example, a Dense layer +performs a matrix multiplication and addition between input/weight/bias +tensors. Note that a layer operates on tensors as inputs and outputs and +the layer itself is not a tensor. Some key differences: + +- Layers contain states like weights and biases. Tensors are + stateless, just holding data. + +- Layers can modify internal state during training. Tensors are + immutable/read-only. + +- Layers are higher level abstractions. Tensors are lower level, + directly representing data and math operations. + +- Layers define fixed computation patterns. Tensors flow between + layers during execution. + +- Layers are used indirectly when building models. Tensors flow + > between layers during execution. + +So while tensors are a core data structure that layers consume and +produce, layers have additional functionality for defining parameterized +operations and training. While a layer configures tensor operations +under the hood, the layer itself remains distinct from the tensor +objects. The layer abstraction makes building and training neural +networks much more intuitive. 
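The distinction can be seen directly in code: a layer is a stateful object that owns trainable weights, while the tensors flowing through it are just immutable values. A minimal sketch using the public `tf.keras` API:

```python
import tensorflow as tf

layer = tf.keras.layers.Dense(4)          # a layer: holds state (kernel and bias)
x = tf.random.normal([2, 3])              # a tensor: stateless data, shape (2, 3)

y = layer(x)                              # calling the layer builds its weights
                                          # and returns a new output tensor
print(y.shape)                            # (2, 4)
print([w.shape for w in layer.weights])   # kernel (3, 4) and bias (4,)
```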
This sort of abstraction enables +developers to build models by stacking these layers together, without +having to implement the layer logic themselves. For example, calling +`tf.keras.layers.Conv2D` in TensorFlow creates a convolutional layer. The +framework handles computing the convolutions, managing parameters, etc. +This simplifies model development, allowing developers to focus on +architecture rather than low-level implementations. Layer abstractions +utilize highly optimized implementations for performance. They also +enable portability, as the same architecture can run on different +hardware backends like GPUs and TPUs. + +In addition, computational graphs include activation functions like +ReLU, sigmoid, and tanh that are essential to neural networks and many +frameworks provide these as standard abstractions. These functions +introduce non-linearities that enable models to approximate complex +functions. Frameworks provide these as simple, pre-defined operations +that can be used when constructing models. For example, tf.nn.relu in +TensorFlow. This abstraction enables flexibility, as developers can +easily swap activation functions for tuning performance. Pre-defined +activations are also optimized by the framework for faster execution. + +In recent years, models like ResNets and MobileNets have emerged as +popular architectures, with current frameworks pre-packaging these as +computational graphs. Rather than worrying about the fine details, +developers can utilize them as a starting point, customizing as needed +by substituting layers. This simplifies and speeds up model development, +avoiding reinventing architectures from scratch. Pre-defined models +include well-tested, optimized implementations that ensure good +performance. Their modular design also enables transferring learned +features to new tasks via transfer learning. In essence, these +pre-defined architectures provide high-performance building blocks to +quickly create robust models. + +These layer abstractions, activation functions, and predefined +architectures provided by the frameworks are what constitute a +computational graph. When a user defines a layer in a framework (e.g. +tf.keras.layers.Dense()), the framework is configuring computational +graph nodes and edges to represent that layer. The layer parameters like +weights and biases become variables in the graph. The layer computations +become operation nodes (such as the x and y in the figure above). When +you call an activation function like tf.nn.relu(), the framework adds a +ReLU operation node to the graph. Predefined architectures are just +pre-configured subgraphs that can be inserted into your model\'s graph. +Thus, model definition via high-level abstractions creates a +computational graph. The layers, activations, and architectures we use +become graph nodes and edges. + +When we define a neural network architecture in a framework, we are +implicitly constructing a computational graph. The framework uses this +graph to determine operations to run during training and inference. +Computational graphs bring several advantages over raw code and that's +one of the core functionalities that is offered by a good ML framework: + +- Explicit representation of data flow and operations + +- Ability to optimize graph before execution + +- Automatic differentiation for training + +- Language agnosticism - graph can be translated to run on GPUs, TPUs, etc. 
+ +- Portability - graph can be serialized, saved, and restored later + +Computational graphs are the fundamental building blocks of ML +frameworks. Model definition via high-level abstractions creates a +computational graph. The layers, activations, and architectures we use +become graph nodes and edges. The framework compilers and optimizers +operate on this graph to generate executable code. Essentially, the +abstractions provide a developer-friendly API for building computational +graphs. Under the hood, it\'s still graphs all the way down! So while +you may not directly manipulate graphs as a framework user, they enable +your high-level model specifications to be efficiently executed. The +abstractions simplify model-building while computational graphs make it +possible. + +#### Static vs. Dynamic Graphs + +Deep learning frameworks have traditionally followed one of two +approaches for expressing computational graphs. + +**Static graphs (declare-then-execute):** With this model, the entire +computational graph must be defined upfront before it can be run. All +operations and data dependencies must be specified during the +declaration phase. TensorFlow originally followed this static approach - +models were defined in a separate context, then a session was created to +run them. The benefit of static graphs is they allow more aggressive +optimization, since the framework can see the full graph. But it also +tends to be less flexible for research and interactivity. Changes to the +graph require re-declaring the full model. + +For example: + +```{{python}} +x = tf.placeholder(tf.float32) +y = tf.matmul(x, weights) + biases +``` + +The model is defined separately from execution, like building a +blueprint. For TensorFlow 1.x, this is done using tf.Graph(). All ops +and variables must be declared upfront. Subsequently, the graph is +compiled and optimized before running. Execution is done later by +feeding in tensor values. + +**Dynamic graphs (define-by-run):** In contrast to declare (all) first +and then execute, the graph is built dynamically as execution happens. +There is no separate declaration phase - operations execute immediately +as they are defined. This style is more imperative and flexible, +facilitating experimentation. + +PyTorch uses dynamic graphs, building the graph on-the-fly as execution +happens. For example, consider the following code snippet, where the +graph is built as the execution is taking place: + +```{{python}} +x = torch.randn(4,784) +y = torch.matmul(x, weights) + biases +``` + +In the above example, there are no separate compile/build/run phases. +Ops define and execute immediately. With dynamic graphs, definition is +intertwined with execution. This provides a more intuitive, interactive +workflow. But the downside is less potential for optimizations, since +the framework only sees the graph as it is built. + +Recently, however, the distinction has blurred as frameworks adopt both +modes. TensorFlow 2.0 defaults to dynamic graph mode, while still +letting users work with static graphs when needed. Dynamic declaration +makes frameworks easier to use, while static models provide optimization +benefits. The ideal framework offers both options. + +Static graph declaration provides optimization opportunities but less +interactivity. While dynamic execution offers flexibility and ease of +use, it may have performance overhead. 
Here is a table comparing the pros and cons of static vs dynamic execution graphs:

| Execution Graph | Pros | Cons |
| --- | --- | --- |
| Static (Declare-then-execute) | Enable graph optimizations by seeing full model ahead of time <br> Can export and deploy frozen graphs <br> Graph is packaged independently of code | Less flexible for research and iteration <br> Changes require rebuilding graph <br> Execution has separate compile and run phases |
| Dynamic (Define-by-run) | Intuitive imperative style like Python code <br> Interleave graph build with execution <br> Easy to modify graphs <br> Debugging seamlessly fits workflow | Harder to optimize without full graph <br> Possible slowdowns from graph building during execution <br> Can require more memory |

### Data Pipeline Tools

Computational graphs can only be as good as the data they learn from and work on. Therefore, feeding training data efficiently is crucial for optimizing deep neural network performance, though it is often overlooked as one of the core functionalities. Many modern AI frameworks provide specialized pipelines to ingest, process, and augment datasets for model training.

#### Data Loaders

At the core of these pipelines are data loaders, whose job is to read training examples from sources like files, databases, and object storage. Deep learning models consume diverse data formats depending on the application. Among the popular formats are:

- CSV: a versatile, simple format often used for tabular data.
- TFRecord: TensorFlow's proprietary format, optimized for performance.
- Parquet: columnar storage, offering efficient data compression and retrieval.
- JPEG/PNG: commonly used for image data.
- WAV/MP3: prevalent formats for audio data.

`tf.data`, for instance, is TensorFlow's data loading pipeline.

Data loaders batch examples to leverage vectorization support in hardware. Batching refers to grouping multiple data points for simultaneous processing, leveraging the vectorized computation capabilities of hardware like GPUs. While typical batch sizes range from 32 to 512 examples, the optimal size often depends on the memory footprint of the data and the specific hardware constraints. Advanced loaders can also stream virtually unlimited datasets from disk and cloud storage, reading data over the network instead of loading it fully into memory.

Data loaders can also shuffle data across epochs for randomization, and preprocess features in parallel with model training to expedite the training process. Randomly shuffling the order of examples between training epochs reduces bias and improves generalization.

Finally, data loaders support caching and prefetching strategies to optimize data delivery for fast, smooth model training. Caching keeps preprocessed batches in memory so they can be reused across training steps, eliminating redundant processing. Prefetching, on the other hand, preloads subsequent batches so the model never idles waiting for data.

### Data Augmentation

Besides loading, data augmentation expands datasets synthetically. For images, augmentations apply random transformations like flipping, cropping, rotating, altering color, and adding noise. For audio, common augmentations involve mixing clips with background noise or modulating speed, pitch, and volume.

Augmentations increase variation in the training data, and frameworks like TensorFlow and PyTorch make it easy to integrate them into the data pipeline so they are applied on-the-fly each epoch. By programmatically increasing variation in the training data distribution, augmentations reduce overfitting and improve model generalization.

Together, performant data loaders and extensive augmentations enable practitioners to feed massive, varied datasets to neural networks efficiently. Hands-off data pipelines represent a significant improvement in usability and productivity.
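As a rough sketch of what such a pipeline looks like in code, the example below builds a small `tf.data` input pipeline with shuffling, on-the-fly augmentation, batching, and prefetching. Random tensors stand in for the data that a real pipeline would read from files such as TFRecords or image folders:

```python
import tensorflow as tf

# Toy stand-in data: 100 "images" of shape (32, 32, 3) with integer labels.
images = tf.random.uniform([100, 32, 32, 3])
labels = tf.random.uniform([100], maxval=10, dtype=tf.int32)

augment = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
])

train_ds = (
    tf.data.Dataset.from_tensor_slices((images, labels))
    .shuffle(100)                                       # reshuffled every epoch
    .map(lambda x, y: (augment(x, training=True), y),
         num_parallel_calls=tf.data.AUTOTUNE)           # on-the-fly augmentation
    .batch(32)                                          # vectorized batches
    .prefetch(tf.data.AUTOTUNE)                         # overlap data prep with training
)
```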
Pipelines like this allow developers to focus more on model architecture and less on data wrangling when training deep learning models.

### Optimization Algorithms

Training a neural network is fundamentally an iterative process that seeks to minimize a loss function. At its core, the goal is to fine-tune the model weights and parameters to produce predictions as close as possible to the true target labels. Machine learning frameworks have greatly streamlined this process by offering extensive support in three critical areas: loss functions, optimization algorithms, and regularization techniques.

Loss functions quantify the difference between the model's predictions and the true values. Different tasks call for different loss functions, since the loss function defines the objective the training process tries to minimize. Commonly used loss functions are Mean Squared Error (MSE) for regression tasks and Cross-Entropy Loss for classification tasks.

To demonstrate, imagine that you have a set of inputs and corresponding outputs, where $Y_n$ denotes the $n$'th target value. The inputs are fed into the model, and the model produces a prediction, which we can call $\hat{Y}_n$. With the predicted value and the real value, we can, for example, use the MSE to calculate the loss:

$$\text{MSE} = \frac{1}{N}\sum_{n=1}^{N}(Y_n - \hat{Y}_n)^2$$

If the problem is a classification problem, we do not want to use the MSE, since the numerical distance between the predicted and true class labels carries no meaning. For example, when recognizing handwritten digits, predicting a 9 for an image of a 2 is not "more wrong" than predicting a 3, even though 9 is numerically further from 2. Therefore, we use the cross-entropy loss function, which is defined as:

$$\text{Cross-Entropy} = -\sum_{n=1}^{N}Y_n\log(\hat{Y}_n)$$

Once the loss is computed, we need methods to adjust the model's parameters so that this loss is reduced during the training process. To do so, current frameworks use a gradient-based approach: they compute how the value of the loss function changes as each weight is nudged, and then move the weights in the direction that reduces the loss. There are many challenges associated with this, primarily stemming from the fact that the optimization problem is not convex, which makes it difficult to solve; more details about this will come in the AI Training section. Modern frameworks come equipped with efficient implementations of several optimization algorithms, many of which are variants of gradient descent with stochastic methods and adaptive learning rates. More information with clear examples can be found in the AI Training section.

Last but not least, overly complex models tend to overfit, meaning they perform well on the training data but fail to generalize to new, unseen data (see Overfitting). To counteract this, regularization methods are employed to penalize model complexity and encourage it to learn simpler patterns. Dropout, for instance, randomly sets a fraction of input units to 0 at each update during training, which helps prevent overfitting.

However, there are also cases where the problem is more complex than what the model can represent, which may result in underfitting. Therefore, choosing the right model architecture is also a critical step in the training process.
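Frameworks expose all three of these ingredients (losses, optimizers, and regularization layers) as ready-made components. Here is a minimal sketch using the public `tf.keras` API; the numbers are toy values:

```python
import tensorflow as tf

# Ready-made loss functions.
mse = tf.keras.losses.MeanSquaredError()
print(mse([3.0, -1.0], [2.5, -0.5]).numpy())      # 0.25

xent = tf.keras.losses.SparseCategoricalCrossentropy()
print(xent([1], [[0.1, 0.8, 0.1]]).numpy())        # ~0.22, i.e. -log(0.8)

# A model with dropout regularization, compiled with an adaptive optimizer.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.5),   # randomly zeroes 50% of activations while training
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              loss=xent)
```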
Further heuristics and techniques are discussed in the AI Training section. + +Frameworks also provide efficient implementations of gradient descent, +Adagrad, Adadelta, and Adam. Adding regularization like dropout and +L1/L2 penalties prevents overfitting during training. Batch +normalization accelerates training by normalizing inputs to layers. + +### Model Training Support + +Before training a defined neural network model, a compilation step is +required. During this step, the high-level architecture of the neural +network is transformed into an optimized, executable format. This +process comprises several steps. The construction of the computational +graph is the first step. It represents all the mathematical operations +and data flow within the model. We discussed this earlier. + +During training, the focus is on executing the computational graph. +Every parameter within the graph, such as weights and biases, is +assigned an initial value. This value might be random or based on a +predefined logic, depending on the chosen initialization method. + +The next critical step is memory allocation. Essential memory is +reserved for the model\'s operations on both CPUs and GPUs, ensuring +efficient data processing. The model\'s operations are then mapped to +the available hardware resources, particularly GPUs or TPUs, to expedite +computation. Once compilation is finalized, the model is prepared for +training. + +The training process employs various tools to enhance efficiency. Batch +processing is commonly used to maximize computational throughput. +Techniques like vectorization enable operations on entire data arrays, +rather than proceeding element-wise, which bolsters speed. Optimizations +such as kernel fusion (refer to the Optimizations chapter) amalgamate +multiple operations into a single action, minimizing computational +overhead. Operations can also be segmented into phases, facilitating the +concurrent processing of different mini-batches at various stages. + +Frameworks consistently checkpoint the state, preserving intermediate +model versions during training. This ensures that if an interruption +occurs, the progress isn\'t wholly lost, and training can recommence +from the last checkpoint. Additionally, the system vigilantly monitors +the model\'s performance against a validation data set. Should the model +begin to overfit (that is, if its performance on the validation set +declines), training is automatically halted, conserving computational +resources and time. + +ML frameworks incorporate a blend of model compilation, enhanced batch +processing methods, and utilities such as checkpointing and early +stopping. These resources manage the complex aspects of performance, +enabling practitioners to zero in on model development and training. As +a result, developers experience both speed and ease when utilizing the +capabilities of neural networks. + +### Validation and Analysis + +After training deep learning models, frameworks provide utilities to +evaluate performance and gain insights into the models\' workings. These +tools enable disciplined experimentation and debugging. + +#### Evaluation Metrics + +Frameworks include implementations of common evaluation metrics for +validation: + +- Accuracy - Fraction of correct predictions overall. Widely used for classification. + +- Precision - Of positive predictions, how many were actually positive. Useful for imbalanced datasets. + +- Recall - Of actual positives, how many did we predict correctly. Measures completeness. 
+ +- F1-score - Harmonic mean of precision and recall. Combines both metrics. + +- AUC-ROC - Area under ROC curve. Used for classification threshold analysis. + +- MAP - Mean Average Precision. Evaluates ranked predictions in retrieval/detection. + +- Confusion Matrix - Matrix that shows the true positives, true negatives, false positives, and false negatives. Provides a more detailed view of classification performance. + +These metrics quantify model performance on validation data for +comparison. + +#### Visualization + +Visualization tools provide insight into models: + +- Loss curves - Plot training and validation loss over time to spot overfitting. + + + +- Activation grids - Illustrate features learned by convolutional filters. + +- Projection - Reduce dimensionality for intuitive visualization. -- Library Dependency -- Lack of OS +- Precision-recall curves - Assess classification tradeoffs. -## Embedded AI Frameworks +Tools like [TensorBoard](https://www.tensorflow.org/tensorboard/scalars_and_keras) +for TensorFlow and [TensorWatch](https://github.com/microsoft/tensorwatch) for PyTorch enable +real-time metrics and visualization during training. -Explanation: Now, discuss specifically about the unique embedded AI frameworks that are available and why they are special, etc. +### Differentiable programming -- TensorFlow Lite -- ONNX Runtime -- MicroPython -- CMSIS-NN -- Edge Impulse -- Others (briefly mention some less common but significant frameworks) +With the machine learning training methods such as backpropagation +relying on the change in the loss function with respect to the change in +weights (which essentially is the definition of derivatives), the +ability to quickly and efficiently train large machine learning models +rely on the computer's ability to take derivatives. This makes +differentiable programming one of the most important elements of a +machine learning framework. + +There are primarily four methods that we can use to make computers take +derivatives. First, we can manually figure out the derivatives by hand +and input them to the computer. One can see that this would quickly +become a nightmare with many layers of neural networks, if we had to +compute all the derivatives in the backpropagation steps by hand. +Another method is symbolic differentiation using computer algebra +systems such as Mathematica, but this can introduce a layer of +inefficiency, as there needs to be a level of abstraction to take +derivatives. Numerical derivatives, the practice of approximating +gradients using finite difference methods, suffer from many problems +including high computational costs, and larger grid size can lead to a +significant amount of errors. This leads to automatic differentiation, +which exploits the primitive functions that computers use to represent +operations to obtain an exact derivative. With automatic +differentiation, computational complexity of computing the gradient is +proportional to computing the function itself. Intricacies of automatic +differentiation are not dealt with by end users now, but resources to +learn more can be found widely, such as from +[here](https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec10.pdf). +Automatic differentiation and differentiable programming today is +ubiquitous and is done efficiently and automatically by modern machine +learning frameworks. 
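As a small illustration, frameworks expose automatic differentiation directly. For instance, TensorFlow's `tf.GradientTape` records the operations applied to a variable and returns an exact gradient (a minimal sketch):

```python
import tensorflow as tf

x = tf.Variable(3.0)
with tf.GradientTape() as tape:
    y = x ** 2 + 2.0 * x       # y = x^2 + 2x

dy_dx = tape.gradient(y, x)     # exact derivative: 2x + 2
print(dy_dx.numpy())            # 8.0
```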
+ +### Hardware Acceleration + +The trend to continuously train and deploy larger machine learning +models has essentially made hardware acceleration support a necessity +for machine learning platforms. Deep layers of neural networks require +many matrix multiplications, which attracts hardware that can compute +matrix operations fast and in parallel. In this landscape, two types of +hardware architectures, the [GPU and +TPU](https://cloud.google.com/tpu/docs/intro-to-tpu), have +emerged as leading choices for training machine learning models. + +The use of hardware accelerators began with +[AlexNet](https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf), +which paved the way for future works to utilize GPUs as hardware +accelerators for training computer vision models. GPUs, or Graphics +Processing Units, excel in handling a large number of computations at once, making them +ideal for the matrix operations that are central to neural network +training. Their architecture, designed for rendering graphics, turns out +to be perfect for the kind of mathematical operations required in +machine learning. While they are very useful for machine learning tasks +and have been implemented in many hardware platforms, GPU's are still +general purpose in that they can be used for other applications. + +On the other hand, [Tensor Processing +Units](https://cloud.google.com/tpu/docs/intro-to-tpu) +(TPU) are hardware units designed specifically for neural networks. They +focus on the multiply and accumulate (MAC) operation, and their hardware +essentially consists of a large hardware matrix that contains elements +efficiently computing the MAC operation. This concept called the [systolic +array +architecture](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1653825), +was pioneered by @kung1979systolic, but has +proven to be a useful structure to efficiently compute matrix products +and other operations within neural networks (such as convolutions). + +While TPU's can drastically reduce training times, it also has +disadvantages. For example, many operations within the machine learning +frameworks (primarily TensorFlow here since the TPU directly integrates +with it) are not supported with the TPU's. It also cannot support custom +custom operations from the machine learning frameworks, and the network +design must closely align to the hardware capabilities. + +Today, NVIDIA GPUs dominate training, aided by software libraries like +[CUDA](https://developer.nvidia.com/cuda-toolkit), +[cuDNN](https://developer.nvidia.com/cudnn), and +[TensorRT.](https://developer.nvidia.com/tensorrt#:~:text=NVIDIA%20TensorRT%2DLLM%20is%20an,knowledge%20of%20C%2B%2B%20or%20CUDA.) +Frameworks also tend to include optimizations to maximize performance on +these hardware types, like pruning unimportant connections and fusing +layers. Combining these techniques with hardware acceleration provides +greater efficiency. For inference, hardware is increasingly moving +towards optimized ASICs and SoCs. Google\'s TPUs accelerate models in +data centers. Apple, Qualcomm, and others now produce AI-focused mobile +chips. The NVIDIA Jetson family targets autonomous robots. + +## Advanced Features {#sec-ai_frameworks-advanced} + +### Distributed training + +As machine learning models have become larger over the years, it has +become essential for large models to utilize multiple computing nodes in +the training process. 
This process, called distributed training, has
+allowed models to be trained at much larger scale, but it has also introduced
+implementation challenges.
+
+We can consider three different ways to spread the work of training
+machine learning models across multiple computing nodes. The first, input data
+partitioning (data parallelism), refers to multiple processors running the same model on
+different partitions of the input data. It is the easiest to implement and is
+available in many machine learning frameworks. The more challenging
+distributions of work are model parallelism, which refers to
+multiple computing nodes working on different parts of the model, and
+pipelined model parallelism, in which multiple computing nodes work on
+different layers of the model for the same input. The latter
+two remain active research areas.
+
+ML frameworks that support distributed training include TensorFlow
+(through its
+[tf.distribute](https://www.tensorflow.org/api_docs/python/tf/distribute)
+module), PyTorch (through its
+[torch.nn.DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html)
+and
+[torch.nn.DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)
+modules), and MXNet (through its
+[gluon](https://mxnet.apache.org/versions/1.9.1/api/python/docs/api/gluon/index.html)
+API).
+
+### Model Conversion
+
+Machine learning models can be represented in various formats so that they
+can be used across different frameworks and device types.
+For example, a model can be converted to be compatible with inference
+frameworks on mobile devices. The default format for TensorFlow
+models is checkpoint files containing weights and architectures, which
+are needed in case we have to retrain the models. For mobile
+deployment, models are typically converted to TensorFlow Lite format.
+TensorFlow Lite uses a compact flatbuffer representation and
+optimizations for fast inference on mobile hardware, discarding all the
+unnecessary baggage associated with training metadata, such as checkpoint
+file structures.
+
+Model optimizations like quantization (see the [Optimizations](./optimizations.qmd) chapter) can further tailor models for target architectures like mobile. This
+reduces the precision of weights and activations to `uint8` or `int8` for a
+smaller footprint and faster execution with supported hardware
+accelerators. For post-training quantization, TensorFlow's converter
+handles the analysis and conversion automatically.
+
+Frameworks like TensorFlow simplify deploying trained models to mobile
+and embedded IoT devices through easy conversion APIs for the TFLite format
+and quantization. Ready-to-use conversion enables high-performance
+inference on mobile without a manual optimization burden. Besides TFLite,
+other common targets include TensorFlow.js for web deployment,
+TensorFlow Serving for cloud services, and TensorFlow Hub for transfer
+learning. TensorFlow's conversion utilities handle these scenarios to
+streamline end-to-end workflows.
+
+More information about model conversion in TensorFlow is linked
+[here](https://www.tensorflow.org/lite/models/convert).
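+
+As a concrete illustration, here is a minimal sketch of that conversion
+flow using TensorFlow's converter with post-training quantization
+enabled; the tiny Keras model is a hypothetical stand-in for a real
+trained network:
+
+``` python
+import tensorflow as tf
+
+# Hypothetical small Keras model standing in for a trained network
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(16, activation="relu", input_shape=(4,)),
+    tf.keras.layers.Dense(3, activation="softmax"),
+])
+
+# Convert to the compact TFLite flatbuffer, enabling post-training quantization
+converter = tf.lite.TFLiteConverter.from_keras_model(model)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+tflite_model = converter.convert()
+
+# Write the flatbuffer to disk for deployment on a mobile or embedded target
+with open("model.tflite", "wb") as f:
+    f.write(tflite_model)
+```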
+
+### AutoML, No-Code/Low-Code ML
+
+In many cases, machine learning can have a relatively high barrier to
+entry compared to other fields. To successfully train and deploy models,
+one needs a solid understanding of a variety of disciplines,
+ranging from data science (data processing, data cleaning) and model design
+(hyperparameter tuning, neural network architecture) to hardware
+(acceleration, parallel processing), and more, depending on the problem
+at hand. The complexity of these problems has led to the introduction
+of frameworks such as AutoML, which aims to make "Machine learning
+available for non-Machine Learning experts" and to "automate research in
+machine learning". This effort has produced AutoWEKA, which aids in the
+complex process of hyperparameter selection, as well as Auto-sklearn and
+Auto-PyTorch, which extend the idea to the popular scikit-learn and
+PyTorch libraries.
+
+While this work on automating parts of machine learning tasks is
+underway, others have focused on making model construction
+easier through no-code/low-code machine learning, using drag-and-drop
+tools with an easy-to-navigate user interface. Companies
+such as Apple, Google, and Amazon have already created such easy-to-use
+platforms to allow users to construct machine learning models that
+integrate with their ecosystems.
+
+These steps to lower the barrier to entry continue to democratize machine
+learning, making it more accessible for beginners and simplifying
+workflows for experts.
+
+### Advanced Learning Methods
+
+#### Transfer Learning
+
+Transfer learning is the practice of using knowledge gained by a
+pretrained model to improve the performance of a model on
+a different task. For example, models that have been trained on the
+ImageNet dataset, such as MobileNet and ResNet, can help classify other
+image datasets. To do so, one may freeze the pretrained model, using
+it as a feature extractor to train a much smaller model that is built on
+top of the extracted features. One can also fine-tune the entire model
+to fit the new task.
+
+Transfer learning comes with its own challenges; in
+particular, the modified model may not be able to perform its original tasks
+after transfer learning. Papers such as ["Learning without
+Forgetting"](https://browse.arxiv.org/pdf/1606.09282.pdf) by @li2017learning
+aim to address these challenges, and the resulting techniques have been implemented in
+modern machine learning platforms.
+
+#### Federated Learning
+
+Consider the problem of labeling items that are present in photos taken on
+personal devices. One may consider moving the image data from the
+devices to a central server, where a single model is trained using the
+image data provided by the devices. However, this presents many
+potential challenges. First, with many devices, one needs a massive
+network infrastructure to move and store data from these devices in a
+central location. With the number of devices present today, this
+is often not feasible and very costly. Furthermore, there are privacy
+challenges associated with moving personal data, such as photos, to
+central servers.
+
+Federated learning, introduced by @mcmahan2023communicationefficient, is
+a form of distributed computing that resolves these issues by
+distributing the models to personal devices so they can be trained on
+device. At the beginning, a base global model is trained on a central
+server to be distributed to all devices.
Using this base model, the +devices individually compute the gradients and send them back to the +central hub. Intuitively this is the transfer of model parameters +instead of the data itself. This innovative approach allows the model to +be trained with many different datasets (which, in our example, would be +the set of images that are on personal devices), without the need to +transfer a large amount of potentially sensitive data. However, +federated learning also comes with a series of challenges. + +In many real-world situations, data collected from devices may not come with suitable labels. This issue is compounded by the fact that users, who are often the primary source of data, can be unreliable. This unreliability means that even when data is labeled, there's no guarantee of its accuracy or relevance. Furthermore, each user's data is unique, resulting in a significant variance in the data generated by different users. This non-IID nature of data, coupled with the unbalanced data production where some users generate more data than others, can adversely impact the performance of the global model. Researchers have worked to compensate for this, such as by +adding a proximal term to achieve a balance between the local and global +model, and adding a frozen [global hypersphere +classifier](https://arxiv.org/abs/2207.09413). + +There are additional challenges associated with federated learning. The number of mobile device owners can far exceed the average number of training samples on each device, leading to substantial communication overhead. This issue is particularly pronounced in the context of mobile networks, which are often used for such communication and can be unstable. This instability can result in delayed or failed transmission of model updates, thereby affecting the overall training process. + +The heterogeneity of device resources is another hurdle. Devices participating in Federated Learning can have varying computational powers and memory capacities. This diversity makes it challenging to design algorithms that are efficient across all devices. Privacy and security issues are not a guarantee for federated learning. Techniques such as inversion gradient attacks can be used to extract information about the training data from the model parameters. Despite these challenges, the large amount of potential benefits continue to make it a popular research area. Open source programs such as [Flower](https://flower.dev/) have been developed to make it simpler to implement federated learning with a variety of machine learning frameworks. + +## Framework Specialization + +Thus far, we have talked about ML frameworks generally. However, +typically frameworks are optimized based on the target environment\'s +computational capabilities and application requirements, ranging from +the cloud to the edge to tiny devices. Choosing the right framework is +crucial based on the target environment for deployment. This section +provides an overview of the major types of AI frameworks tailored for +cloud, edge, and tinyML environments to help understand the similarities +and differences between these different ecosystems. + +### Cloud + +Cloud-based AI frameworks assume access to ample computational power, +memory, and storage resources in the cloud. They generally support both +training and inference. Cloud-based AI frameworks are suited for +applications where data can be sent to the cloud for processing, such as +cloud-based AI services, large-scale data analytics, and web +applications. 
Popular cloud AI frameworks include the ones we mentioned +earlier such as TensorFlow, PyTorch, MXNet, Keras, and others. These +frameworks utilize technologies like GPUs, TPUs, distributed training, +and AutoML to deliver scalable AI. Concepts like model serving, MLOps, +and AIOps relate to the operationalization of AI in the cloud. Cloud AI +powers services like Google Cloud AI and enables transfer learning using +pre-trained models. + +### Edge + +Edge AI frameworks are tailored for deploying AI models on edge devices, +such as IoT devices, smartphones, and edge servers. Edge AI frameworks +are optimized for devices with moderate computational resources, +offering a balance between power and performance. Edge AI frameworks are +ideal for applications requiring real-time or near-real-time processing, +including robotics, autonomous vehicles, and smart devices. Key edge AI +frameworks include TensorFlow Lite, PyTorch Mobile, CoreML, and others. +They employ optimizations like model compression, quantization, and +efficient neural network architectures. Hardware support includes CPUs, +GPUs, NPUs and accelerators like the Edge TPU. Edge AI enables use cases +like mobile vision, speech recognition, and real-time anomaly detection. + +### Embedded + +TinyML frameworks are specialized for deploying AI models on extremely +resource-constrained devices, specifically microcontrollers and sensors +within the IoT ecosystem. TinyML frameworks are designed for devices +with severely limited resources, emphasizing minimal memory and power +consumption. TinyML frameworks are specialized for use cases on +resource-constrained IoT devices for applications such as predictive +maintenance, gesture recognition, and environmental monitoring. Major +tinyML frameworks include TensorFlow Lite Micro, uTensor, and ARM NN. +They optimize complex models to fit within kilobytes of memory through +techniques like quantization-aware training and reduced precision. +TinyML allows intelligent sensing across battery-powered devices, +enabling collaborative learning via federated learning. The choice of +framework involves balancing model performance and computational +constraints of the target platform, whether cloud, edge or tinyML. Here +is a summary table comparing the major AI frameworks across cloud, edge, +and tinyML environments: + + +| Framework Type | Examples | Key Technologies | Use Cases | +|----------------|-----------------------------------|-------------------------------------------------------------------------|------------------------------------------------------| +| Cloud AI | TensorFlow, PyTorch, MXNet, Keras | GPUs, TPUs, distributed training, AutoML, MLOps | Cloud services, web apps, big data analytics | +| Edge AI | TensorFlow Lite, PyTorch Mobile, Core ML | Model optimization, compression, quantization, efficient NN architectures | Mobile apps, robots, autonomous systems, real-time processing | +| TinyML | TensorFlow Lite Micro, uTensor, ARM NN | Quantization-aware training, reduced precision, neural architecture search | IoT sensors, wearables, predictive maintenance, gesture recognition | + + +**Key differences:** + +- Cloud AI leverages massive computational power for complex models + > using GPUs/TPUs and distributed training + +- Edge AI optimizes models to run locally on resource-constrained edge + > devices. 
+ +- TinyML fits models into extremely low memory and compute + > environments like microcontrollers + +## Embedded AI Frameworks {#sec-ai_frameworks_embedded} + +### Resource Constraints + +Embedded systems face severe resource constraints that pose unique +challenges for deploying machine learning models compared to traditional +computing platforms. For example, microcontroller units (MCUs) commonly +used in IoT devices often have: + +- **RAM** in the range of tens of kilobytes to a few megabytes. The + popular [ESP8266 MCU](https://www.espressif.com/en/products/socs/esp8266) has around 80KB RAM available to developers. + This contrasts with 8GB or more on typical laptops and desktops + today. + +- **Flash storage** ranging from hundreds of kilobytes to a few + megabytes. The Arduino Uno microcontroller provides just 32KB of + storage for code. Standard computers today have disk storage in + the order of terabytes. + +- **Processing power** from just a few MHz to approximately 200MHz. + The ESP8266 operates at 80MHz. This is several orders of magnitude + slower than multi-GHz multi-core CPUs in servers and high-end + laptops. + +These tight constraints make training machine learning models directly +on microcontrollers infeasible in most cases. The limited RAM precludes +handling large datasets for training. Energy usage for training would +also quickly deplete battery-powered devices. Instead, models are +trained on resource-rich systems and deployed on microcontrollers for +optimized inference. But even inference poses challenges: + +1. **Model Size:** AI models are too large to fit on embedded and IoT + devices. This necessitates the need for model compression + techniques, such as quantization, pruning, and knowledge + distillation. Additionally, as we will see, many of the frameworks used by developers for + AI development have large amounts of overhead, and built in + libraries that embedded systems can't support. + +2. **Complexity of Tasks:** With only tens of KBs to a few MBs of RAM, + IoT devices and embedded systems are constrained in the complexity + of tasks they can handle. Tasks that require large datasets or + sophisticated algorithms-- for example LLMs-- which would run + smoothly on traditional computing platforms, might be infeasible + on embedded systems without compression or other optimization + techniques due to memory limitations. + +3. **Data Storage and Processing:** Embedded systems often process data + in real-time and might not store large amounts of data locally. + Conversely, traditional computing systems can hold and process + large datasets in memory, enabling faster data operations and + analysis as well as real-time updates. + +4. **Security and Privacy:** Limited memory also restricts the + complexity of security algorithms and protocols, data encryption, + reverse engineering protections, and more that can be implemented + on the device. This can potentially make some IoT devices more + vulnerable to attacks. + +Consequently, specialized software optimizations and ML frameworks +tailored for microcontrollers are necessary to work within these tight +resource bounds. Clever optimization techniques like quantization, +pruning and knowledge distillation compress models to fit within limited +memory (see Optimizations section). Learnings from neural architecture +search help guide model designs. + +Hardware improvements like dedicated ML accelerators on microcontrollers +also help alleviate constraints. 
For instance, [Qualcomm's Hexagon DSP](https://developer.qualcomm.com/software/hexagon-dsp-sdk/dsp-processor) +provides acceleration for TensorFlow Lite models on Snapdragon mobile +chips. [Google's Edge TPU](https://cloud.google.com/edge-tpu) packs ML performance into a tiny ASIC for edge +devices. [ARM Ethos-U55](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55) offers efficient inference on Cortex-M class +microcontrollers. These customized ML chips unlock advanced capabilities +for resource-constrained applications. + +Generally, due to the limited processing power, it's almost always +infeasible to train AI models on IoT or embedded systems. Instead, +models are trained on powerful traditional computers (often with GPUs) +and then deployed on the embedded device for inference. TinyML +specifically deals with this, ensuring models are lightweight enough for +real-time inference on these constrained devices. + +### Frameworks & Libraries + +Embedded AI frameworks are software tools and libraries designed to +enable AI and ML capabilities on embedded systems. These frameworks are essential for +bringing AI to IoT devices, robotics, and other +edge computing platforms and they are designed to work where +computational resources, memory, and power consumption are limited. + +### Challenges + +While embedded systems present an enormous opportunity for deploying +machine learning to enable intelligent capabilities at the edge, these +resource-constrained environments also pose significant challenges. +Unlike typical cloud or desktop environments rich with computational +resources, embedded devices introduce severe constraints around memory, +processing power, energy efficiency, and specialized hardware. As a +result, existing machine learning techniques and frameworks designed for +server clusters with abundant resources do not directly translate to +embedded systems. This section uncovers some of the challenges and +opportunities for embedded systems and ML frameworks. + +**Fragmented Ecosystem** + +The lack of a unified ML framework led to a highly fragmented ecosystem. +Engineers at companies like [STMicroelectronics](https://www.st.com/), [NXP Semiconductors](https://www.nxp.com/), and +[Renesas](https://www.renesas.com/) had to develop custom solutions tailored to their specific +microcontroller and DSP architectures. These ad-hoc frameworks required +extensive manual optimization for each low-level hardware platform. This +made porting models extremely difficult, requiring redevelopment for new +Arm, RISC-V or proprietary architectures. + +**Disparate Hardware Needs ** + +Without a shared framework, there was no standard way to assess +hardware's capabilities. Vendors like Intel, Qualcomm and NVIDIA +created integrated solutions blending model, software and hardware +improvements. This made it hard to discern the sources of performance +gains - whether new chip designs like Intel's low-power x86 cores or +software optimizations were responsible. A standard framework was needed +so vendors could evaluate their hardware's capabilities in a fair, +reproducible way. + +**Lack of Portability** + +Adapting models trained in common frameworks like TensorFlow or PyTorch +to run efficiently on microcontrollers was very challenging without +standardized tools. It required time-consuming manual translation of +models to run on specialized DSPs from companies like CEVA or low-power +Arm M-series cores. 
There were no turnkey tools enabling portable +deployment across different architectures. + +**Incomplete Infrastructure ** + +The infrastructure to support key model development workflows was +lacking. There was minimal support for compression techniques to fit +large models within constrained memory budgets. Tools for quantization +to lower precision for faster inference were missing. Standardized APIs +for integration into applications were incomplete. Essential +functionality like on-device debugging, metrics, and performance +profiling was absent. These gaps increased the cost and difficulty of +embedded ML development. + +**No Standard Benchmark** + +Without unified benchmarks, there was no standard way to assess and +compare the capabilities of different hardware platforms from vendors +like NVIDIA, Arm and Ambiq Micro. Existing evaluations relied on +proprietary benchmarks tailored to showcased strengths of particular +chips. This made it impossible to objectively measure hardware +improvements in a fair, neutral manner. This topic is discussed in more detail in the [Benchmarking AI](./benchmarking.qmd) chapter. + +**Minimal Real-World Testing** + +Much of the benchmarks relied on synthetic data. Rigorously testing +models on real-world embedded applications was difficult without +standardized datasets and benchmarks. This raised questions on how +performance claims would translate to real-world usage. More extensive +testing was needed to validate chips in actual use cases. + +The lack of shared frameworks and infrastructure slowed TinyML adoption, +hampering the integration of ML into embedded products. Recent +standardized frameworks have begun addressing these issues through +improved portability, performance profiling, and benchmarking support. +But ongoing innovation is still needed to enable seamless, +cost-effective deployment of AI to edge devices. + +**Summary** + +The absence of standardized frameworks, benchmarks, and infrastructure +for embedded ML has traditionally hampered adoption. However, recent +progress has been made in developing shared frameworks like TensorFlow +Lite Micro and benchmark suites like MLPerf Tiny that aim to accelerate +the proliferation of TinyML solutions. But overcoming the fragmentation +and difficulty of embedded deployment remains an ongoing process. + +## Examples + +Machine learning deployment on microcontrollers and other embedded +devices often requires specially optimized software libraries and +frameworks to work within the tight constraints of memory, compute, and +power. Several options exist for performing inference on such +resource-limited hardware, each with their own approach to optimizing +model execution. This section will explore the key characteristics and +design principles behind TFLite Micro, TinyEngine, and CMSIS-NN, +providing insight into how each framework tackles the complex problem of +high-accuracy yet efficient neural network execution on +microcontrollers. They showcase different approaches for implementing +efficient TinyML frameworks. + +The table summarizes the key differences and similarities between these +three specialized machine learning inference frameworks for embedded +systems and microcontrollers. 
+ +| Framework | TensorFlow Lite Micro | TinyEngine | CMSIS-NN | +|------------------------|:----------------------------:|:--------------------------------------:|:--------------------------------------:| +| **Approach** | Interpreter-based | Static compilation | Optimized neural network kernels | +| **Hardware Focus** | General embedded devices | Microcontrollers | ARM Cortex-M processors | +| **Arithmetic Support** | Floating point | Floating point, fixed point | Floating point, fixed point | +| **Model Support** | General neural network models| Models co-designed with TinyNAS | Common neural network layer types | +| **Code Footprint** | Larger due to inclusion of interpreter and ops | Small, includes only ops needed for model | Lightweight by design | +| **Latency** | Higher due to interpretation overhead | Very low due to compiled model | Low latency focus | +| **Memory Management** | Dynamically managed by interpreter | Model-level optimization | Tools for efficient allocation | +| **Optimization Approach** | Some code generation features | Specialized kernels, operator fusion | Architecture-specific assembly optimizations | +| **Key Benefits** | Flexibility, portability, ease of updating models | Maximizes performance, optimized memory usage | Hardware acceleration, standardized API, portability | + + +In the following sections, we will dive into understanding each of these +in greater detail. + +### Interpreter + +[TensorFlow Lite Micro (TFLM)](https://www.tensorflow.org/lite/microcontrollers) is a machine learning inference framework +designed for embedded devices with limited resources. It uses an +interpreter to load and execute machine learning models, which provides +flexibility and ease of updating models in the field [@david2021tensorflow]. + +Traditional interpreters often have significant branching overhead, +which can reduce performance. However, machine learning model +interpretation benefits from the efficiency of long-running kernels, +where each kernel runtime is relatively large and helps mitigate +interpreter overhead. + +An alternative to an interpreter-based inference engine is to generate +native code from a model during export. This can improve performance, +but it sacrifices portability and flexibility, as the generated code +needs recompilation for each target platform and must be replaced +entirely to modify a model. + +TFLM strikes a balance between the simplicity of code compilation and +the flexibility of an interpreter-based approach by incorporating +certain code-generation features. For example, the library can be +constructed solely from source files, offering much of the compilation +simplicity associated with code generation while retaining the benefits +of an interpreter-based model execution framework. + +An interpreter-based approach offers several benefits over code +generation for machine learning inference on embedded devices: + +- **Flexibility:** Models can be updated in the field without recompiling + the entire application. + +- **Portability:** The interpreter can be used to execute models on + different target platforms without porting the code. + +- **Memory efficiency:** The interpreter can share code across multiple + models, reducing memory usage. + +- **Ease of development:** Interpreters are easier to develop and maintain + than code generators. + +TensorFlow Lite Micro is a powerful and flexible framework for machine +learning inference on embedded devices. 
Its interpreter-based approach
+offers several benefits over code generation, including flexibility,
+portability, memory efficiency, and ease of development.
+
+### Compiler-based
+
+[TinyEngine](https://github.com/mit-han-lab/tinyengine), developed by the MIT HAN Lab, is an ML inference framework designed specifically for
+resource-constrained microcontrollers. It employs several optimizations
+to enable high-accuracy neural network execution within the tight
+constraints of memory, compute, and storage on microcontrollers [@lin2020mcunet].
+
+While inference frameworks like TFLite Micro use interpreters to execute
+the neural network graph dynamically at runtime, this adds overhead:
+memory is needed to store metadata, interpretation introduces latency, and
+some optimizations are unavailable, although TFLite argues that the
+overhead is small. TinyEngine eliminates this overhead by employing a
+code generation approach. During compilation, it analyzes the network
+graph and generates specialized code to execute just that model. This
+code is natively compiled into the application binary, avoiding runtime
+interpretation costs.
+
+Conventional ML frameworks schedule memory per layer, trying to minimize
+usage for each layer separately. TinyEngine does model-level scheduling
+instead, analyzing memory usage across layers. It allocates a common
+buffer size based on the maximum memory needs of all layers. This buffer is
+then shared efficiently across layers to increase data reuse.
+
+TinyEngine also specializes the kernels for each layer through
+techniques like tiling, unrolling, and operator fusion. For example, it
+will generate unrolled compute kernels with the exact number of loops
+needed for a 3x3 or 5x5 convolution. These specialized kernels extract
+maximum performance from the microcontroller hardware. It uses depthwise
+convolutions that are optimized to minimize memory allocations by
+computing each channel's output in-place over the input channel data.
+This technique exploits the channel-separable nature of depthwise
+convolutions to reduce peak memory size.
+
+Similar to TFLite Micro, the compiled TinyEngine binary only includes
+the ops needed for a specific model rather than all possible operations.
+This results in a very small binary footprint, keeping code size low for
+memory-constrained devices.
+
+One difference between TFLite Micro and TinyEngine is that the latter is
+co-designed with "TinyNAS," an architecture search method for
+microcontroller models, similar to differentiable NAS for
+microcontrollers. The efficiency of TinyEngine allows exploring larger
+and more accurate models through NAS. It also provides feedback to
+TinyNAS on which models can fit within the hardware constraints.
+
+Through these custom techniques, such as static compilation,
+model-level scheduling, specialized kernels, and co-design with NAS,
+TinyEngine enables high-accuracy deep learning inference within the
+tight resource constraints of microcontrollers.
+
+### Library
+
+[CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), standing for Cortex Microcontroller Software Interface
+Standard for Neural Networks, is a software library devised by ARM. It
+offers a standardized interface for deploying neural network inference
+on microcontrollers and embedded systems, with a particular focus on
+optimization for ARM Cortex-M processors [@lai2018cmsis].
+ +**Neural Network Kernels:** CMSIS-NN is equipped with highly efficient +kernels that handle fundamental neural network operations such as +convolution, pooling, fully connected layers, and activation functions. +It caters to a broad range of neural network models by supporting both +floating-point and fixed-point arithmetic. The latter is especially +beneficial for resource-constrained devices as it curtails memory and +computational requirements (Quantization). + +**Hardware Acceleration:** CMSIS-NN harnesses the power of Single +Instruction, Multiple Data (SIMD) instructions available on many +Cortex-M processors. This allows for parallel processing of multiple +data elements within a single instruction, thereby boosting +computational efficiency. Certain Cortex-M processors feature Digital +Signal Processing (DSP) extensions that CMSIS-NN can exploit for +accelerated neural network execution. The library also incorporates +assembly-level optimizations tailored to specific microcontroller +architectures to further enhance performance. + +**Standardized API:** CMSIS-NN offers a consistent and abstracted API +that protects developers from the complexities of low-level hardware +details. This makes the integration of neural network models into +applications simpler. It may also encompass tools or utilities for +converting popular neural network model formats into a format that is +compatible with CMSIS-NN. + +**Memory Management:** CMSIS-NN provides functions for efficient memory +allocation and management, which is vital in embedded systems where +memory resources are scarce. It ensures optimal memory usage during +inference and in some instances, allows for in-place operations to +further decrease memory overhead. + +**Portability**: CMSIS-NN is designed with portability in mind across +various Cortex-M processors. This enables developers to write code that +can operate on different microcontrollers without significant +modifications. + +**Low Latency:** CMSIS-NN minimizes inference latency, making it an +ideal choice for real-time applications where swift decision-making is +paramount. + +**Energy Efficiency:** The library is designed with a focus on energy +efficiency, making it suitable for battery-powered and +energy-constrained devices. ## Choosing the Right Framework -- Factors to consider: ease of use, community support, performance, scalability, etc. -- Integration with data engineering tools -- Integration with model optimization tools +Choosing the right machine learning framework for a given application +requires carefully evaluating models, hardware, and software +considerations. By analyzing these three aspects - models, hardware, and +software - ML engineers can select the optimal framework and customize +as needed for efficient and performant on-device ML applications. The +goal is to balance model complexity, hardware limitations, and software +integration to design a tailored ML pipeline for embedded and edge +devices. + +![TensorFlow Framework Comparison - General](images_ml_frameworks/image4.png){width="100%" height="auto" align="center" caption="TensorFlow Framework Comparison - General"} + +### Model + +TensorFlow supports significantly more ops than TensorFlow Lite and +TensorFlow Lite Micro as it is typically used for research or cloud +deployment, which require a large number of and more flexibility with +operators (ops),. TensorFlow Lite supports select ops for on-device +training, whereas TensorFlow Micro does not. 
TensorFlow Lite also
+supports dynamic shapes and quantization-aware training, but TensorFlow
+Micro does not. Both TensorFlow Lite and TensorFlow Micro offer
+native quantization tooling and support, where quantization refers to
+the process of transforming an ML program into an approximated
+representation using the available lower-precision operations.
+
+### Software
+![TensorFlow Framework Comparison - Software](images_ml_frameworks/image5.png){width="100%" height="auto" align="center" caption="TensorFlow Framework Comparison - Software"}
+
+TensorFlow Lite Micro does not have OS support, unlike TensorFlow and
+TensorFlow Lite; this reduces memory overhead, makes startup
+times faster, and consumes less energy. TensorFlow Lite Micro can be used
+in conjunction with real-time operating systems (RTOS) like FreeRTOS,
+Zephyr, and Mbed OS. TensorFlow Lite and TensorFlow Lite Micro support
+model memory mapping, allowing models to be directly accessed from flash
+storage rather than loaded into RAM, whereas TensorFlow does not.
+TensorFlow and TensorFlow Lite support accelerator delegation to
+schedule code to different accelerators, whereas TensorFlow Lite Micro
+does not, as embedded systems tend not to have a rich array of
+specialized accelerators.
+
+### Hardware
+
+![TensorFlow Framework Comparison - Hardware](images_ml_frameworks/image3.png){width="100%" height="auto" align="center" caption="TensorFlow Framework Comparison - Hardware"}
+
+TensorFlow Lite and TensorFlow Lite Micro have significantly smaller
+base binary sizes and base memory footprints than TensorFlow. For
+example, a typical TensorFlow Lite Micro binary is less than 200KB,
+whereas TensorFlow is much larger. This reflects the
+resource-constrained environments of embedded systems. TensorFlow
+provides support for x86, TPUs, and GPUs from NVIDIA, AMD, and Intel.
+TensorFlow Lite provides support for Arm Cortex-A and x86 processors
+commonly used in mobile phones and tablets, and is stripped of all
+the training logic that is not necessary for on-device deployment.
+TensorFlow Lite Micro provides support for microcontroller-focused Arm
+Cortex-M cores like the M0, M3, M4, and M7, as well as DSPs like Hexagon and
+SHARC and MCUs like STM32, NXP Kinetis, and Microchip AVR.
+
+Selecting the appropriate AI framework is essential to ensure that
+embedded systems can efficiently execute AI models. Key
+factors to consider when choosing a machine learning framework include
+ease of use, community support, performance, scalability,
+integration with data engineering tools, and integration with model
+optimization tools. By understanding these factors, you can make
+informed decisions and maximize the potential of your machine learning
+initiatives.
+
+### Other Factors
+
+When evaluating AI frameworks for embedded systems, several other key
+factors beyond models, hardware, and software should be considered.
+
+#### Performance
+
-## Framework Comparison
+
+Performance is critical in embedded systems where computational
+resources are limited. Evaluate the framework's ability to optimize
+model inference for embedded hardware. Factors such as model
+quantization and hardware acceleration support play a crucial role in
+achieving efficient inference.
+
-Explanation: Provide a high-level comparison of the different frameworks based on class slides, etc.
+
+#### Scalability
+
-- Table of differences and similarities
+
+Scalability is essential when considering the potential growth of an
+embedded AI project.
The framework should support the deployment of +models on a variety of embedded devices, from microcontrollers to more +powerful processors. It should also handle both small-scale and +large-scale deployments seamlessly. -## Trends in ML Frameworks +#### Integration with Data Engineering Tools -Explanation: Discuss where these ML frameworks are heading in the future. Perhaps consider discussing ML for ML frameworks? +Data engineering tools are essential for data preprocessing and pipeline +management. An ideal AI framework for embedded systems should seamlessly +integrate with these tools, allowing for efficient data ingestion, +transformation, and model training. -- Framework Developments on the Horizon -- Anticipated Innovations in the Field +#### Integration with Model Optimization Tools -## Challenges and Limitations +Model optimization is crucial to ensure that AI models are well-suited +for embedded deployment. Evaluate whether the framework integrates with +model optimization tools, such as TensorFlow Lite Converter or ONNX +Runtime, to facilitate model quantization and size reduction. -Explanation: None of the frameworks are perfect, so it is important to understand their limitations and challenges. +#### Ease of Use -- Model compatibility and interoperability issues -- Scalability and performance challenges -- Addressing the evolving needs of AI developers +The ease of use of an AI framework significantly impacts development +efficiency. A framework with a user-friendly interface and clear +documentation reduces the learning curve for developers. Consideration +should be given to whether the framework supports high-level APIs, +allowing developers to focus on model design rather than low-level +implementation details. This factor is incredibly important for embedded +systems, which have less features that typical developers might be +accustomed to. + +#### Community Support + +Community support plays another essential factor. Frameworks with active +and engaged communities often have well-maintained codebases, receive +regular updates, and provide valuable forums for problem-solving. As a +result, community support plays into Ease of Use as well because it +ensures that developers have access to a wealth of resources, including +tutorials and example projects. Community support provides some +assurance that the framework will continue to be supported for future +updates. There are only a handful of frameworks that cater to TinyML +needs. Of that, TensorFlow Lite Micro is the most popular and has the +most community support. + +## Future Trends in ML Frameworks + +### Decomposition + +Currently, the ML system stack consists of four abstractions, namely (1) +computational graphs, (2) tensor programs, (3) libraries and runtimes, +and (4) hardware +primitives. + +![](images_ml_frameworks/image8.png){fig-align="center" width=70%} + +This has led to vertical (i.e. between abstraction levels) and +horizontal (i.e. library-driven vs. compilation-driven approaches to +tensor computation) boundaries, which hinder innovation for ML. Future +work in ML frameworks can look toward breaking these boundaries. In +December 2021, [Apache TVM](https://tvm.apache.org/2021/12/15/tvm-unity) Unity was proposed, which aimed to facilitate +interactions between the different abstraction levels (as well as the +people behind them, such as ML scientists, ML engineers, and hardware +engineers) and co-optimize decisions in all four abstraction levels. 
+
+### High-Performance Compilers & Libraries
+
+As ML frameworks further develop, high-performance compilers and
+libraries will continue to emerge. Some current examples include
+[TensorFlow
+XLA](https://www.tensorflow.org/xla/architecture) and
+Nvidia's
+[CUTLASS](https://developer.nvidia.com/blog/cutlass-linear-algebra-cuda/),
+which accelerate linear algebra operations in computational graphs, and
+Nvidia's
+[TensorRT](https://developer.nvidia.com/tensorrt), which
+accelerates and optimizes inference.
+
+### ML for ML Frameworks
+
+We can also use ML to improve ML frameworks in the future. Some current
+uses of ML for ML frameworks include:
+
+- hyperparameter optimization using techniques such as Bayesian
+  optimization, random search, and grid search
+
+- neural architecture search (NAS) to automatically search for optimal
+  network architectures
+
+- AutoML, which, as described in the [Advanced Features](#sec-ai_frameworks-advanced) section,
+  automates the ML pipeline.

## Conclusion

-- Summary of Key Takeaways
-- Recommendations for Further Learning
\ No newline at end of file
+In summary, selecting the optimal framework requires thoroughly
+evaluating options against criteria like usability, community support,
+performance, hardware compatibility, and model conversion abilities.
+There is no universal best solution, as the right framework depends on
+the specific constraints and use case.
+
+For extremely resource-constrained microcontroller-based platforms,
+TensorFlow Lite Micro currently provides a strong starting point. Its
+comprehensive optimization tooling, such as quantization mapping and kernel
+optimizations, enables high performance on devices like Arm Cortex-M and
+RISC-V processors. The active developer community ensures accessible
+technical support. Seamless integration with TensorFlow for training and
+converting models makes the workflow cohesive.
+
+For platforms with more capable CPUs like Cortex-A, frameworks such as
+TensorFlow Lite expand the possibilities. They provide greater flexibility
+for custom and advanced models beyond the core operators in TFLite
+Micro. However, this comes at the cost of a larger memory footprint.
+These frameworks are ideal for automotive systems, drones, and more
+powerful edge devices that can benefit from greater model
+sophistication.
+
+Frameworks specifically built for specialized hardware, like CMSIS-NN on
+Cortex-M processors, can further maximize performance but sacrifice
+portability. Integrated frameworks from processor vendors tailor the
+stack to their architectures. This can unlock the full potential of
+their chips but lock you into their ecosystem.
+
+Ultimately, choosing the right framework involves finding the best match
+between its capabilities and the requirements of the target platform.
+This requires balancing tradeoffs between performance needs, hardware
+constraints, model complexity, and other factors. Thoroughly assessing
+the intended models and use cases and evaluating options against key metrics
+will guide developers towards picking the ideal framework for their
+embedded ML application.
\ No newline at end of file
diff --git a/generative_ai.qmd b/generative_ai.qmd
index 1ed91431..b48d0b1c 100644
--- a/generative_ai.qmd
+++ b/generative_ai.qmd
@@ -1,6 +1,6 @@
 # Generative AI
-::: {.callout-tip collapse="true"}
+::: {.callout-tip}
 ## Learning Objectives
 * coming soon.
diff --git a/hw_acceleration.qmd b/hw_acceleration.qmd index 46d474ca..266b096f 100644 --- a/hw_acceleration.qmd +++ b/hw_acceleration.qmd @@ -1,6 +1,6 @@ # AI Acceleration -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * coming soon. diff --git a/image_classification.qmd b/image_classification.qmd new file mode 100644 index 00000000..b5f19ecb --- /dev/null +++ b/image_classification.qmd @@ -0,0 +1,512 @@ +# CV on Nicla Vision {.unnumbered} + +## Introduction + +As we initiate our studies into embedded machine learning or tinyML, it's impossible to overlook the transformative impact of Computer Vision (CV) and Artificial Intelligence (AI) in our lives. These two intertwined disciplines redefine what machines can perceive and accomplish, from autonomous vehicles and robotics to healthcare and surveillance. + +More and more, we are facing an artificial intelligence (AI) revolution where, as stated by Gartner, **Edge AI** has a very high impact potential, and **it is for now**! + +![](images/imgs_image_classification/image2.jpg){fig-align="center" width="4.729166666666667in"} + +In the "bullseye" of the Radar is the *Edge Computer Vision*, and when we talk about Machine Learning (ML) applied to vision, the first thing that comes to mind is **Image Classification**, a kind of ML "Hello World"! + +This exercise will explore a computer vision project utilizing Convolutional Neural Networks (CNNs) for real-time image classification. Leveraging TensorFlow's robust ecosystem, we'll implement a pre-trained MobileNet model and adapt it for edge deployment. The focus will be on optimizing the model to run efficiently on resource-constrained hardware without sacrificing accuracy. + +We'll employ techniques like quantization and pruning to reduce the computational load. By the end of this tutorial, you'll have a working prototype capable of classifying images in real-time, all running on a low-power embedded system based on the Arduino Nicla Vision board. + +## Computer Vision + +At its core, computer vision aims to enable machines to interpret and make decisions based on visual data from the world, essentially mimicking the capability of the human optical system. Conversely, AI is a broader field encompassing machine learning, natural language processing, and robotics, among other technologies. When you bring AI algorithms into computer vision projects, you supercharge the system's ability to understand, interpret, and react to visual stimuli. + +When discussing Computer Vision projects applied to embedded devices, the most common applications that come to mind are *Image Classification* and *Object Detection*. + +![](images/imgs_image_classification/image15.jpg){fig-align="center" width="6.5in"} + +Both models can be implemented on tiny devices like the Arduino Nicla Vision and used on real projects. In this chapter, we will cover Image Classification. + +## Image Classification Project Goal + +The first step in any ML project is to define the goal. In this case, it is to detect and classify two specific objects present in one image. For this project, we will use two small toys: a *robot* and a small Brazilian parrot (named *Periquito*). Also, we will collect images of a *background* where those two objects are absent. + +![](images/imgs_image_classification/image36.jpg){fig-align="center" width="6.5in"} + +## Data Collection + +Once you have defined your Machine Learning project goal, the next and most crucial step is the dataset collection. 
You can use the Edge Impulse Studio, the OpenMV IDE we installed, or even your phone for the image capture. Here, we will use the OpenMV IDE for that. + +### Collecting Dataset with OpenMV IDE + +First, create in your computer a folder where your data will be saved, for example, "data." Next, on the OpenMV IDE, go to `Tools > Dataset Editor` and select `New Dataset` to start the dataset collection: + +![](images/imgs_image_classification/image29.png){fig-align="center" width="6.291666666666667in"} + +The IDE will ask you to open the file where your data will be saved and choose the "data" folder that was created. Note that new icons will appear on the Left panel. + +![](images/imgs_image_classification/image46.png){fig-align="center" width="0.9583333333333334in"} + +Using the upper icon (1), enter with the first class name, for example, "periquito": + +![](images/imgs_image_classification/image22.png){fig-align="center" width="3.25in"} + +Running the `dataset_capture_script.py` and clicking on the camera icon (2), will start capturing images: + +![](images/imgs_image_classification/image43.png){fig-align="center" width="6.5in"} + +Repeat the same procedure with the other classes + +![](images/imgs_image_classification/image6.jpg){fig-align="center" width="6.5in"} + +> We suggest around 60 images from each category. Try to capture different angles, backgrounds, and light conditions. + +The stored images use a QVGA frame size of 320x240 and the RGB565 (color pixel format). + +After capturing your dataset, close the Dataset Editor Tool on the `Tools > Dataset Editor`. + +On your computer, you will end with a dataset that contains three classes: *periquito,* *robot*, and *background*. + +![](images/imgs_image_classification/image20.png){fig-align="center" width="6.5in"} + +You should return to *Edge Impulse Studio* and upload the dataset to your project. + +## Training the model with Edge Impulse Studio + +We will use the Edge Impulse Studio for training our model. Enter your account credentials and create a new project: + +![](images/imgs_image_classification/image45.png){fig-align="center" width="6.5in"} + +> Here, you can clone a similar project: [NICLA-Vision_Image_Classification](https://studio.edgeimpulse.com/public/273858/latest). + +## Dataset + +Using the EI Studio (or *Studio*), we will go over four main steps to have our model ready for use on the Nicla Vision board: Dataset, Impulse, Tests, and Deploy (on the Edge Device, in this case, the NiclaV). + +![](images/imgs_image_classification/image41.jpg){fig-align="center" width="6.5in"} + +Regarding the Dataset, it is essential to point out that our Original Dataset, captured with the OpenMV IDE, will be split into *Training*, *Validation*, and *Test*. The Test Set will be divided from the beginning, and a part will reserved to be used only in the Test phase after training. The Validation Set will be used during training. + +![](images/imgs_image_classification/image7.jpg){fig-align="center" width="6.5in"} + +On Studio, go to the Data acquisition tab, and on the UPLOAD DATA section, upload the chosen categories files from your computer: + +![](images/imgs_image_classification/image39.png){fig-align="center" width="6.5in"} + +Leave to the Studio the splitting of the original dataset into *train and test* and choose the label about that specific data: + +![](images/imgs_image_classification/image30.png){fig-align="center" width="6.5in"} + +Repeat the procedure for all three classes. 
At the end, you should see your "raw data" in the Studio:
+
+![](images/imgs_image_classification/image11.png){fig-align="center" width="6.5in"}
+
+The Studio allows you to explore your data, showing a complete view of all the data in your project. You can clear, inspect, or change labels by clicking on individual data items. In our case, a very simple project, the data seems OK.
+
+![](images/imgs_image_classification/image44.png){fig-align="center" width="6.5in"}
+
+## The Impulse Design
+
+In this phase, we should define how to:
+
+- Pre-process our data, which consists of resizing the individual images and determining the `color depth` to use (be it RGB or Grayscale) and
+
+- Specify a Model, in this case, it will be the `Transfer Learning (Images)` to fine-tune a pre-trained MobileNet V2 image classification model on our data. This method performs well even with relatively small image datasets (around 150 images in our case).
+
+![](images/imgs_image_classification/image23.jpg){fig-align="center" width="6.5in"}
+
+Transfer Learning with MobileNet offers a streamlined approach to model training, which is especially beneficial for resource-constrained environments and projects with limited labeled data. MobileNet, known for its lightweight architecture, is a pre-trained model that has already learned valuable features from a large dataset (ImageNet).
+
+![](images/imgs_image_classification/image9.jpg){fig-align="center" width="6.5in"}
+
+By leveraging these learned features, you can train a new model for your specific task with less data and fewer computational resources and yet achieve competitive accuracy.
+
+![](images/imgs_image_classification/image32.jpg){fig-align="center" width="6.5in"}
+
+This approach significantly reduces training time and computational cost, making it ideal for quick prototyping and deployment on embedded devices where efficiency is paramount.
+
+Go to the Impulse Design Tab and create the *impulse*, defining an image size of 96x96 and squashing them (squared form, without cropping). Select Image and Transfer Learning blocks. Save the Impulse.
+
+![](images/imgs_image_classification/image16.png){fig-align="center" width="6.5in"}
+
+### Image Pre-Processing
+
+All the input QVGA/RGB565 images will be converted to 27,648 features (96x96x3).
+
+![](images/imgs_image_classification/image17.png){fig-align="center" width="6.5in"}
+
+Press \[Save parameters\] and Generate all features:
+
+![](images/imgs_image_classification/image5.png){fig-align="center" width="6.5in"}
+
+### Model Design
+
+In 2017, Google introduced [[MobileNetV1]{.underline}](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html), a family of general-purpose computer vision neural networks designed with mobile devices in mind to support classification, detection, and more. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of various use cases. In 2018, Google launched [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381).
+
+MobileNet V1 and MobileNet V2 both aim at mobile efficiency and embedded vision applications but differ in architectural complexity and performance. While both use depthwise separable convolutions to reduce the computational cost, MobileNet V2 introduces Inverted Residual Blocks and Linear Bottlenecks to enhance performance.
These new features allow V2 to capture more complex features using fewer parameters, making it computationally more efficient and generally more accurate than its predecessor. Additionally, V2 employs a non-linear activation in the intermediate expansion layer. It still uses a linear activation for the bottleneck layer, a design choice found to preserve important information through the network. MobileNet V2 offers an optimized architecture for higher accuracy and efficiency and will be used in this project. + +Although the base MobileNet architecture is already tiny and has low latency, many times, a specific use case or application may require the model to be even smaller and faster. MobileNets introduces a straightforward parameter α (alpha) called width multiplier to construct these smaller, less computationally expensive models. The role of the width multiplier α is that of thinning a network uniformly at each layer. + +Edge Impulse Studio can use both MobileNetV1 (96x96 images) and V2 (96x96 or 160x160 images), with several different **α** values (from 0.05 to 1.0). For example, you will get the highest accuracy with V2, 160x160 images, and α=1.0. Of course, there is a trade-off. The higher the accuracy, the more memory (around 1.3MB RAM and 2.6MB ROM) will be needed to run the model, implying more latency. The smaller footprint will be obtained at the other extreme with MobileNetV1 and α=0.10 (around 53.2K RAM and 101K ROM). + +![](images/imgs_image_classification/image27.jpg){fig-align="center" width="6.5in"} + +We will use **MobileNetV2 96x96 0.1** for this project, with an estimated memory cost of 265.3 KB in RAM. This model should be OK for the Nicla Vision with 1MB of SRAM. On the Transfer Learning Tab, select this model: + +![](images/imgs_image_classification/image24.png){fig-align="center" width="6.5in"} + +## Model Training + +Another valuable technique to be used with Deep Learning is **Data Augmentation**. Data augmentation is a method to improve the accuracy of machine learning models by creating additional artificial data. A data augmentation system makes small, random changes to your training data during the training process (such as flipping, cropping, or rotating the images). + +Looking under the hood, here you can see how Edge Impulse implements a data Augmentation policy on your data: + +``` python +# Implements the data augmentation policy +def augment_image(image, label): + # Flips the image randomly + image = tf.image.random_flip_left_right(image) + + # Increase the image size, then randomly crop it down to + # the original dimensions + resize_factor = random.uniform(1, 1.2) + new_height = math.floor(resize_factor * INPUT_SHAPE[0]) + new_width = math.floor(resize_factor * INPUT_SHAPE[1]) + image = tf.image.resize_with_crop_or_pad(image, new_height, new_width) + image = tf.image.random_crop(image, size=INPUT_SHAPE) + + # Vary the brightness of the image + image = tf.image.random_brightness(image, max_delta=0.2) + + return image, label +``` + +Exposure to these variations during training can help prevent your model from taking shortcuts by "memorizing" superficial clues in your training data, meaning it may better reflect the deep underlying patterns in your dataset. + +The final layer of our model will have 12 neurons with a 15% dropout for overfitting prevention. 
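+
+For intuition, here is a rough Keras sketch of the kind of
+transfer-learning model assembled under the hood; this is an assumed
+approximation, not Edge Impulse's exact implementation (stock Keras does
+not ship ImageNet weights for the 0.1 width multiplier, so 0.35 is used
+here):
+
+``` python
+import tensorflow as tf
+
+# Frozen MobileNetV2 backbone used as a feature extractor (assumed setup)
+base = tf.keras.applications.MobileNetV2(
+    input_shape=(96, 96, 3), alpha=0.35,
+    include_top=False, weights="imagenet", pooling="avg")
+base.trainable = False  # freeze the pre-trained feature extractor
+
+model = tf.keras.Sequential([
+    base,
+    tf.keras.layers.Dense(12, activation="relu"),   # small dense head
+    tf.keras.layers.Dropout(0.15),                  # 15% dropout, as above
+    tf.keras.layers.Dense(3, activation="softmax")  # periquito / robot / background
+])
+model.compile(optimizer="adam",
+              loss="categorical_crossentropy",
+              metrics=["accuracy"])
+```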
Here is the Training result:
+
+![](images/imgs_image_classification/image31.jpg){fig-align="center" width="6.5in"}
+
+The result is excellent, with 77ms of latency, which should result in 13fps (frames per second) during inference.
+
+## Model Testing
+
+![](images/imgs_image_classification/image10.jpg){fig-align="center" width="6.5in"}
+
+Now, you should take the data that was set aside at the start of the project and run the trained model using it as input:
+
+![](images/imgs_image_classification/image34.png){fig-align="center" width="3.1041666666666665in"}
+
+The result is, again, excellent.
+
+![](images/imgs_image_classification/image12.png){fig-align="center" width="6.5in"}
+
+## Deploying the model
+
+At this point, we can deploy the trained model as a .tflite file and use the OpenMV IDE to run it using MicroPython, or we can deploy it as a C/C++ or Arduino library.
+
+![](images/imgs_image_classification/image28.jpg){fig-align="center" width="6.5in"}
+
+### Arduino Library
+
+First, let's deploy it as an Arduino library:
+
+![](images/imgs_image_classification/image48.png){fig-align="center" width="6.5in"}
+
+You should install the library as a .zip in the Arduino IDE and run the sketch *nicla_vision_camera.ino* available in Examples under your library name.
+
+> Note that Arduino Nicla Vision has, by default, 512KB of RAM allocated for the M7 core and an additional 244KB on the M4 address space. In the code, this allocation was changed to 288 kB to guarantee that the model will run on the device (`malloc_addblock((void*)0x30000000, 288 * 1024);`).
+
+The result is good, with 86ms of measured latency.
+
+![](images/imgs_image_classification/image25.jpg){fig-align="center" width="6.5in"}
+
+Here is a short video showing the inference results: {{< video https://youtu.be/bZPZZJblU-o width="480" height="270" center >}}
+
+### OpenMV
+
+It is possible to deploy the trained model to be used with OpenMV in two ways: as a library and as firmware.
+
+When deployed as a library, three files are generated: the trained .tflite model, a list of labels, and a simple MicroPython script that can run inference using the model.
+
+![](images/imgs_image_classification/image26.png){fig-align="center" width="6.5in"}
+
+It was impossible to run this model as a *.tflite* directly on the Nicla. So, we can either sacrifice accuracy by using a smaller model or deploy the model as OpenMV Firmware (FW). With the FW option, Edge Impulse Studio generates the optimized model, libraries, and framework needed to run the inference. Let's explore this option.
+
+Select `OpenMV Firmware` on the `Deploy Tab` and press `[Build]`.
+
+![](images/imgs_image_classification/image3.png){fig-align="center" width="6.5in"}
+
+On your computer, you will find a ZIP file. Open it:
+
+![](images/imgs_image_classification/image33.png){fig-align="center" width="6.5in"}
+
+Use the Bootloader tool on the OpenMV IDE to load the FW on your board:
+
+![](images/imgs_image_classification/image35.jpg){fig-align="center" width="6.5in"}
+
+Select the appropriate file (.bin for Nicla-Vision):
+
+![](images/imgs_image_classification/image8.png){fig-align="center" width="6.5in"}
+
+After the download is finished, press OK:
+
+![](images/imgs_image_classification/image40.png){fig-align="center" width="3.875in"}
+
+If a message says that the FW is outdated, DO NOT UPGRADE. Select \[NO\].
+
+![](images/imgs_image_classification/image42.png){fig-align="center" width="4.572916666666667in"}
+
+Now, open the script **ei_image_classification.py** that was downloaded from the Studio together with the .bin file for the Nicla.
+
+![](images/imgs_image_classification/image14.png){fig-align="center" width="6.5in"}
+
+Run it. Pointing the camera at the objects we want to classify, the inference result will be displayed on the Serial Terminal.
+
+![](images/imgs_image_classification/image37.png){fig-align="center" width="6.5in"}
+
+#### Changing the Code to add labels
+
+The code provided by Edge Impulse can be modified so that we can see, for testing purposes, the inference result directly on the image displayed in the OpenMV IDE.
+
+[[Upload the code from GitHub,]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/nicla_image_classification.py) or modify it as below:
+
+``` python
+# Marcelo Rovai - NICLA Vision - Image Classification
+# Adapted from Edge Impulse - OpenMV Image Classification Example
+# @24Aug23
+
+import sensor, image, time, os, tf, uos, gc
+
+sensor.reset()                       # Reset and initialize the sensor.
+sensor.set_pixformat(sensor.RGB565)  # Set pxl fmt to RGB565 (or GRAYSCALE)
+sensor.set_framesize(sensor.QVGA)    # Set frame size to QVGA (320x240)
+sensor.set_windowing((240, 240))     # Set 240x240 window.
+sensor.skip_frames(time=2000)        # Let the camera adjust.
+
+net = None
+labels = None
+
+try:
+    # Load built in model
+    labels, net = tf.load_builtin_model('trained')
+except Exception as e:
+    raise Exception(e)
+
+clock = time.clock()
+while(True):
+    clock.tick()  # Starts tracking elapsed time.
+
+    img = sensor.snapshot()
+
+    # default settings just do one detection
+    for obj in net.classify(img,
+                            min_scale=1.0,
+                            scale_mul=0.8,
+                            x_overlap=0.5,
+                            y_overlap=0.5):
+        fps = clock.fps()
+        lat = clock.avg()
+
+        print("**********\nPrediction:")
+        img.draw_rectangle(obj.rect())
+        # This combines the labels and confidence values into a list of tuples
+        predictions_list = list(zip(labels, obj.output()))
+
+        max_val = predictions_list[0][1]
+        max_lbl = 'background'
+        for i in range(len(predictions_list)):
+            val = predictions_list[i][1]
+            lbl = predictions_list[i][0]
+
+            if val > max_val:
+                max_val = val
+                max_lbl = lbl
+
+        # Print label with the highest probability
+        if max_val < 0.5:
+            max_lbl = 'uncertain'
+        print("{} with a prob of {:.2f}".format(max_lbl, max_val))
+        print("FPS: {:.2f} fps ==> latency: {:.0f} ms".format(fps, lat))
+
+        # Draw label with highest probability to image viewer
+        img.draw_string(
+            10, 10,
+            max_lbl + "\n{:.2f}".format(max_val),
+            mono_space = False,
+            scale=2
+            )
+```
+
+Here you can see the result:
+
+![](images/imgs_image_classification/image47.jpg){fig-align="center" width="6.5in"}
+
+Note that the latency (136 ms) is almost double what we got directly with the Arduino IDE. This is because we are using the IDE as an interface and also counting the time needed for the camera to be ready. If we start the clock just before the inference:
+
+![](images/imgs_image_classification/image13.jpg){fig-align="center" width="6.5in"}
+
+The latency will drop to only 71 ms.
+
+![](images/imgs_image_classification/image1.jpg){fig-align="center" width="3.5520833333333335in"}
+
+> The NiclaV runs about half as fast when connected to the IDE. The FPS should increase once disconnected.
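+
+Based on the loop shown above, here is a minimal sketch (an illustrative variation, not the original script) of how the clock can be restarted right before the classification call, so that `clock.avg()` reflects mostly the inference time rather than the image capture:
+
+``` python
+while(True):
+    img = sensor.snapshot()   # capture first (not timed)
+
+    clock.tick()              # start timing just before inference
+    for obj in net.classify(img,
+                            min_scale=1.0,
+                            scale_mul=0.8,
+                            x_overlap=0.5,
+                            y_overlap=0.5):
+        fps = clock.fps()
+        lat = clock.avg()     # elapsed ms since tick(), dominated by inference
+        print("latency: {:.0f} ms ==> {:.2f} fps".format(lat, fps))
+```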
+
+#### Post-Processing with LEDs
+
+When working with embedded machine learning, we are looking for devices that can continuously run inference and act on the results directly in the physical world, rather than displaying them on a connected computer. To simulate this, we will light up a different LED for each possible inference result.
+
+![](images/imgs_image_classification/image38.jpg){fig-align="center" width="6.5in"}
+
+To accomplish that, we should [[upload the code from GitHub]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/nicla_image_classification_LED.py) or modify the previous code to include the LEDs:
+
+``` python
+# Marcelo Rovai - NICLA Vision - Image Classification with LEDs
+# Adapted from Edge Impulse - OpenMV Image Classification Example
+# @24Aug23
+
+import sensor, image, time, os, tf, uos, gc, pyb
+
+ledRed = pyb.LED(1)
+ledGre = pyb.LED(2)
+ledBlu = pyb.LED(3)
+
+sensor.reset()                       # Reset and initialize the sensor.
+sensor.set_pixformat(sensor.RGB565)  # Set pixl fmt to RGB565 (or GRAYSCALE)
+sensor.set_framesize(sensor.QVGA)    # Set frame size to QVGA (320x240)
+sensor.set_windowing((240, 240))     # Set 240x240 window.
+sensor.skip_frames(time=2000)        # Let the camera adjust.
+
+net = None
+labels = None
+
+ledRed.off()
+ledGre.off()
+ledBlu.off()
+
+try:
+    # Load built in model
+    labels, net = tf.load_builtin_model('trained')
+except Exception as e:
+    raise Exception(e)
+
+clock = time.clock()
+
+
+def setLEDs(max_lbl):
+
+    if max_lbl == 'uncertain':
+        ledRed.on()
+        ledGre.off()
+        ledBlu.off()
+
+    if max_lbl == 'periquito':
+        ledRed.off()
+        ledGre.on()
+        ledBlu.off()
+
+    if max_lbl == 'robot':
+        ledRed.off()
+        ledGre.off()
+        ledBlu.on()
+
+    if max_lbl == 'background':
+        ledRed.off()
+        ledGre.off()
+        ledBlu.off()
+
+
+while(True):
+    img = sensor.snapshot()
+    clock.tick()  # Starts tracking elapsed time.
+
+    # default settings just do one detection.
+    for obj in net.classify(img,
+                            min_scale=1.0,
+                            scale_mul=0.8,
+                            x_overlap=0.5,
+                            y_overlap=0.5):
+        fps = clock.fps()
+        lat = clock.avg()
+
+        print("**********\nPrediction:")
+        img.draw_rectangle(obj.rect())
+        # This combines the labels and confidence values into a list of tuples
+        predictions_list = list(zip(labels, obj.output()))
+
+        max_val = predictions_list[0][1]
+        max_lbl = 'background'
+        for i in range(len(predictions_list)):
+            val = predictions_list[i][1]
+            lbl = predictions_list[i][0]
+
+            if val > max_val:
+                max_val = val
+                max_lbl = lbl
+
+        # Print label and turn on LED with the highest probability
+        if max_val < 0.8:
+            max_lbl = 'uncertain'
+
+        setLEDs(max_lbl)
+
+        print("{} with a prob of {:.2f}".format(max_lbl, max_val))
+        print("FPS: {:.2f} fps ==> latency: {:.0f} ms".format(fps, lat))
+
+        # Draw label with highest probability to image viewer
+        img.draw_string(
+            10, 10,
+            max_lbl + "\n{:.2f}".format(max_val),
+            mono_space = False,
+            scale=2
+            )
+```
+
+Now, each time a class scores a result greater than 0.8, the corresponding LED will be lit:
+
+- LED Red On: Uncertain (no class is over 0.8)
+
+- LED Green On: Periquito \> 0.8
+
+- LED Blue On: Robot \> 0.8
+
+- All LEDs Off: Background \> 0.8
+
+Here is the result:
+
+![](images/imgs_image_classification/image18.jpg){fig-align="center" width="6.5in"}
+
+In more detail:
+
+![](images/imgs_image_classification/image21.jpg){fig-align="center" width="6.5in"}
+
+## Image Classification (non-official) Benchmark
+
+Several development boards can be used for embedded machine learning (tinyML). The most common ones for low-power Computer Vision applications are the ESP32 CAM, the Seeed XIAO ESP32S3 Sense, the Arduino Nicla Vision, and the Arduino Portenta.
+
+![](images/imgs_image_classification/image19.jpg){fig-align="center" width="6.5in"}
+
+Taking the opportunity, the same trained model was deployed on the ESP-CAM, the XIAO, and the Portenta (for the latter, the model was retrained using grayscale images to be compatible with its camera). Here is the result, deploying the models as Arduino libraries:
+
+![](images/imgs_image_classification/image4.jpg){fig-align="center" width="6.5in"}
+
+## Conclusion
+
+Before we finish, consider that Computer Vision is more than just image classification. For example, you can develop Edge Machine Learning projects around vision in several areas, such as:
+
+- **Autonomous Vehicles**: Use sensor fusion, lidar data, and computer vision algorithms to navigate and make decisions.
+
+- **Healthcare**: Automated diagnosis of diseases through MRI, X-ray, and CT scan image analysis.
+
+- **Retail**: Automated checkout systems that identify products as they pass through a scanner.
+
+- **Security and Surveillance**: Facial recognition, anomaly detection, and object tracking in real-time video feeds.
+
+- **Augmented Reality**: Object detection and classification to overlay digital information in the real world.
+
+- **Industrial Automation**: Visual inspection of products, predictive maintenance, and robot and drone guidance.
+
+- **Agriculture**: Drone-based crop monitoring and automated harvesting.
+
+- **Natural Language Processing**: Image captioning and visual question answering.
+
+- **Gesture Recognition**: For gaming, sign language translation, and human-machine interaction.
+
+- **Content Recommendation**: Image-based recommendation systems in e-commerce.
diff --git a/images/benchmarking/coco.png b/images/benchmarking/coco.png new file mode 100644 index 00000000..1bdecf5f Binary files /dev/null and b/images/benchmarking/coco.png differ diff --git a/images/benchmarking/cover_ai_benchmarking.png b/images/benchmarking/cover_ai_benchmarking.png new file mode 100644 index 00000000..eb5c83c1 Binary files /dev/null and b/images/benchmarking/cover_ai_benchmarking.png differ diff --git a/images/benchmarking/dynabench.png b/images/benchmarking/dynabench.png new file mode 100644 index 00000000..31f51a14 Binary files /dev/null and b/images/benchmarking/dynabench.png differ diff --git a/images/benchmarking/end2end.png b/images/benchmarking/end2end.png new file mode 100644 index 00000000..e68a7741 Binary files /dev/null and b/images/benchmarking/end2end.png differ diff --git a/images/benchmarking/hardware_lottery.png b/images/benchmarking/hardware_lottery.png new file mode 100644 index 00000000..05a986aa Binary files /dev/null and b/images/benchmarking/hardware_lottery.png differ diff --git a/images/benchmarking/imagenet.png b/images/benchmarking/imagenet.png new file mode 100644 index 00000000..2d7ca548 Binary files /dev/null and b/images/benchmarking/imagenet.png differ diff --git a/images/benchmarking/mlperf_tiny.png b/images/benchmarking/mlperf_tiny.png new file mode 100644 index 00000000..b051a281 Binary files /dev/null and b/images/benchmarking/mlperf_tiny.png differ diff --git a/images/benchmarking/mnist.png b/images/benchmarking/mnist.png new file mode 100644 index 00000000..ac3509cb Binary files /dev/null and b/images/benchmarking/mnist.png differ diff --git a/images/benchmarking/trifecta.png b/images/benchmarking/trifecta.png new file mode 100644 index 00000000..e0e5d0c9 Binary files /dev/null and b/images/benchmarking/trifecta.png differ diff --git a/images/cover_ai_workflow.png b/images/cover_ai_workflow.png new file mode 100644 index 00000000..74e2c8eb Binary files /dev/null and b/images/cover_ai_workflow.png differ diff --git a/images/cover_data_engineering.png b/images/cover_data_engineering.png new file mode 100644 index 00000000..35641f43 Binary files /dev/null and b/images/cover_data_engineering.png differ diff --git a/images/cover_dl_primer.png b/images/cover_dl_primer.png new file mode 100644 index 00000000..4c021821 Binary files /dev/null and b/images/cover_dl_primer.png differ diff --git a/images/cover_embedded_ai.png b/images/cover_embedded_ai.png new file mode 100644 index 00000000..5b9b1cac Binary files /dev/null and b/images/cover_embedded_ai.png differ diff --git a/images/cover_embedded_sys.png b/images/cover_embedded_sys.png new file mode 100644 index 00000000..4ed51e9e Binary files /dev/null and b/images/cover_embedded_sys.png differ diff --git a/images/cover_ml_frameworks.png b/images/cover_ml_frameworks.png new file mode 100644 index 00000000..b84ddfcc Binary files /dev/null and b/images/cover_ml_frameworks.png differ diff --git a/images_4/media/image1.jpg b/images/imgs_image_classification/image1.jpg similarity index 100% rename from images_4/media/image1.jpg rename to images/imgs_image_classification/image1.jpg diff --git a/images_4/media/image10.jpg b/images/imgs_image_classification/image10.jpg similarity index 100% rename from images_4/media/image10.jpg rename to images/imgs_image_classification/image10.jpg diff --git a/images_4/media/image11.png b/images/imgs_image_classification/image11.png similarity index 100% rename from images_4/media/image11.png rename to images/imgs_image_classification/image11.png diff --git 
a/images_4/media/image12.png b/images/imgs_image_classification/image12.png similarity index 100% rename from images_4/media/image12.png rename to images/imgs_image_classification/image12.png diff --git a/images_4/media/image13.jpg b/images/imgs_image_classification/image13.jpg similarity index 100% rename from images_4/media/image13.jpg rename to images/imgs_image_classification/image13.jpg diff --git a/images_4/media/image14.png b/images/imgs_image_classification/image14.png similarity index 100% rename from images_4/media/image14.png rename to images/imgs_image_classification/image14.png diff --git a/images_4/media/image15.jpg b/images/imgs_image_classification/image15.jpg similarity index 100% rename from images_4/media/image15.jpg rename to images/imgs_image_classification/image15.jpg diff --git a/images_4/media/image16.png b/images/imgs_image_classification/image16.png similarity index 100% rename from images_4/media/image16.png rename to images/imgs_image_classification/image16.png diff --git a/images_4/media/image17.png b/images/imgs_image_classification/image17.png similarity index 100% rename from images_4/media/image17.png rename to images/imgs_image_classification/image17.png diff --git a/images_4/media/image18.jpg b/images/imgs_image_classification/image18.jpg similarity index 100% rename from images_4/media/image18.jpg rename to images/imgs_image_classification/image18.jpg diff --git a/images_4/media/image19.jpg b/images/imgs_image_classification/image19.jpg similarity index 100% rename from images_4/media/image19.jpg rename to images/imgs_image_classification/image19.jpg diff --git a/images_4/media/image2.jpg b/images/imgs_image_classification/image2.jpg similarity index 100% rename from images_4/media/image2.jpg rename to images/imgs_image_classification/image2.jpg diff --git a/images_4/media/image20.png b/images/imgs_image_classification/image20.png similarity index 100% rename from images_4/media/image20.png rename to images/imgs_image_classification/image20.png diff --git a/images_4/media/image21.jpg b/images/imgs_image_classification/image21.jpg similarity index 100% rename from images_4/media/image21.jpg rename to images/imgs_image_classification/image21.jpg diff --git a/images_4/media/image22.png b/images/imgs_image_classification/image22.png similarity index 100% rename from images_4/media/image22.png rename to images/imgs_image_classification/image22.png diff --git a/images_4/media/image23.jpg b/images/imgs_image_classification/image23.jpg similarity index 100% rename from images_4/media/image23.jpg rename to images/imgs_image_classification/image23.jpg diff --git a/images_4/media/image24.png b/images/imgs_image_classification/image24.png similarity index 100% rename from images_4/media/image24.png rename to images/imgs_image_classification/image24.png diff --git a/images_4/media/image25.jpg b/images/imgs_image_classification/image25.jpg similarity index 100% rename from images_4/media/image25.jpg rename to images/imgs_image_classification/image25.jpg diff --git a/images_4/media/image26.png b/images/imgs_image_classification/image26.png similarity index 100% rename from images_4/media/image26.png rename to images/imgs_image_classification/image26.png diff --git a/images_4/media/image27.jpg b/images/imgs_image_classification/image27.jpg similarity index 100% rename from images_4/media/image27.jpg rename to images/imgs_image_classification/image27.jpg diff --git a/images_4/media/image28.jpg b/images/imgs_image_classification/image28.jpg similarity index 100% rename 
from images_4/media/image28.jpg rename to images/imgs_image_classification/image28.jpg diff --git a/images_4/media/image29.png b/images/imgs_image_classification/image29.png similarity index 100% rename from images_4/media/image29.png rename to images/imgs_image_classification/image29.png diff --git a/images_4/media/image3.png b/images/imgs_image_classification/image3.png similarity index 100% rename from images_4/media/image3.png rename to images/imgs_image_classification/image3.png diff --git a/images_4/media/image30.png b/images/imgs_image_classification/image30.png similarity index 100% rename from images_4/media/image30.png rename to images/imgs_image_classification/image30.png diff --git a/images_4/media/image31.jpg b/images/imgs_image_classification/image31.jpg similarity index 100% rename from images_4/media/image31.jpg rename to images/imgs_image_classification/image31.jpg diff --git a/images_4/media/image32.jpg b/images/imgs_image_classification/image32.jpg similarity index 100% rename from images_4/media/image32.jpg rename to images/imgs_image_classification/image32.jpg diff --git a/images_4/media/image33.png b/images/imgs_image_classification/image33.png similarity index 100% rename from images_4/media/image33.png rename to images/imgs_image_classification/image33.png diff --git a/images_4/media/image34.png b/images/imgs_image_classification/image34.png similarity index 100% rename from images_4/media/image34.png rename to images/imgs_image_classification/image34.png diff --git a/images_4/media/image35.jpg b/images/imgs_image_classification/image35.jpg similarity index 100% rename from images_4/media/image35.jpg rename to images/imgs_image_classification/image35.jpg diff --git a/images_4/media/image36.jpg b/images/imgs_image_classification/image36.jpg similarity index 100% rename from images_4/media/image36.jpg rename to images/imgs_image_classification/image36.jpg diff --git a/images_4/media/image37.png b/images/imgs_image_classification/image37.png similarity index 100% rename from images_4/media/image37.png rename to images/imgs_image_classification/image37.png diff --git a/images_4/media/image38.jpg b/images/imgs_image_classification/image38.jpg similarity index 100% rename from images_4/media/image38.jpg rename to images/imgs_image_classification/image38.jpg diff --git a/images_4/media/image39.png b/images/imgs_image_classification/image39.png similarity index 100% rename from images_4/media/image39.png rename to images/imgs_image_classification/image39.png diff --git a/images_4/media/image4.jpg b/images/imgs_image_classification/image4.jpg similarity index 100% rename from images_4/media/image4.jpg rename to images/imgs_image_classification/image4.jpg diff --git a/images_4/media/image40.png b/images/imgs_image_classification/image40.png similarity index 100% rename from images_4/media/image40.png rename to images/imgs_image_classification/image40.png diff --git a/images_4/media/image41.jpg b/images/imgs_image_classification/image41.jpg similarity index 100% rename from images_4/media/image41.jpg rename to images/imgs_image_classification/image41.jpg diff --git a/images_4/media/image42.png b/images/imgs_image_classification/image42.png similarity index 100% rename from images_4/media/image42.png rename to images/imgs_image_classification/image42.png diff --git a/images_4/media/image43.png b/images/imgs_image_classification/image43.png similarity index 100% rename from images_4/media/image43.png rename to images/imgs_image_classification/image43.png diff --git 
a/images_4/media/image44.png b/images/imgs_image_classification/image44.png similarity index 100% rename from images_4/media/image44.png rename to images/imgs_image_classification/image44.png diff --git a/images_4/media/image45.png b/images/imgs_image_classification/image45.png similarity index 100% rename from images_4/media/image45.png rename to images/imgs_image_classification/image45.png diff --git a/images_4/media/image46.png b/images/imgs_image_classification/image46.png similarity index 100% rename from images_4/media/image46.png rename to images/imgs_image_classification/image46.png diff --git a/images_4/media/image47.jpg b/images/imgs_image_classification/image47.jpg similarity index 100% rename from images_4/media/image47.jpg rename to images/imgs_image_classification/image47.jpg diff --git a/images_4/media/image48.png b/images/imgs_image_classification/image48.png similarity index 100% rename from images_4/media/image48.png rename to images/imgs_image_classification/image48.png diff --git a/images_4/media/image5.png b/images/imgs_image_classification/image5.png similarity index 100% rename from images_4/media/image5.png rename to images/imgs_image_classification/image5.png diff --git a/images_4/media/image6.jpg b/images/imgs_image_classification/image6.jpg similarity index 100% rename from images_4/media/image6.jpg rename to images/imgs_image_classification/image6.jpg diff --git a/images_4/media/image7.jpg b/images/imgs_image_classification/image7.jpg similarity index 100% rename from images_4/media/image7.jpg rename to images/imgs_image_classification/image7.jpg diff --git a/images_4/media/image8.png b/images/imgs_image_classification/image8.png similarity index 100% rename from images_4/media/image8.png rename to images/imgs_image_classification/image8.png diff --git a/images_4/media/image9.jpg b/images/imgs_image_classification/image9.jpg similarity index 100% rename from images_4/media/image9.jpg rename to images/imgs_image_classification/image9.jpg diff --git a/images/imgs_kws_feature_eng/.ipynb_checkpoints/time_vs_freq-checkpoint.png b/images/imgs_kws_feature_eng/.ipynb_checkpoints/time_vs_freq-checkpoint.png new file mode 100644 index 00000000..a91e1707 Binary files /dev/null and b/images/imgs_kws_feature_eng/.ipynb_checkpoints/time_vs_freq-checkpoint.png differ diff --git a/images/imgs_kws_feature_eng/cover.jpg b/images/imgs_kws_feature_eng/cover.jpg new file mode 100644 index 00000000..8d8e8dc7 Binary files /dev/null and b/images/imgs_kws_feature_eng/cover.jpg differ diff --git a/images/imgs_kws_feature_eng/frame_to_fft.jpg b/images/imgs_kws_feature_eng/frame_to_fft.jpg new file mode 100644 index 00000000..bfb5bdc7 Binary files /dev/null and b/images/imgs_kws_feature_eng/frame_to_fft.jpg differ diff --git a/images/imgs_kws_feature_eng/frame_wind.jpg b/images/imgs_kws_feature_eng/frame_wind.jpg new file mode 100644 index 00000000..bb766860 Binary files /dev/null and b/images/imgs_kws_feature_eng/frame_wind.jpg differ diff --git a/images/imgs_kws_feature_eng/kws_diagram.jpg b/images/imgs_kws_feature_eng/kws_diagram.jpg new file mode 100644 index 00000000..e9e17d1a Binary files /dev/null and b/images/imgs_kws_feature_eng/kws_diagram.jpg differ diff --git a/images/imgs_kws_feature_eng/melbank-1_00.hires.jpg b/images/imgs_kws_feature_eng/melbank-1_00.hires.jpg new file mode 100644 index 00000000..5a93e86e Binary files /dev/null and b/images/imgs_kws_feature_eng/melbank-1_00.hires.jpg differ diff --git a/images/imgs_kws_feature_eng/mfcc_final.jpg 
b/images/imgs_kws_feature_eng/mfcc_final.jpg new file mode 100644 index 00000000..bec68dd1 Binary files /dev/null and b/images/imgs_kws_feature_eng/mfcc_final.jpg differ diff --git a/images/imgs_kws_feature_eng/time_vs_freq.jpg b/images/imgs_kws_feature_eng/time_vs_freq.jpg new file mode 100644 index 00000000..6e9ae476 Binary files /dev/null and b/images/imgs_kws_feature_eng/time_vs_freq.jpg differ diff --git a/images/imgs_kws_feature_eng/yes_no_mfcc.jpg b/images/imgs_kws_feature_eng/yes_no_mfcc.jpg new file mode 100644 index 00000000..d252862f Binary files /dev/null and b/images/imgs_kws_feature_eng/yes_no_mfcc.jpg differ diff --git a/images/imgs_kws_nicla/KWS_PROJ_INF_BLK.jpg b/images/imgs_kws_nicla/KWS_PROJ_INF_BLK.jpg new file mode 100644 index 00000000..886079a1 Binary files /dev/null and b/images/imgs_kws_nicla/KWS_PROJ_INF_BLK.jpg differ diff --git a/images/imgs_kws_nicla/KWS_PROJ_TRAIN_BLK.jpg b/images/imgs_kws_nicla/KWS_PROJ_TRAIN_BLK.jpg new file mode 100644 index 00000000..3e3d02ce Binary files /dev/null and b/images/imgs_kws_nicla/KWS_PROJ_TRAIN_BLK.jpg differ diff --git a/images/imgs_kws_nicla/MFCC.jpg b/images/imgs_kws_nicla/MFCC.jpg new file mode 100644 index 00000000..f5fe2752 Binary files /dev/null and b/images/imgs_kws_nicla/MFCC.jpg differ diff --git a/images/imgs_kws_nicla/audio_capt.jpg b/images/imgs_kws_nicla/audio_capt.jpg new file mode 100644 index 00000000..3af7c31a Binary files /dev/null and b/images/imgs_kws_nicla/audio_capt.jpg differ diff --git a/images/imgs_kws_nicla/code_ide.jpg b/images/imgs_kws_nicla/code_ide.jpg new file mode 100644 index 00000000..29a6022a Binary files /dev/null and b/images/imgs_kws_nicla/code_ide.jpg differ diff --git a/images/imgs_kws_nicla/dataset.jpg b/images/imgs_kws_nicla/dataset.jpg new file mode 100644 index 00000000..0acc65b8 Binary files /dev/null and b/images/imgs_kws_nicla/dataset.jpg differ diff --git a/images/imgs_kws_nicla/deploy.jpg b/images/imgs_kws_nicla/deploy.jpg new file mode 100644 index 00000000..b8795102 Binary files /dev/null and b/images/imgs_kws_nicla/deploy.jpg differ diff --git a/images/imgs_kws_nicla/ei_MFCC.jpg b/images/imgs_kws_nicla/ei_MFCC.jpg new file mode 100644 index 00000000..cd78a331 Binary files /dev/null and b/images/imgs_kws_nicla/ei_MFCC.jpg differ diff --git a/images/imgs_kws_nicla/ei_data_collection.jpg b/images/imgs_kws_nicla/ei_data_collection.jpg new file mode 100644 index 00000000..17630c85 Binary files /dev/null and b/images/imgs_kws_nicla/ei_data_collection.jpg differ diff --git a/images/imgs_kws_nicla/feat_expl.jpg b/images/imgs_kws_nicla/feat_expl.jpg new file mode 100644 index 00000000..26a39788 Binary files /dev/null and b/images/imgs_kws_nicla/feat_expl.jpg differ diff --git a/images/imgs_kws_nicla/files.jpg b/images/imgs_kws_nicla/files.jpg new file mode 100644 index 00000000..bfd6c435 Binary files /dev/null and b/images/imgs_kws_nicla/files.jpg differ diff --git a/images/imgs_kws_nicla/hey_google.png b/images/imgs_kws_nicla/hey_google.png new file mode 100644 index 00000000..a244f375 Binary files /dev/null and b/images/imgs_kws_nicla/hey_google.png differ diff --git a/images/imgs_kws_nicla/impulse.jpg b/images/imgs_kws_nicla/impulse.jpg new file mode 100644 index 00000000..cb9d0ae7 Binary files /dev/null and b/images/imgs_kws_nicla/impulse.jpg differ diff --git a/images/imgs_kws_nicla/install_zip.jpg b/images/imgs_kws_nicla/install_zip.jpg new file mode 100644 index 00000000..7771227b Binary files /dev/null and b/images/imgs_kws_nicla/install_zip.jpg differ diff --git 
a/images/imgs_kws_nicla/model.jpg b/images/imgs_kws_nicla/model.jpg new file mode 100644 index 00000000..e48ef1a3 Binary files /dev/null and b/images/imgs_kws_nicla/model.jpg differ diff --git a/images/imgs_kws_nicla/models_1d-2d.jpg b/images/imgs_kws_nicla/models_1d-2d.jpg new file mode 100644 index 00000000..f04dc3c5 Binary files /dev/null and b/images/imgs_kws_nicla/models_1d-2d.jpg differ diff --git a/images/imgs_kws_nicla/pa_block.jpg b/images/imgs_kws_nicla/pa_block.jpg new file mode 100644 index 00000000..79700aab Binary files /dev/null and b/images/imgs_kws_nicla/pa_block.jpg differ diff --git a/images/imgs_kws_nicla/pers_ass.jpg b/images/imgs_kws_nicla/pers_ass.jpg new file mode 100644 index 00000000..b0871d82 Binary files /dev/null and b/images/imgs_kws_nicla/pers_ass.jpg differ diff --git a/images/imgs_kws_nicla/phone.jpg b/images/imgs_kws_nicla/phone.jpg new file mode 100644 index 00000000..8f818118 Binary files /dev/null and b/images/imgs_kws_nicla/phone.jpg differ diff --git a/images/imgs_kws_nicla/split.jpg b/images/imgs_kws_nicla/split.jpg new file mode 100644 index 00000000..b2aa68f1 Binary files /dev/null and b/images/imgs_kws_nicla/split.jpg differ diff --git a/images/imgs_kws_nicla/test.jpg b/images/imgs_kws_nicla/test.jpg new file mode 100644 index 00000000..bcd0d7fd Binary files /dev/null and b/images/imgs_kws_nicla/test.jpg differ diff --git a/images/imgs_kws_nicla/train_errors.jpg b/images/imgs_kws_nicla/train_errors.jpg new file mode 100644 index 00000000..1402f29a Binary files /dev/null and b/images/imgs_kws_nicla/train_errors.jpg differ diff --git a/images/imgs_kws_nicla/train_graphs.jpg b/images/imgs_kws_nicla/train_graphs.jpg new file mode 100644 index 00000000..05074f76 Binary files /dev/null and b/images/imgs_kws_nicla/train_graphs.jpg differ diff --git a/images/imgs_kws_nicla/train_result.jpg b/images/imgs_kws_nicla/train_result.jpg new file mode 100644 index 00000000..6b40fe61 Binary files /dev/null and b/images/imgs_kws_nicla/train_result.jpg differ diff --git a/images/imgs_kws_nicla/upload.jpg b/images/imgs_kws_nicla/upload.jpg new file mode 100644 index 00000000..a38a481e Binary files /dev/null and b/images/imgs_kws_nicla/upload.jpg differ diff --git a/images/imgs_kws_nicla/yes.jpg b/images/imgs_kws_nicla/yes.jpg new file mode 100644 index 00000000..6741e31a Binary files /dev/null and b/images/imgs_kws_nicla/yes.jpg differ diff --git a/images/imgs_kws_nicla/yes_no.jpg b/images/imgs_kws_nicla/yes_no.jpg new file mode 100644 index 00000000..a53fbce2 Binary files /dev/null and b/images/imgs_kws_nicla/yes_no.jpg differ diff --git a/images_2/media/image1.png b/images/imgs_niclav_sys/image1.png similarity index 100% rename from images_2/media/image1.png rename to images/imgs_niclav_sys/image1.png diff --git a/images_2/media/image10.png b/images/imgs_niclav_sys/image10.png similarity index 100% rename from images_2/media/image10.png rename to images/imgs_niclav_sys/image10.png diff --git a/images_2/media/image11.jpg b/images/imgs_niclav_sys/image11.jpg similarity index 100% rename from images_2/media/image11.jpg rename to images/imgs_niclav_sys/image11.jpg diff --git a/images_2/media/image12.png b/images/imgs_niclav_sys/image12.png similarity index 100% rename from images_2/media/image12.png rename to images/imgs_niclav_sys/image12.png diff --git a/images_2/media/image13.jpg b/images/imgs_niclav_sys/image13.jpg similarity index 100% rename from images_2/media/image13.jpg rename to images/imgs_niclav_sys/image13.jpg diff --git a/images_2/media/image14.jpg 
b/images/imgs_niclav_sys/image14.jpg similarity index 100% rename from images_2/media/image14.jpg rename to images/imgs_niclav_sys/image14.jpg diff --git a/images_2/media/image15.jpg b/images/imgs_niclav_sys/image15.jpg similarity index 100% rename from images_2/media/image15.jpg rename to images/imgs_niclav_sys/image15.jpg diff --git a/images_2/media/image16.png b/images/imgs_niclav_sys/image16.png similarity index 100% rename from images_2/media/image16.png rename to images/imgs_niclav_sys/image16.png diff --git a/images_2/media/image17.png b/images/imgs_niclav_sys/image17.png similarity index 100% rename from images_2/media/image17.png rename to images/imgs_niclav_sys/image17.png diff --git a/images_2/media/image18.jpg b/images/imgs_niclav_sys/image18.jpg similarity index 100% rename from images_2/media/image18.jpg rename to images/imgs_niclav_sys/image18.jpg diff --git a/images_2/media/image19.jpg b/images/imgs_niclav_sys/image19.jpg similarity index 100% rename from images_2/media/image19.jpg rename to images/imgs_niclav_sys/image19.jpg diff --git a/images_2/media/image2.jpg b/images/imgs_niclav_sys/image2.jpg similarity index 100% rename from images_2/media/image2.jpg rename to images/imgs_niclav_sys/image2.jpg diff --git a/images_2/media/image20.jpg b/images/imgs_niclav_sys/image20.jpg similarity index 100% rename from images_2/media/image20.jpg rename to images/imgs_niclav_sys/image20.jpg diff --git a/images_2/media/image21.png b/images/imgs_niclav_sys/image21.png similarity index 100% rename from images_2/media/image21.png rename to images/imgs_niclav_sys/image21.png diff --git a/images_2/media/image22.jpg b/images/imgs_niclav_sys/image22.jpg similarity index 100% rename from images_2/media/image22.jpg rename to images/imgs_niclav_sys/image22.jpg diff --git a/images_2/media/image23.jpg b/images/imgs_niclav_sys/image23.jpg similarity index 100% rename from images_2/media/image23.jpg rename to images/imgs_niclav_sys/image23.jpg diff --git a/images/imgs_niclav_sys/image24.jpg b/images/imgs_niclav_sys/image24.jpg new file mode 100644 index 00000000..084cff5d Binary files /dev/null and b/images/imgs_niclav_sys/image24.jpg differ diff --git a/images_2/media/image25.png b/images/imgs_niclav_sys/image25.png similarity index 100% rename from images_2/media/image25.png rename to images/imgs_niclav_sys/image25.png diff --git a/images_2/media/image26.jpg b/images/imgs_niclav_sys/image26.jpg similarity index 100% rename from images_2/media/image26.jpg rename to images/imgs_niclav_sys/image26.jpg diff --git a/images_2/media/image27.png b/images/imgs_niclav_sys/image27.png similarity index 100% rename from images_2/media/image27.png rename to images/imgs_niclav_sys/image27.png diff --git a/images_2/media/image28.png b/images/imgs_niclav_sys/image28.png similarity index 100% rename from images_2/media/image28.png rename to images/imgs_niclav_sys/image28.png diff --git a/images_2/media/image29.jpg b/images/imgs_niclav_sys/image29.jpg similarity index 100% rename from images_2/media/image29.jpg rename to images/imgs_niclav_sys/image29.jpg diff --git a/images_2/media/image3.png b/images/imgs_niclav_sys/image3.png similarity index 100% rename from images_2/media/image3.png rename to images/imgs_niclav_sys/image3.png diff --git a/images_2/media/image4.png b/images/imgs_niclav_sys/image4.png similarity index 100% rename from images_2/media/image4.png rename to images/imgs_niclav_sys/image4.png diff --git a/images_2/media/image5.png b/images/imgs_niclav_sys/image5.png similarity index 100% rename from 
images_2/media/image5.png rename to images/imgs_niclav_sys/image5.png diff --git a/images_2/media/image6.png b/images/imgs_niclav_sys/image6.png similarity index 100% rename from images_2/media/image6.png rename to images/imgs_niclav_sys/image6.png diff --git a/images_2/media/image7.png b/images/imgs_niclav_sys/image7.png similarity index 100% rename from images_2/media/image7.png rename to images/imgs_niclav_sys/image7.png diff --git a/images_2/media/image8.png b/images/imgs_niclav_sys/image8.png similarity index 100% rename from images_2/media/image8.png rename to images/imgs_niclav_sys/image8.png diff --git a/images_2/media/image9.png b/images/imgs_niclav_sys/image9.png similarity index 100% rename from images_2/media/image9.png rename to images/imgs_niclav_sys/image9.png diff --git a/images/imgs_object_detection_fomo/cv_obj_detect.jpg b/images/imgs_object_detection_fomo/cv_obj_detect.jpg new file mode 100644 index 00000000..7918fa17 Binary files /dev/null and b/images/imgs_object_detection_fomo/cv_obj_detect.jpg differ diff --git a/images/imgs_object_detection_fomo/data_folder.jpg b/images/imgs_object_detection_fomo/data_folder.jpg new file mode 100644 index 00000000..ca0c0b31 Binary files /dev/null and b/images/imgs_object_detection_fomo/data_folder.jpg differ diff --git a/images/imgs_object_detection_fomo/img_1.png b/images/imgs_object_detection_fomo/img_1.png new file mode 100644 index 00000000..3cf9f13e Binary files /dev/null and b/images/imgs_object_detection_fomo/img_1.png differ diff --git a/images/imgs_object_detection_fomo/img_10.png b/images/imgs_object_detection_fomo/img_10.png new file mode 100644 index 00000000..d3aae23e Binary files /dev/null and b/images/imgs_object_detection_fomo/img_10.png differ diff --git a/images/imgs_object_detection_fomo/img_11.jpg b/images/imgs_object_detection_fomo/img_11.jpg new file mode 100644 index 00000000..b6da7df9 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_11.jpg differ diff --git a/images/imgs_object_detection_fomo/img_12.png b/images/imgs_object_detection_fomo/img_12.png new file mode 100644 index 00000000..ac4550c2 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_12.png differ diff --git a/images/imgs_object_detection_fomo/img_13.jpg b/images/imgs_object_detection_fomo/img_13.jpg new file mode 100644 index 00000000..bf3683d9 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_13.jpg differ diff --git a/images/imgs_object_detection_fomo/img_14.png b/images/imgs_object_detection_fomo/img_14.png new file mode 100644 index 00000000..be87da2c Binary files /dev/null and b/images/imgs_object_detection_fomo/img_14.png differ diff --git a/images/imgs_object_detection_fomo/img_15.png b/images/imgs_object_detection_fomo/img_15.png new file mode 100644 index 00000000..6b20b7f2 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_15.png differ diff --git a/images/imgs_object_detection_fomo/img_16.png b/images/imgs_object_detection_fomo/img_16.png new file mode 100644 index 00000000..88e3ceb9 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_16.png differ diff --git a/images/imgs_object_detection_fomo/img_17.png b/images/imgs_object_detection_fomo/img_17.png new file mode 100644 index 00000000..5c1b7669 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_17.png differ diff --git a/images/imgs_object_detection_fomo/img_18.png b/images/imgs_object_detection_fomo/img_18.png new file mode 100644 index 00000000..b82d860a Binary files /dev/null and 
b/images/imgs_object_detection_fomo/img_18.png differ diff --git a/images/imgs_object_detection_fomo/img_19.png b/images/imgs_object_detection_fomo/img_19.png new file mode 100644 index 00000000..af210f25 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_19.png differ diff --git a/images/imgs_object_detection_fomo/img_2.png b/images/imgs_object_detection_fomo/img_2.png new file mode 100644 index 00000000..c00e93d2 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_2.png differ diff --git a/images/imgs_object_detection_fomo/img_20.png b/images/imgs_object_detection_fomo/img_20.png new file mode 100644 index 00000000..6880f101 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_20.png differ diff --git a/images/imgs_object_detection_fomo/img_21.png b/images/imgs_object_detection_fomo/img_21.png new file mode 100644 index 00000000..ef3e4af4 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_21.png differ diff --git a/images/imgs_object_detection_fomo/img_22.png b/images/imgs_object_detection_fomo/img_22.png new file mode 100644 index 00000000..b49d9abb Binary files /dev/null and b/images/imgs_object_detection_fomo/img_22.png differ diff --git a/images/imgs_object_detection_fomo/img_23.png b/images/imgs_object_detection_fomo/img_23.png new file mode 100644 index 00000000..ee070d80 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_23.png differ diff --git a/images/imgs_object_detection_fomo/img_24.png b/images/imgs_object_detection_fomo/img_24.png new file mode 100644 index 00000000..5057db8d Binary files /dev/null and b/images/imgs_object_detection_fomo/img_24.png differ diff --git a/images/imgs_object_detection_fomo/img_25.png b/images/imgs_object_detection_fomo/img_25.png new file mode 100644 index 00000000..e3ad0add Binary files /dev/null and b/images/imgs_object_detection_fomo/img_25.png differ diff --git a/images/imgs_object_detection_fomo/img_26.png b/images/imgs_object_detection_fomo/img_26.png new file mode 100644 index 00000000..9802e642 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_26.png differ diff --git a/images/imgs_object_detection_fomo/img_27.jpg b/images/imgs_object_detection_fomo/img_27.jpg new file mode 100644 index 00000000..55ec9e05 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_27.jpg differ diff --git a/images/imgs_object_detection_fomo/img_28.jpg b/images/imgs_object_detection_fomo/img_28.jpg new file mode 100644 index 00000000..3d2caadc Binary files /dev/null and b/images/imgs_object_detection_fomo/img_28.jpg differ diff --git a/images/imgs_object_detection_fomo/img_3.png b/images/imgs_object_detection_fomo/img_3.png new file mode 100644 index 00000000..0d854e0e Binary files /dev/null and b/images/imgs_object_detection_fomo/img_3.png differ diff --git a/images/imgs_object_detection_fomo/img_4.png b/images/imgs_object_detection_fomo/img_4.png new file mode 100644 index 00000000..4654de3f Binary files /dev/null and b/images/imgs_object_detection_fomo/img_4.png differ diff --git a/images/imgs_object_detection_fomo/img_5.jpg b/images/imgs_object_detection_fomo/img_5.jpg new file mode 100644 index 00000000..349cd606 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_5.jpg differ diff --git a/images/imgs_object_detection_fomo/img_6.png b/images/imgs_object_detection_fomo/img_6.png new file mode 100644 index 00000000..6771c762 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_6.png differ diff --git 
a/images/imgs_object_detection_fomo/img_7.png b/images/imgs_object_detection_fomo/img_7.png new file mode 100644 index 00000000..fac11fd1 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_7.png differ diff --git a/images/imgs_object_detection_fomo/img_8.png b/images/imgs_object_detection_fomo/img_8.png new file mode 100644 index 00000000..08efe60e Binary files /dev/null and b/images/imgs_object_detection_fomo/img_8.png differ diff --git a/images/imgs_object_detection_fomo/img_9.png b/images/imgs_object_detection_fomo/img_9.png new file mode 100644 index 00000000..68aedc11 Binary files /dev/null and b/images/imgs_object_detection_fomo/img_9.png differ diff --git a/images/imgs_object_detection_fomo/proj_goal.jpg b/images/imgs_object_detection_fomo/proj_goal.jpg new file mode 100644 index 00000000..7873a4cd Binary files /dev/null and b/images/imgs_object_detection_fomo/proj_goal.jpg differ diff --git a/images/imgs_object_detection_fomo/samples.jpg b/images/imgs_object_detection_fomo/samples.jpg new file mode 100644 index 00000000..157c73e2 Binary files /dev/null and b/images/imgs_object_detection_fomo/samples.jpg differ diff --git a/images_2/media/image24.gif b/images_2/media/image24.gif deleted file mode 100644 index 0ed868cb..00000000 Binary files a/images_2/media/image24.gif and /dev/null differ diff --git a/images_ml_frameworks/image1.png b/images_ml_frameworks/image1.png new file mode 100644 index 00000000..ba76db9c Binary files /dev/null and b/images_ml_frameworks/image1.png differ diff --git a/images_ml_frameworks/image2.png b/images_ml_frameworks/image2.png new file mode 100644 index 00000000..a3ca27e6 Binary files /dev/null and b/images_ml_frameworks/image2.png differ diff --git a/images_ml_frameworks/image3.png b/images_ml_frameworks/image3.png new file mode 100644 index 00000000..91043427 Binary files /dev/null and b/images_ml_frameworks/image3.png differ diff --git a/images_ml_frameworks/image4.png b/images_ml_frameworks/image4.png new file mode 100644 index 00000000..70dfb2b0 Binary files /dev/null and b/images_ml_frameworks/image4.png differ diff --git a/images_ml_frameworks/image5.png b/images_ml_frameworks/image5.png new file mode 100644 index 00000000..644fed12 Binary files /dev/null and b/images_ml_frameworks/image5.png differ diff --git a/images_ml_frameworks/image6.png b/images_ml_frameworks/image6.png new file mode 100644 index 00000000..39feb8f8 Binary files /dev/null and b/images_ml_frameworks/image6.png differ diff --git a/images_ml_frameworks/image7.png b/images_ml_frameworks/image7.png new file mode 100644 index 00000000..b8a50fec Binary files /dev/null and b/images_ml_frameworks/image7.png differ diff --git a/images_ml_frameworks/image8.png b/images_ml_frameworks/image8.png new file mode 100644 index 00000000..e193c05f Binary files /dev/null and b/images_ml_frameworks/image8.png differ diff --git a/kws_feature_eng.qmd b/kws_feature_eng.qmd new file mode 100644 index 00000000..44739100 --- /dev/null +++ b/kws_feature_eng.qmd @@ -0,0 +1,143 @@ +# Audio Feature Engineering {.unnumbered} + +## Introduction + +In this hands-on tutorial, the emphasis is on the critical role that feature engineering plays in optimizing the performance of machine learning models applied to audio classification tasks, such as speech recognition. 
It is essential to be aware that the performance of any machine learning model relies heavily on the quality of the features used, and we will deal with the "under-the-hood" mechanics of feature extraction, mainly focusing on Mel-frequency Cepstral Coefficients (MFCCs), a cornerstone in the field of audio signal processing.
+
+Machine learning models, especially traditional algorithms, don't understand audio waves. They understand numbers arranged in some meaningful way, i.e., features. These features encapsulate the characteristics of the audio signal, making it easier for models to distinguish between different sounds.
+
+> This tutorial will deal with generating features specifically for audio classification. This can be particularly interesting for applying machine learning to a variety of audio data, whether for speech recognition, music categorization, insect classification based on wingbeat sounds, or other sound analysis tasks.
+
+## The KWS
+
+The most common TinyML application is Keyword Spotting (KWS), a subset of the broader field of speech recognition. While general speech recognition aims to transcribe all spoken words into text, Keyword Spotting focuses on detecting specific "keywords" or "wake words" in a continuous audio stream. The system is trained to recognize these keywords, which are predefined phrases or words, such as *yes* or *no*. In short, KWS is a specialized form of speech recognition with its own set of challenges and requirements.
+
+Here is a typical KWS process using an MFCC Feature Converter:
+
+![](images/imgs_kws_feature_eng/kws_diagram.jpg){fig-align="center" width="7.29in"}
+
+#### Applications of KWS:
+
+- **Voice Assistants**: In devices like Amazon's Alexa or Google Home, KWS is used to detect the wake word ("Alexa" or "Hey Google") to activate the device.
+- **Voice-Activated Controls**: In automotive or industrial settings, KWS can be used to initiate specific commands like "Start engine" or "Turn off lights."
+- **Security Systems**: Voice-activated security systems may use KWS to authenticate users based on a spoken passphrase.
+- **Telecommunication Services**: Customer service lines may use KWS to route calls based on spoken keywords.
+
+#### Differences from General Speech Recognition:
+
+- **Computational Efficiency**: KWS is usually designed to be less computationally intensive than full speech recognition, as it only needs to recognize a small set of phrases.
+- **Real-time Processing**: KWS often operates in real-time and is optimized for low-latency detection of keywords.
+- **Resource Constraints**: KWS models are often designed to be lightweight, so they can run on devices with limited computational resources, like microcontrollers or mobile phones.
+- **Focused Task**: While general speech recognition models are trained to handle a broad range of vocabulary and accents, KWS models are fine-tuned to recognize specific keywords accurately, often in noisy environments.
+
+## Introduction to Audio Signals
+
+Understanding the basic properties of audio signals is crucial for effective feature extraction and, ultimately, for successfully applying machine learning algorithms in audio classification tasks. Audio signals are complex waveforms that capture fluctuations in air pressure over time. These signals can be characterized by several fundamental attributes: sampling rate, frequency, and amplitude.
+
+- **Frequency and Amplitude**: [Frequency](https://en.wikipedia.org/wiki/Audio_frequency) refers to the number of oscillations a waveform undergoes per unit time and is measured in hertz (Hz). In the context of audio signals, different frequencies correspond to different pitches. [Amplitude](https://en.wikipedia.org/wiki/Amplitude), on the other hand, measures the magnitude of the oscillations and correlates with the loudness of the sound. Both frequency and amplitude are essential features that capture audio signals' tonal and rhythmic qualities.
+
+- **Sampling Rate**: The [sampling rate](https://en.wikipedia.org/wiki/Sampling_(signal_processing)), often denoted in Hertz (Hz), defines the number of samples taken per second when digitizing an analog signal. A higher sampling rate allows for a more accurate digital representation of the signal but also demands more computational resources for processing. Typical sampling rates include 44.1 kHz for CD-quality audio and 16 kHz or 8 kHz for speech recognition tasks. Understanding the trade-offs in selecting an appropriate sampling rate is essential for balancing accuracy and computational efficiency. In general, with TinyML projects, we work with 16 kHz. Although music tones can be heard at frequencies up to 20 kHz, voice maxes out at 8 kHz. Traditional telephone systems use an 8 kHz sampling frequency.
+
+> For an accurate representation of the signal, the sampling rate must be at least twice the highest frequency present in the signal.
+
+- **Time Domain vs. Frequency Domain**: Audio signals can be analyzed in the time and frequency domains. In the time domain, a signal is represented as a waveform where the amplitude is plotted against time. This representation helps to observe temporal features like onset and duration, but the signal's tonal characteristics are not well represented. Conversely, a frequency domain representation provides a view of the signal's constituent frequencies and their respective amplitudes, typically obtained via a Fourier Transform. This is invaluable for tasks that require understanding the signal's spectral content, such as identifying musical notes or speech phonemes (our case).
+
+The image below shows the words `YES` and `NO` with typical representations in the Time (Raw Audio) and Frequency domains:
+
+![](images/imgs_kws_feature_eng/time_vs_freq.jpg){fig-align="center" width="6.5in"}
+
+### Why Not Raw Audio?
+
+While using raw audio data directly for machine learning tasks may seem tempting, this approach presents several challenges that make it less suitable for building robust and efficient models.
+
+Using raw audio data for Keyword Spotting (KWS), for example, on TinyML devices poses challenges due to its high dimensionality (using a 16 kHz sampling rate), computational complexity for capturing temporal features, susceptibility to noise, and lack of semantically meaningful features, making feature extraction techniques like MFCCs a more practical choice for resource-constrained applications.
+
+Here are some additional details of the critical issues associated with using raw audio:
+
+- **High Dimensionality**: Audio signals, especially those sampled at high rates, result in large amounts of data. For example, a 1-second audio clip sampled at 16 kHz will have 16,000 individual data points. High-dimensional data increases computational complexity, leading to longer training times and higher computational costs, making it impractical for resource-constrained environments.
Furthermore, the wide dynamic range of audio signals requires a significant number of bits per sample, while conveying little useful information.
+
+- **Temporal Dependencies**: Raw audio signals have temporal structures that simple machine learning models may find hard to capture. While recurrent neural networks like [LSTMs](https://annals-csis.org/Volume_18/drp/pdf/185.pdf) can model such dependencies, they are computationally intensive and tricky to train on tiny devices.
+
+- **Noise and Variability**: Raw audio signals often contain background noise and other non-essential elements affecting model performance. Additionally, the same sound can have different characteristics based on various factors such as distance from the microphone, the orientation of the sound source, and acoustic properties of the environment, adding to the complexity of the data.
+
+- **Lack of Semantic Meaning**: Raw audio doesn't inherently contain semantically meaningful features for classification tasks. Features like pitch, tempo, and spectral characteristics, which can be crucial for speech recognition, are not directly accessible from raw waveform data.
+
+- **Signal Redundancy**: Audio signals often contain redundant information, with certain portions of the signal contributing little to no value to the task at hand. This redundancy can make learning inefficient and potentially lead to overfitting.
+
+For these reasons, feature extraction techniques such as Mel-frequency Cepstral Coefficients (MFCCs), Mel-Frequency Energies (MFEs), and simple Spectrograms are commonly used to transform raw audio data into a more manageable and informative format. These features capture the essential characteristics of the audio signal while reducing dimensionality and noise, facilitating more effective machine learning.
+
+## Introduction to MFCCs
+
+### What are MFCCs?
+
+[Mel-frequency Cepstral Coefficients (MFCCs)](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) are a set of features derived from the spectral content of an audio signal. They are based on human auditory perceptions and are commonly used to capture the phonetic characteristics of an audio signal. The MFCCs are computed through a multi-step process that includes pre-emphasis, framing, windowing, applying the Fast Fourier Transform (FFT) to convert the signal to the frequency domain, and finally, applying the Discrete Cosine Transform (DCT). The result is a compact representation of the original audio signal's spectral characteristics.
+
+The image below shows the words `YES` and `NO` in their MFCC representation:
+
+![](images/imgs_kws_feature_eng/yes_no_mfcc.jpg){fig-align="center" width="6.5in"}
+
+> This [video](https://youtu.be/SJo7vPgRlBQ?si=KSgzmDg8DtSVqzXp) explains the Mel Frequency Cepstral Coefficients (MFCC) and how to compute them.
+
+### Why are MFCCs important?
+
+MFCCs are crucial for several reasons, particularly in the context of Keyword Spotting (KWS) and TinyML:
+
+- **Dimensionality Reduction**: MFCCs capture essential spectral characteristics of the audio signal while significantly reducing the dimensionality of the data, making it ideal for resource-constrained TinyML applications.
+- **Robustness**: MFCCs are less susceptible to noise and variations in pitch and amplitude, providing a more stable and robust feature set for audio classification tasks.
+- **Human Auditory System Modeling**: The Mel scale in MFCCs approximates the human ear's response to different frequencies, making them practical for speech recognition where human-like perception is desired.
+- **Computational Efficiency**: The process of calculating MFCCs is computationally efficient, making it well-suited for real-time applications on hardware with limited computational resources.
+
+In summary, MFCCs offer a balance of information richness and computational efficiency, making them popular for audio classification tasks, particularly in constrained environments like TinyML.
+
+### Computing MFCCs
+
+The computation of Mel-frequency Cepstral Coefficients (MFCCs) involves several key steps. Let's walk through these, which are particularly important for Keyword Spotting (KWS) tasks on TinyML devices.
+
+- **Pre-emphasis**: The first step is pre-emphasis, which is applied to accentuate the high-frequency components of the audio signal and balance the frequency spectrum. This is achieved by applying a filter that amplifies the difference between consecutive samples. The formula for pre-emphasis is $y(t) = x(t) - \alpha x(t-1)$, where $\alpha$ is the pre-emphasis factor, typically around 0.97.
+
+- **Framing**: Audio signals are divided into short frames (the *frame length*), usually 20 to 40 milliseconds. This is based on the assumption that frequencies in a signal are stationary over a short period. Framing helps in analyzing the signal in such small time slots. The *frame stride* (or step) is the displacement between the start of one frame and the next; consecutive frames can be placed back to back or overlapping.
+
+- **Windowing**: Each frame is then windowed to minimize the discontinuities at the frame boundaries. A commonly used window function is the Hamming window. Windowing prepares the signal for a Fourier transform by minimizing the edge effects. The image below shows three frames (10, 20, and 30) and the time samples after windowing (note that the frame length and frame stride are 20 ms):
+
+![](images/imgs_kws_feature_eng/frame_wind.jpg){fig-align="center" width="6.5in"}
+
+- **Fast Fourier Transform (FFT)**: The FFT is applied to each windowed frame to convert it from the time domain to the frequency domain. The FFT gives us a complex-valued representation that includes both magnitude and phase information. However, for MFCCs, only the magnitude is used to calculate the Power Spectrum. The power spectrum is the square of the magnitude spectrum and measures the energy present at each frequency component.
+
+> The power spectrum $P(f)$ of a signal $x(t)$ is defined as $P(f) = |X(f)|^2$, where $X(f)$ is the Fourier Transform of $x(t)$. By squaring the magnitude of the Fourier Transform, we emphasize *stronger* frequencies over *weaker* ones, thereby capturing more relevant spectral characteristics of the audio signal. This is important in applications like audio classification, speech recognition, and Keyword Spotting (KWS), where the focus is on identifying distinct frequency patterns that characterize different classes of audio or phonemes in speech.
+
+![](images/imgs_kws_feature_eng/frame_to_fft.jpg){fig-align="center" width="6.5in"}
+
+- **Mel Filter Banks**: The frequency domain is then mapped to the [Mel scale](https://en.wikipedia.org/wiki/Mel_scale), which approximates the human ear's response to different frequencies. The idea is to extract more features (more filter banks) in the lower frequencies and fewer in the high frequencies.
Thus, it performs well on sounds that the human ear can distinguish. Typically, 20 to 40 triangular filters extract the Mel-frequency energies. These energies are then log-transformed to convert multiplicative factors into additive ones, making them more suitable for further processing.
+
+![](images/imgs_kws_feature_eng/melbank-1_00.hires.jpg){fig-align="center" width="6.5in"}
+
+- **Discrete Cosine Transform (DCT)**: The last step is to apply the [Discrete Cosine Transform (DCT)](https://en.wikipedia.org/wiki/Discrete_cosine_transform) to the log Mel energies. The DCT helps to decorrelate the energies, effectively compressing the data and retaining only the most discriminative features. Usually, the first 12-13 DCT coefficients are retained, forming the final MFCC feature vector.
+
+![](images/imgs_kws_feature_eng/mfcc_final.jpg){fig-align="center" width="6.5in"}
+
+## Hands-On using Python
+
+Let's apply what we discussed while working on an actual audio sample. Open the notebook on Google Colab and extract the MFCC features from your audio samples: [\[Open In Colab\]](https://colab.research.google.com/github/Mjrovai/Arduino_Nicla_Vision/blob/main/KWS/Audio_Data_Analysis.ipynb)
+
+## Conclusion
+
+### What Feature Extraction technique should we use?
+
+Mel-frequency Cepstral Coefficients (MFCCs), Mel-Frequency Energies (MFEs), and Spectrograms are all techniques for representing audio data, and each is helpful in different contexts.
+
+In general, MFCCs are more focused on capturing the envelope of the power spectrum, which makes them less sensitive to fine-grained spectral details but more robust to noise. This is often desirable for speech-related tasks. On the other hand, spectrograms or MFEs preserve more detailed frequency information, which can be advantageous in tasks that require discrimination based on fine-grained spectral content.
+
+#### MFCCs are particularly strong for:
+
+1. **Speech Recognition**: MFCCs are excellent for identifying phonetic content in speech signals.
+2. **Speaker Identification**: They can be used to distinguish between different speakers based on voice characteristics.
+3. **Emotion Recognition**: MFCCs can capture the nuanced variations in speech indicative of emotional states.
+4. **Keyword Spotting**: Especially in TinyML, where low computational complexity and small feature size are crucial.
+
+#### Spectrograms or MFEs are often more suitable for:
+
+1. **Music Analysis**: Spectrograms can capture harmonic and timbral structures in music, which is essential for tasks like genre classification, instrument recognition, or music transcription.
+2. **Environmental Sound Classification**: In recognizing non-speech, environmental sounds (e.g., rain, wind, traffic), the full spectrogram can provide more discriminative features.
+3. **Birdsong Identification**: The intricate details of bird calls are often better captured using spectrograms.
+4. **Bioacoustic Signal Processing**: In applications like dolphin or bat call analysis, the fine-grained frequency information in a spectrogram can be essential.
+5. **Audio Quality Assurance**: Spectrograms are often used in professional audio analysis to identify unwanted noises, clicks, or other artifacts.
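+
+To make this comparison concrete, here is a minimal sketch (an illustrative example only, assuming `librosa` is installed and that `sample.wav` is a hypothetical 1-second keyword recording) that computes a log Mel spectrogram (an MFE-like representation) and the corresponding MFCCs for a 16 kHz clip, using frame parameters similar to the ones discussed in this chapter:
+
+``` python
+# Illustrative comparison: log Mel spectrogram (MFE-like) vs. MFCCs
+# Assumptions: librosa is available; "sample.wav" is a hypothetical 1 s keyword clip
+import librosa
+
+SR = 16000                                   # 16 kHz, as used in this chapter
+y, sr = librosa.load("sample.wav", sr=SR)    # resample to 16 kHz
+y = y[:SR]                                   # keep exactly 1 second
+
+# Log Mel spectrogram: 40 triangular Mel filters, 512-point FFT, 20 ms stride
+mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=512,
+                                     hop_length=320, n_mels=40)
+log_mel = librosa.power_to_db(mel)
+
+# MFCCs: DCT applied on top of the log Mel energies, keeping 13 coefficients
+mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=512,
+                            hop_length=320, n_mels=40)
+
+print("log Mel spectrogram shape:", log_mel.shape)  # (40, ~51): detailed spectral view
+print("MFCC shape:", mfcc.shape)                    # (13, ~51): compact envelope view
+```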
diff --git a/kws_nicla.qmd b/kws_nicla.qmd
new file mode 100644
index 00000000..58f0ab1c
--- /dev/null
+++ b/kws_nicla.qmd
@@ -0,0 +1,369 @@
+# Keyword Spotting (KWS) {.unnumbered}
+
+## Introduction
+
+Having already explored the Nicla Vision board in the *Image Classification* and *Object Detection* applications, we are now shifting our focus to voice-activated applications with a project on Keyword Spotting (KWS).
+
+As introduced in the *Feature Engineering for Audio Classification* Hands-On tutorial, Keyword Spotting (KWS) is integrated into many voice recognition systems, enabling devices to respond to specific words or phrases. While this technology underpins popular devices like Google Assistant or Amazon Alexa, it's equally applicable and feasible on smaller, low-power devices. This tutorial will guide you through implementing a KWS system using TinyML on the Nicla Vision development board equipped with a digital microphone.
+
+Our model will be designed to recognize keywords that can trigger device wake-up or specific actions, bringing them to life with voice-activated commands.
+
+## How does a voice assistant work?
+
+As mentioned, *voice assistants* on the market, like Google Home or Amazon Echo Dot, only react to humans when they are "woken up" by particular keywords, such as "Hey Google" for the first one and "Alexa" for the second.
+
+![](images/imgs_kws_nicla/hey_google.png){fig-align="center" width="6.5in"}
+
+In other words, recognizing voice commands is based on a multi-stage model, or Cascade Detection.
+
+![](images/imgs_kws_nicla/pa_block.jpg){fig-align="center" width="6.5in"}
+
+**Stage 1:** A small microprocessor inside the Echo Dot or Google Home continuously listens, waiting for the keyword to be spotted, using a TinyML model at the edge (KWS application).
+
+**Stage 2:** Only when triggered by the KWS application on Stage 1 is the data sent to the cloud and processed by a larger model.
+
+The video below shows an example of a Google Assistant being programmed on a Raspberry Pi (Stage 2), with an Arduino Nano 33 BLE as the TinyML device (Stage 1).
+
+{{< video https://youtu.be/e_OPgcnsyvM width="480" height="270" center >}}
+
+> To explore the above Google Assistant project, please see the tutorial: [Building an Intelligent Voice Assistant From Scratch](https://www.hackster.io/mjrobot/building-an-intelligent-voice-assistant-from-scratch-2199c3).
+
+In this KWS project, we will focus on Stage 1 (KWS, or Keyword Spotting), where we will use the Nicla Vision, whose digital microphone will be used to spot the keyword.
+
+## The KWS Hands-On Project
+
+The diagram below gives an idea of how the final KWS application should work (during inference):
+
+![](images/imgs_kws_nicla/KWS_PROJ_INF_BLK.jpg){fig-align="center" width="6.5in"}
+
+Our KWS application will recognize four classes of sound:
+
+- **YES** (Keyword 1)
+- **NO** (Keyword 2)
+- **NOISE** (no words spoken; only background noise is present)
+- **UNKNOWN** (a mix of words other than YES and NO)
+
+> For real-world projects, it is always advisable to include other sounds besides the keywords, such as "Noise" (or Background) and "Unknown."
+
+### The Machine Learning workflow
+
+The main component of the KWS application is its model.
So, we must train such a model with our specific keywords, noise, and other words (the "unknown"): + +![](images/imgs_kws_nicla/KWS_PROJ_TRAIN_BLK.jpg){fig-align="center" width="6.5in"} + +## Dataset + +The critical component of any Machine Learning Workflow is the **dataset**. Once we have decided on specific keywords, in our case (*YES* and NO), we can take advantage of the dataset developed by Pete Warden, ["Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition](https://arxiv.org/pdf/1804.03209.pdf)." This dataset has 35 keywords (with +1,000 samples each), such as yes, no, stop, and go. In words such as *yes* and *no,* we can get 1,500 samples. + +You can download a small portion of the dataset from Edge Studio ([Keyword spotting pre-built dataset](https://docs.edgeimpulse.com/docs/pre-built-datasets/keyword-spotting)), which includes samples from the four classes we will use in this project: yes, no, noise, and background. For this, follow the steps below: + +- Download the [keywords dataset.](https://cdn.edgeimpulse.com/datasets/keywords2.zip) +- Unzip the file to a location of your choice. + +### Uploading the dataset to the Edge Impulse Studio + +Initiate a new project at Edge Impulse Studio (EIS) and select the `Upload Existing Data` tool in the `Data Acquisition` section. Choose the files to be uploaded: + +![](images/imgs_kws_nicla/files.jpg){fig-align="center" width="6.5in"} + +Define the Label, select `Automatically split between train and test,` and `Upload data` to the EIS. Repeat for all classes. + +![](images/imgs_kws_nicla/upload.jpg){fig-align="center" width="6.5in"} + +The dataset will now appear in the `Data acquisition` section. Note that the approximately 6,000 samples (1,500 for each class) are split into Train (4,800) and Test (1,200) sets. + +![](images/imgs_kws_nicla/dataset.jpg){fig-align="center" width="6.5in"} + +### Capturing additional Audio Data + +Although we have a lot of data from Pete's dataset, collecting some words spoken by us is advised. When working with accelerometers, creating a dataset with data captured by the same type of sensor is essential. In the case of *sound*, this is optional because what we will classify is, in reality, *audio* data. + +> The key difference between sound and audio is the type of energy. Sound is mechanical perturbation (longitudinal sound waves) that propagate through a medium, causing variations of pressure in it. Audio is an electrical (analog or digital) signal representing sound. + +When we pronounce a keyword, the sound waves should be converted to audio data. The conversion should be done by sampling the signal generated by the microphone at a 16KHz frequency with 16-bit per sample amplitude. + +So, any device that can generate audio data with this basic specification (16KHz/16bits) will work fine. As a *device*, we can use the NiclaV, a computer, or even your mobile phone. + +![](images/imgs_kws_nicla/audio_capt.jpg){fig-align="center" width="6.5in"} + +#### Using the NiclaV and the Edge Impulse Studio + +As we learned in the chapter *Setup Nicla Vision*, EIS officially supports the Nicla Vision, which simplifies the capture of the data from its sensors, including the microphone. So, please create a new project on EIS and connect the Nicla to it, following these steps: + +- Download the last updated [EIS Firmware](https://cdn.edgeimpulse.com/firmware/arduino-nicla-vision.zip) and unzip it. 
+ +- Open the zip file on your computer and select the uploader corresponding to your OS: + +![](images/imgs_niclav_sys/image17.png){fig-align="center" width="4.416666666666667in"} + +- Put the NiclaV in Boot Mode by pressing the reset button twice. + +![](https://84771188-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FGEgcCk4PkS5Pa6uBabld%2Fuploads%2Fgit-blob-111b26f413cd411b29594c377868bba901863233%2Fnicla_bootloader.gif?alt=media){fig-align="center" width="6.5in"} + +- Upload the binary *arduino-nicla-vision.bin* to your board by running the batch code corresponding to your OS. + +Go to your project on EIS, and on the `Data Acquisition tab`, select `WebUSB`. A window will pop up; choose the option that shows that the `Nicla is paired` and press `[Connect]`. + +You can choose which sensor data to pick in the `Collect Data` section on the `Data Acquisition` tab. Select: `Built-in microphone`, define your `label` (for example, *yes*), the sampling `Frequency`\[16000Hz\], and the `Sample length (in milliseconds)`, for example \[10s\]. `Start sampling`. + +![](images/imgs_kws_nicla/ei_data_collection.jpg){fig-align="center" width="6.5in"} + +Data on Pete's dataset have a length of 1s, but the recorded samples are 10s long and must be split into 1s samples. Click on `three dots` after the sample name and select `Split sample`. + +A window will pop up with the Split tool. + +![](images/imgs_kws_nicla/split.jpg){fig-align="center" width="6.5in"} + +Once inside the tool, split the data into 1-second (1000 ms) records. If necessary, add or remove segments. This procedure should be repeated for all new samples. + +#### Using a smartphone and the EI Studio + +You can also use your PC or smartphone to capture audio data, using a sampling frequency of 16KHz and a bit depth of 16. + +Go to `Devices`, scan the `QR Code` using your phone, and click on the link. A data Collection app will appear in your browser. Select `Collecting Audio`, and define your `Label`, data capture `Length,` and `Category`. + +![](images/imgs_kws_nicla/phone.jpg){fig-align="center" width="6.5in"} + +Repeat the same procedure used with the NiclaV. + +> Note that any app, such as [Audacity](https://www.audacityteam.org/), can be used for audio recording, provided you use 16KHz/16-bit depth samples. + +## Creating Impulse (Pre-Process / Model definition) + +*An* **impulse** *takes raw data, uses signal processing to extract features, and then uses a learning block to classify new data.* + +### Impulse Design + +![](images/imgs_kws_nicla/impulse.jpg){fig-align="center" width="6.5in"} + +First, we will take the data points with a 1-second window, augmenting the data and sliding that window in 500ms intervals. Note that the option zero-pad data is set. It is essential to fill with 'zeros' samples smaller than 1 second (in some cases, some samples can result smaller than the 1000 ms window on the split tool to avoid noise and spikes). + +Each 1-second audio sample should be pre-processed and converted to an image (for example, 13 x 49 x 1). As discussed in the *Feature Engineering for Audio Classification* Hands-On tutorial, we will use `Audio (MFCC)`, which extracts features from audio signals using [Mel Frequency Cepstral Coefficients](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum), which are well suited for the human voice, our case here. + +Next, we select the `Classification` block to build our model from scratch using a Convolution Neural Network (CNN). 
+
+> Alternatively, you can use the `Transfer Learning (Keyword Spotting)` block, which fine-tunes a pre-trained keyword spotting model on your data. This approach has good performance with relatively small keyword datasets.
+
+### Pre-Processing (MFCC)
+
+The next step is to generate the features that will be used for training in the following phase.
+
+We could keep the default parameter values, but we will use the DSP `Autotune parameters` option.
+
+![](images/imgs_kws_nicla/ei_MFCC.jpg){fig-align="center" width="6.5in"}
+
+We will take the `Raw features` (our 1-second, 16 kHz sampled audio data) and use the MFCC processing block to calculate the `Processed features`. For every 16,000 raw features (16,000 x 1 second), we will get 637 processed features (13 x 49).
+
+![](images/imgs_kws_nicla/MFCC.jpg){fig-align="center" width="6.5in"}
+
+The result shows that we only used a small amount of memory to pre-process the data (16 KB), with a latency of 34 ms, which is excellent. For example, on an Arduino Nano (Cortex-M4f \@ 64MHz), the same pre-processing would take around 480 ms. The parameters chosen, such as the `FFT length` \[512\], will significantly impact the latency.
+
+Now, let's `Save parameters` and move to the `Generated features` tab, where the actual features will be generated. Using [UMAP](https://umap-learn.readthedocs.io/en/latest/), a dimension reduction technique, the `Feature explorer` shows how the features are distributed on a two-dimensional plot.
+
+![](images/imgs_kws_nicla/feat_expl.jpg){fig-align="center" width="5.9in"}
+
+The result seems OK, with a visually clear separation between *yes* features (in red) and *no* features (in blue). The *unknown* features seem nearer to the *no* space than to the *yes* one, which suggests that the keyword *no* is more prone to false positives.
+
+### Going under the hood
+
+To understand better how the raw sound is preprocessed, look at the *Feature Engineering for Audio Classification* chapter. You can play with the MFCC feature generation by downloading this [notebook](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/KWS/KWS_MFCC_Analysis.ipynb) from GitHub or [\[Opening it In Colab\]](https://colab.research.google.com/github/Mjrovai/Arduino_Nicla_Vision/blob/main/KWS/KWS_MFCC_Analysis.ipynb).
+
+## Model Design and Training
+
+We will use a simple Convolution Neural Network (CNN) model, tested with 1D and 2D convolutions. The basic architecture has two blocks of Convolution + MaxPooling (\[8\] and \[16\] filters, respectively) and a Dropout of \[0.25\] for the 1D and \[0.5\] for the 2D. For the last layer, after Flattening, we have \[4\] neurons, one for each class:
+
+![](images/imgs_kws_nicla/models_1d-2d.jpg){fig-align="center" width="6.5in"}
+
+As hyper-parameters, we will have a `Learning Rate` of \[0.005\] and a model trained for \[100\] epochs. We will also include a data augmentation method based on [SpecAugment](https://arxiv.org/abs/1904.08779). We trained the 1D and the 2D models with the same hyperparameters. The 1D architecture had a better overall result (90.5% accuracy, compared with 88% for the 2D), so we will use the 1D version.
+
+![](images/imgs_kws_nicla/train_result.jpg){fig-align="center" width="6.5in"}
+
+> Using 1D convolutions is more efficient because they require fewer parameters than 2D convolutions, making them more suitable for resource-constrained environments.
+
+It is also interesting to pay attention to the 1D Confusion Matrix. The F1 Score for `yes` is 95%, and for `no`, 91%.
That was expected, given what we saw in the Feature Explorer (`no` and `unknown` are at a close distance). To try to improve the result, you can closely inspect the samples that produced errors.
+
+![](images/imgs_kws_nicla/train_errors.jpg){fig-align="center" width="6.5in"}
+
+Listen to the samples that went wrong. For example, for `yes`, most of the mistakes were related to a yes pronounced as "yeh". You can acquire additional samples and then retrain your model.
+
+### Going under the hood
+
+If you want to understand what is happening "under the hood," you can download the pre-processed dataset (`MFCC training data`) from the `Dashboard` tab and run this [Jupyter Notebook](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/KWS/KWS_CNN_training.ipynb), playing with the code, or [\[Opening it In Colab\]](https://colab.research.google.com/github/Mjrovai/Arduino_Nicla_Vision/blob/main/KWS/KWS_CNN_training.ipynb). For example, you can analyze the accuracy by each epoch:
+
+![](images/imgs_kws_nicla/train_graphs.jpg){fig-align="center" width="6.5in"}
+
+## Testing
+
+Testing the model with the data reserved for testing (Test Data), we got an accuracy of approximately 76%.
+
+![](images/imgs_kws_nicla/test.jpg){fig-align="center" width="6.5in"}
+
+Inspecting the F1 score, we can see that for YES, we got 0.90, an excellent result since we expect to use this keyword as the primary "trigger" for our KWS project. The worst result (0.70) is for UNKNOWN, which is OK.
+
+For NO, we got 0.72, which was expected, but to improve this result, we can move the samples that were not correctly classified to the training dataset and then repeat the training process.
+
+### Live Classification
+
+We can proceed to the project's next step, but also consider that it is possible to perform `Live Classification` using the NiclaV or a smartphone to capture live samples, testing the trained model before deployment on our device.
+
+## Deploy and Inference
+
+The EIS will package all the needed libraries, preprocessing functions, and trained models, downloading them to your computer. Go to the `Deployment` section, select `Arduino Library`, and at the bottom, choose `Quantized (Int8)` and press `Build`.
+
+![](images/imgs_kws_nicla/deploy.jpg){fig-align="center" width="5.29in"}
+
+When the `Build` button is selected, a zip file will be created and downloaded to your computer. On your Arduino IDE, go to the `Sketch` tab, select the option `Add .ZIP Library`, and choose the .zip file downloaded by EIS:
+
+![](images/imgs_kws_nicla/install_zip.jpg){fig-align="center" width="6.5in"}
+
+Now, it is time for a real test. We will make inferences while completely disconnected from the EIS. Let's use the NiclaV code example created when we deployed the Arduino Library.
+
+In your Arduino IDE, go to the `File/Examples` tab, look for your project, and select `nicla-vision/nicla-vision_microphone` (or `nicla-vision_microphone_continuous`).
+
+![](images/imgs_kws_nicla/code_ide.jpg){fig-align="center" width="6.5in"}
+
+Press the reset button twice to put the NiclaV in boot mode, upload the sketch to your board, and test some real inferences:
+
+![](images/imgs_kws_nicla/yes_no.jpg){fig-align="center" width="6.5in"}
+
+## Post-processing
+
+Now that we know the model is working, since it detects our keywords, let's modify the code so we can see the results with the NiclaV completely offline (disconnected from the PC and powered by a battery, a power bank, or an independent 5V power supply).
The idea is that whenever the keyword YES is detected, the Green LED will light; if a NO is heard, the Red LED will light; if an UNKNOWN is heard, the Blue LED will light; and in the presence of noise (no keyword), the LEDs will be OFF.
+
+We should modify one of the code examples. Let's do it now with the `nicla-vision_microphone_continuous`.
+
+Start with initializing the LEDs:
+
+``` cpp
+...
+void setup()
+{
+  // Once you finish debugging your code, you can comment or delete the Serial part of the code
+  Serial.begin(115200);
+  while (!Serial);
+  Serial.println("Inferencing - Nicla Vision KWS with LEDs");
+
+  // Pins for the built-in RGB LEDs on the Arduino NiclaV
+  pinMode(LEDR, OUTPUT);
+  pinMode(LEDG, OUTPUT);
+  pinMode(LEDB, OUTPUT);
+
+  // Ensure the LEDs are OFF by default.
+  // Note: The RGB LEDs on the Arduino Nicla Vision
+  // are ON when the pin is LOW, OFF when HIGH.
+  digitalWrite(LEDR, HIGH);
+  digitalWrite(LEDG, HIGH);
+  digitalWrite(LEDB, HIGH);
+...
+}
+```
+
+Create two functions. The first, `turn_off_leds()`, turns off all the RGB LEDs:
+
+``` cpp
+/**
+ * @brief turn_off_leds function - turn-off all RGB LEDs
+ */
+void turn_off_leds(){
+  digitalWrite(LEDR, HIGH);
+  digitalWrite(LEDG, HIGH);
+  digitalWrite(LEDB, HIGH);
+}
+```
+
+The second, `turn_on_leds()`, is used to turn on the RGB LED that corresponds to the most probable result of the classifier:
+
+``` cpp
+/**
+ * @brief turn_on_leds function used to turn on the RGB LEDs
+ * @param[in] pred_index
+ *            no:      [0] ==> Red ON
+ *            noise:   [1] ==> ALL OFF
+ *            unknown: [2] ==> Blue ON
+ *            yes:     [3] ==> Green ON
+ */
+void turn_on_leds(int pred_index) {
+  switch (pred_index)
+  {
+    case 0:
+      turn_off_leds();
+      digitalWrite(LEDR, LOW);
+      break;
+
+    case 1:
+      turn_off_leds();
+      break;
+
+    case 2:
+      turn_off_leds();
+      digitalWrite(LEDB, LOW);
+      break;
+
+    case 3:
+      turn_off_leds();
+      digitalWrite(LEDG, LOW);
+      break;
+  }
+}
+```
+
+And change the `// print the predictions` portion of the code in `loop()`:
+
+``` cpp
+...
+
+  if (++print_results >= (EI_CLASSIFIER_SLICES_PER_MODEL_WINDOW)) {
+    // print the predictions
+    ei_printf("Predictions ");
+    ei_printf("(DSP: %d ms., Classification: %d ms., Anomaly: %d ms.)",
+        result.timing.dsp, result.timing.classification, result.timing.anomaly);
+    ei_printf(": \n");
+
+    int   pred_index = 0;  // Initialize pred_index
+    float pred_value = 0;  // Initialize pred_value
+
+    for (size_t ix = 0; ix < EI_CLASSIFIER_LABEL_COUNT; ix++) {
+      if (result.classification[ix].value > pred_value){
+        pred_index = ix;
+        pred_value = result.classification[ix].value;
+      }
+      // ei_printf("    %s: ", result.classification[ix].label);
+      // ei_printf_float(result.classification[ix].value);
+      // ei_printf("\n");
+    }
+    ei_printf("  PREDICTION: ==> %s with probability %.2f\n",
+              result.classification[pred_index].label, pred_value);
+    turn_on_leds(pred_index);
+
+#if EI_CLASSIFIER_HAS_ANOMALY == 1
+    ei_printf("    anomaly score: ");
+    ei_printf_float(result.anomaly);
+    ei_printf("\n");
+#endif
+
+    print_results = 0;
+  }
+}
+
+...
+```
+
+You can find the complete code on the [project's GitHub](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main/KWS/nicla_vision_microphone_continuous_LED).
+
+Upload the sketch to your board and test some real inferences. The idea is that the Green LED will be ON whenever the keyword YES is detected, the Red LED will light for a NO, and any other word will turn on the Blue LED. All the LEDs should be off if silence or background noise is present.
Remember that the same procedure can "trigger" an external device to perform a desired action instead of turning on an LED, as we saw in the introduction.
+
+{{< video https://youtu.be/25Rd76OTXLY width="480" height="270" center >}}
+
+## Conclusion
+
+> You will find the notebooks and code used in this hands-on tutorial on the [GitHub](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main/KWS) repository.
+
+Before we finish, consider that Sound Classification is more than just voice. For example, you can develop TinyML projects around sound in several areas, such as:
+
+- **Security** (Broken Glass detection, Gunshot detection)
+- **Industry** (Anomaly Detection)
+- **Medical** (Snore, Cough, Pulmonary diseases)
+- **Nature** (Beehive control, insect sound, poaching mitigation)
diff --git a/niclav_sys.qmd b/niclav_sys.qmd
new file mode 100644
index 00000000..f410f69e
--- /dev/null
+++ b/niclav_sys.qmd
@@ -0,0 +1,307 @@
+# Setup Nicla Vision {.unnumbered}
+
+## Introduction
+
+The [Arduino Nicla Vision](https://docs.arduino.cc/hardware/nicla-vision) (sometimes called *NiclaV*) is a development board that includes two processors that can run tasks in parallel. It is part of a family of development boards with the same form factor but designed for specific tasks, such as the [Nicla Sense ME](https://www.bosch-sensortec.com/software-tools/tools/arduino-nicla-sense-me/) and the [Nicla Voice](https://store-usa.arduino.cc/products/nicla-voice?_gl=1*l3abc6*_ga*MTQ3NzE4Mjk4Mi4xNjQwMDIwOTk5*_ga_NEXN8H46L5*MTY5NjM0Mzk1My4xMDIuMS4xNjk2MzQ0MjQ1LjAuMC4w). The *Niclas* can efficiently run processes created with TensorFlow™ Lite. For example, one of the cores of the NiclaV can run a computer vision algorithm on the fly (inference), while the other executes lower-level operations like controlling a motor, handling communication, or acting as a user interface. The onboard wireless module allows the simultaneous management of WiFi and Bluetooth Low Energy (BLE) connectivity.
+
+![](images/imgs_niclav_sys/image29.jpg){fig-align="center" width="6.5in"}
+
+## Hardware
+
+### Two Parallel Cores
+
+The central processor is the dual-core [STM32H747](https://content.arduino.cc/assets/Arduino-Portenta-H7_Datasheet_stm32h747xi.pdf?_gl=1*6quciu*_ga*MTQ3NzE4Mjk4Mi4xNjQwMDIwOTk5*_ga_NEXN8H46L5*MTY0NzQ0NTg1My4xMS4xLjE2NDc0NDYzMzkuMA..), including a Cortex® M7 at 480 MHz and a Cortex® M4 at 240 MHz. The two cores communicate via a Remote Procedure Call mechanism that seamlessly allows calling functions on the other processor. Both processors share all the on-chip peripherals and can run:
+
+- Arduino sketches on top of the Arm® Mbed™ OS
+
+- Native Mbed™ applications
+
+- MicroPython / JavaScript via an interpreter
+
+- TensorFlow™ Lite
+
+![](images/imgs_niclav_sys/image22.jpg){fig-align="center" width="6.5in"}
+
+### Memory
+
+Memory is crucial for embedded machine learning projects. The NiclaV board can host up to 16 MB of QSPI Flash for storage. However, it is essential to consider that machine learning inferences use the MCU's SRAM, and the STM32H747 has only 1 MB, shared by both processors. The MCU also incorporates 2 MB of Flash, mainly for code storage.
+
+### Sensors
+
+- **Camera**: A GC2145 2 MP Color CMOS Camera.
+
+- **Microphone**: The `MP34DT05` is an ultra-compact, low-power, omnidirectional, digital MEMS microphone built with a capacitive sensing element and the IC interface.
+
+- **6-Axis IMU**: 3D gyroscope and 3D accelerometer data from the `LSM6DSOX` 6-axis IMU.
+ +- **Time of Flight Sensor**: The `VL53L1CBV0FY` Time-of-Flight sensor adds accurate and low power-ranging capabilities to the Nicla Vision. The invisible near-infrared VCSEL laser (including the analog driver) is encapsulated with receiving optics in an all-in-one small module below the camera. + +## Arduino IDE Installation + +Start connecting the board (*microUSB*) to your computer: + +![](images/imgs_niclav_sys/image14.jpg){fig-align="center" width="6.5in"} + +Install the Mbed OS core for Nicla boards in the Arduino IDE. Having the IDE open, navigate to `Tools > Board > Board Manager`, look for Arduino Nicla Vision on the search window, and install the board. + +![](images/imgs_niclav_sys/image2.jpg){fig-align="center" width="6.5in"} + +Next, go to `Tools > Board > Arduino Mbed OS Nicla Boards` and select `Arduino Nicla Vision`. Having your board connected to the USB, you should see the Nicla on Port and select it. + +> Open the Blink sketch on Examples/Basic and run it using the IDE Upload button. You should see the Built-in LED (green RGB) blinking, which means the Nicla board is correctly installed and functional! + +### Testing the Microphone + +On Arduino IDE, go to `Examples > PDM > PDMSerialPlotter`, open and run the sketch. Open the Plotter and see the audio representation from the microphone: + +![](images/imgs_niclav_sys/image9.png){fig-align="center" width="6.5in"} + +> Vary the frequency of the sound you generate and confirm that the mic is working correctly. + +### Testing the IMU + +Before testing the IMU, it will be necessary to install the LSM6DSOX library. For that, go to Library Manager and look for LSM6DSOX. Install the library provided by Arduino: + +![](images/imgs_niclav_sys/image19.jpg){fig-align="center" width="6.5in"} + +Next, go to `Examples > Arduino_LSM6DSOX > SimpleAccelerometer` and run the accelerometer test (you can also run Gyro and board temperature): + +![](images/imgs_niclav_sys/image28.png){fig-align="center" width="6.5in"} + +### Testing the ToF (Time of Flight) Sensor + +As we did with IMU, it is necessary to install the VL53L1X ToF library. For that, go to Library Manager and look for VL53L1X. Install the library provided by Pololu: + +![](images/imgs_niclav_sys/image15.jpg){fig-align="center" width="6.5in"} + +Next, run the sketch [proximity_detection.ino](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/distance_image_meter.py): + +![](images/imgs_niclav_sys/image12.png){fig-align="center" width="6.5in"} + +On the Serial Monitor, you will see the distance from the camera to an object in front of it (max of 4m). + +![](images/imgs_niclav_sys/image13.jpg){fig-align="center" width="6.5in"} + +### Testing the Camera + +We can also test the camera using, for example, the code provided on `Examples > Camera > CameraCaptureRawBytes`. We cannot see the image directly, but it is possible to get the raw image data generated by the camera. + +Anyway, the best test with the camera is to see a live image. For that, we will use another IDE, the OpenMV. + +## Installing the OpenMV IDE + +OpenMV IDE is the premier integrated development environment with OpenMV Cameras like the one on the Nicla Vision. It features a powerful text editor, debug terminal, and frame buffer viewer with a histogram display. We will use MicroPython to program the camera. + +Go to the [OpenMV IDE page](https://openmv.io/pages/download), download the correct version for your Operating System, and follow the instructions for its installation on your computer. 
+
+![](images/imgs_niclav_sys/image21.png){fig-align="center" width="6.5in"}
+
+The IDE should open, defaulting to the helloworld_1.py code in its Code Area. If not, you can open it from `Files > Examples > HelloWorld > helloworld.py`.
+
+![](images/imgs_niclav_sys/image7.png){fig-align="center" width="6.5in"}
+
+Any messages sent through a serial connection (using print() or error messages) will be displayed on the **Serial Terminal** during run time. The image captured by the camera will be displayed in the **Camera Viewer** area (or Frame Buffer) and in the Histogram area, immediately below the Camera Viewer.
+
+> Before connecting the Nicla to the OpenMV IDE, ensure you have the latest bootloader version. Go to your Arduino IDE, select the Nicla board, and open the sketch on `Examples > STM_32H747_System > STM_32H747_updateBootloader`. Upload the code to your board. The Serial Monitor will guide you.
+
+After updating the bootloader, put the Nicla Vision in bootloader mode by double-pressing the reset button on the board. The built-in green LED will start fading in and out. Now return to the OpenMV IDE and click on the connect icon (Left ToolBar):
+
+![](images/imgs_niclav_sys/image23.jpg){fig-align="center" width="4.010416666666667in"}
+
+A pop-up will tell you that a board in DFU mode was detected and ask how you would like to proceed. First, select `Install the latest release firmware (vX.Y.Z)`. This action will install the latest OpenMV firmware on the Nicla Vision.
+
+![](images/imgs_niclav_sys/image10.png){fig-align="center" width="6.5in"}
+
+You can leave the option `Erase internal file system` unselected and click `[OK]`.
+
+Nicla's green LED will start flashing while the OpenMV firmware is uploaded to the board, and a terminal window will then open, showing the flashing progress.
+
+![](images/imgs_niclav_sys/image5.png){fig-align="center" width="4.854166666666667in"}
+
+Wait until the green LED stops flashing and fading. When the process ends, you will see a message saying, "DFU firmware update complete!". Press `[OK]`.
+
+![](images/imgs_niclav_sys/image1.png){fig-align="center" width="3.875in"}
+
+A green play button appears on the Tool Bar when the Nicla Vision connects.
+
+![](images/imgs_niclav_sys/image18.jpg){fig-align="center" width="4.791666666666667in"}
+
+Also, note that a drive named "NO NAME" will appear on your computer:
+
+![](images/imgs_niclav_sys/image3.png){fig-align="center" width="6.447916666666667in"}
+
+Every time you press the `[RESET]` button on the board, it automatically executes the *main.py* script stored on it. You can load the [main.py](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/main.py) code on the IDE (`File > Open File...`).
+
+![](images/imgs_niclav_sys/image16.png){fig-align="center" width="4.239583333333333in"}
+
+> This code is the "Blink" code, confirming that the HW is OK.
+
+For testing the camera, let's run *helloworld_1.py*. For that, select the script on `File > Examples > HelloWorld > helloworld.py`.
+
+When clicking the green play button, the MicroPython script (*helloworld.py*) in the Code Area will be uploaded and run on the Nicla Vision. On the Camera Viewer, you will start to see the video streaming.
The Serial Monitor will show us the FPS (Frames per second), which should be around 14fps. + +![](images/imgs_niclav_sys/image6.png){fig-align="center" width="6.5in"} + +Here is the [helloworld.py](http://helloworld.py/) script: + +``` python +# Hello World Example 2 +# +# Welcome to the OpenMV IDE! Click on the green run arrow button below to run the script! + +import sensor, image, time + +sensor.reset() # Reset and initialize the sensor. +sensor.set_pixformat(sensor.RGB565) # Set pixel format to RGB565 (or GRAYSCALE) +sensor.set_framesize(sensor.QVGA) # Set frame size to QVGA (320x240) +sensor.skip_frames(time = 2000) # Wait for settings take effect. +clock = time.clock() # Create a clock object to track the FPS. + +while(True): + clock.tick() # Update the FPS clock. + img = sensor.snapshot() # Take a picture and return the image. + print(clock.fps()) +``` + +In [GitHub](https://github.com/Mjrovai/Arduino_Nicla_Vision), you can find the Python scripts used here. + +The code can be split into two parts: + +- **Setup**: Where the libraries are imported, initialized and the variables are defined and initiated. + +- **Loop**: (while loop) part of the code that runs continually. The image (*img* variable) is captured (one frame). Each of those frames can be used for inference in Machine Learning Applications. + +To interrupt the program execution, press the red `[X]` button. + +> Note: OpenMV Cam runs about half as fast when connected to the IDE. The FPS should increase once disconnected. + +In the [GitHub](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main/Micropython), You can find other Python scripts. Try to test the onboard sensors. + +## Connecting the Nicla Vision to Edge Impulse Studio + +We will need the Edge Impulse Studio later in other exercises. [Edge Impulse](https://www.edgeimpulse.com/) is a leading development platform for machine learning on edge devices. + +Edge Impulse officially supports the Nicla Vision. So, for starting, please create a new project on the Studio and connect the Nicla to it. For that, follow the steps: + +- Download the most updated [EI Firmware](https://cdn.edgeimpulse.com/firmware/arduino-nicla-vision.zip) and unzip it. + +- Open the zip file on your computer and select the uploader corresponding to your OS: + +![](images/imgs_niclav_sys/image17.png){fig-align="center" width="4.416666666666667in"} + +- Put the Nicla-Vision on Boot Mode, pressing the reset button twice. + +- Execute the specific batch code for your OS for uploading the binary *arduino-nicla-vision.bin* to your board. + +Go to your project on the Studio, and on the `Data Acquisition tab`, select `WebUSB` (1). A window will pop up; choose the option that shows that the `Nicla is paired` (2) and press `[Connect]` (3). + +![](images/imgs_niclav_sys/image27.png){fig-align="center" width="6.5in"} + +In the *Collect Data* section on the `Data Acquisition` tab, you can choose which sensor data to pick. + +![](images/imgs_niclav_sys/image25.png){fig-align="center" width="6.5in"} + +For example. `IMU data`: + +![](images/imgs_niclav_sys/image8.png){fig-align="center" width="6.5in"} + +Or Image (`Camera`): + +![](images/imgs_niclav_sys/image4.png){fig-align="center" width="6.5in"} + +And so on. You can also test an external sensor connected to the `ADC` (Nicla pin 0) and the other onboard sensors, such as the microphone and the ToF. 
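+
+If you prefer to log raw images directly from MicroPython (for example, to build a small image dataset offline and upload it to the Studio later), a minimal sketch like the one below can be used. This is only an illustrative example: the file names, number of images, and capture interval are arbitrary choices, not part of the official workflow.
+
+``` python
+# Simple snapshot logger - saves a few JPEG images to the board's filesystem
+import sensor, time
+
+sensor.reset()                       # Reset and initialize the sensor
+sensor.set_pixformat(sensor.RGB565)  # Color images
+sensor.set_framesize(sensor.QVGA)    # 320x240, the size used later for datasets
+sensor.skip_frames(time=2000)        # Let the camera settle
+
+clock = time.clock()
+for i in range(20):                  # capture 20 images, one per second
+    clock.tick()
+    img = sensor.snapshot()
+    img.save("sample_%03d.jpg" % i)  # stored on the board's flash filesystem
+    print("saved sample_%03d.jpg (%.1f fps)" % (i, clock.fps()))
+    time.sleep_ms(1000)
+```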
+
+## Expanding the Nicla Vision Board (optional)
+
+One last item worth exploring: during prototyping, it is often essential to experiment with external sensors and devices, and an excellent expansion for the Nicla is the [Arduino MKR Connector Carrier (Grove compatible)](https://store-usa.arduino.cc/products/arduino-mkr-connector-carrier-grove-compatible).
+
+The shield has 14 Grove connectors: five single analog inputs (A0-A4), one double analog input (A5/A6), five single digital I/Os (D0-D4), one double digital I/O (D5/D6), one I2C (TWI), and one UART (Serial). All connectors are 5V compatible.
+
+> Note that all 17 Nicla Vision pins will be connected to the Shield Groves, but some Grove connections remain disconnected.
+
+![](images/imgs_niclav_sys/image20.jpg){fig-align="center" width="6.5in"}
+
+This shield is MKR compatible and can be used with the Nicla Vision and the Portenta.
+
+![](images/imgs_niclav_sys/image26.jpg){fig-align="center" width="4.34375in"}
+
+For example, suppose that on a TinyML project you want to send inference results using a LoRaWAN device and add information about local luminosity. Often, with offline operations, a local low-power display such as an OLED is advised. This setup can be seen here:
+
+![](images/imgs_niclav_sys/image11.jpg){fig-align="center" width="6.5in"}
+
+The [Grove Light Sensor](https://wiki.seeedstudio.com/Grove-Light_Sensor/) would be connected to one of the single Analog pins (A0/PC4), the [LoRaWAN device](https://wiki.seeedstudio.com/Grove_LoRa_E5_New_Version/) to the UART, and the [OLED](https://arduino.cl/producto/display-oled-grove/) to the I2C connector.
+
+The Nicla Pins 3 (Tx) and 4 (Rx) are connected to the shield's Serial connector. The UART communication is used with the LoRaWAN device. Here is a simple script to test the UART:
+
+``` python
+# UART Test - By: marcelo_rovai - Sat Sep 23 2023
+
+import time
+from pyb import UART
+from pyb import LED
+
+redLED = LED(1) # built-in red LED
+
+# Init UART object.
+# Nicla Vision's UART (TX/RX pins) is on "LP1"
+uart = UART("LP1", 9600)
+
+while(True):
+    uart.write("Hello World!\r\n")
+    redLED.toggle()
+    time.sleep_ms(1000)
+```
+
+To verify that the UART is working, you can, for example, connect another device, such as an Arduino UNO, and display the "Hello World!" messages on its Serial Monitor. Here is the [code](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Arduino-IDE/teste_uart_UNO/teste_uart_UNO.ino).
+
+![](images/imgs_niclav_sys/image24.jpg){fig-align="center" width="2.8125in"}
+
+Below is the *Hello World code* to be used with the I2C OLED. The MicroPython SSD1306 OLED driver (ssd1306.py), created by Adafruit, should also be uploaded to the Nicla (the ssd1306.py script can be found in [GitHub](https://github.com/Mjrovai/Arduino_Nicla_Vision/blob/main/Micropython/ssd1306.py)).
+ +``` python +# Nicla_OLED_Hello_World - By: marcelo_rovai - Sat Sep 30 2023 + +#Save on device: MicroPython SSD1306 OLED driver, I2C and SPI interfaces created by Adafruit +import ssd1306 + +from machine import I2C +i2c = I2C(1) + +oled_width = 128 +oled_height = 64 +oled = ssd1306.SSD1306_I2C(oled_width, oled_height, i2c) + +oled.text('Hello, World', 10, 10) +oled.show() +``` + +Finally, here is a simple script to read the ADC value on pin "PC4" (Nicla pin A0): + +``` python + +# Light Sensor (A0) - By: marcelo_rovai - Wed Oct 4 2023 + +import pyb +from time import sleep + +adc = pyb.ADC(pyb.Pin("PC4")) # create an analog object from a pin +val = adc.read() # read an analog value + +while (True): + + val = adc.read() + print ("Light={}".format (val)) + sleep (1) +``` + +The ADC can be used for other sensor variables, such as [Temperature](https://wiki.seeedstudio.com/Grove-Temperature_Sensor_V1.2/). + +> Note that the above scripts ([[downloaded from Github]{.underline}](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main/Micropython)) introduce only how to connect external devices with the Nicla Vision board using MicroPython. + +## Conclusion + +The Arduino Nicla Vision is an excellent *tiny device* for industrial and professional uses! However, it is powerful, trustworthy, low power, and has suitable sensors for the most common embedded machine learning applications such as vision, movement, sensor fusion, and sound. + +> On the [GitHub repository,](https://github.com/Mjrovai/Arduino_Nicla_Vision/tree/main) you will find the last version of all the codes used or commented on in this hands-on exercise. diff --git a/object_detection_fomo.qmd b/object_detection_fomo.qmd new file mode 100644 index 00000000..1e38750f --- /dev/null +++ b/object_detection_fomo.qmd @@ -0,0 +1,307 @@ +# Object Detection {.unnumbered} + +## Introduction + +This is a continuation of **CV on Nicla Vision**, now exploring **Object Detection** on microcontrollers. + +![](images/imgs_object_detection_fomo/cv_obj_detect.jpg){fig-align="center" width="6.5in"} + +### Object Detection versus Image Classification + +The main task with Image Classification models is to produce a list of the most probable object categories present on an image, for example, to identify a tabby cat just after his dinner: + +![](images/imgs_object_detection_fomo/img_1.png){fig-align="center"} + +But what happens when the cat jumps near the wine glass? The model still only recognizes the predominant category on the image, the tabby cat: + +![](images/imgs_object_detection_fomo/img_2.png){fig-align="center"} + +And what happens if there is not a dominant category on the image? + +![](images/imgs_object_detection_fomo/img_3.png){fig-align="center"} + +The model identifies the above image completely wrong as an "ashcan," possibly due to the color tonalities. + +> The model used in all previous examples is the *MobileNet*, trained with a large dataset, the *ImageNet*. + +To solve this issue, we need another type of model, where not only **multiple categories** (or labels) can be found but also **where** the objects are located on a given image. + +As we can imagine, such models are much more complicated and bigger, for example, the **MobileNetV2 SSD FPN-Lite 320x320, trained with the COCO dataset.** This pre-trained object detection model is designed to locate up to 10 objects within an image, outputting a bounding box for each object detected. 
The below image is the result of such a model running on a Raspberry Pi: + +![](images/imgs_object_detection_fomo/img_4.png){fig-align="center" width="6.5in"} + +Those models used for Object detection (such as the MobileNet SSD or YOLO) usually have several MB in size, which is OK for use with Raspberry Pi but unsuitable for use with embedded devices, where the RAM usually is lower than 1M Bytes. + +### An innovative solution for Object Detection: FOMO + +[Edge Impulse launched in 2022, **FOMO** (Faster Objects, More Objects)](https://docs.edgeimpulse.com/docs/edge-impulse-studio/learning-blocks/object-detection/fomo-object-detection-for-constrained-devices), a novel solution to perform object detection on embedded devices, not only on the Nicla Vision (Cortex M7) but also on Cortex M4F CPUs (Arduino Nano33 and OpenMV M4 series) as well the Espressif ESP32 devices (ESP-CAM and XIAO ESP32S3 Sense). + +In this Hands-On exercise, we will explore using FOMO with Object Detection, not entering many details about the model itself. To understand more about how the model works, you can go into the [official FOMO announcement](https://www.edgeimpulse.com/blog/announcing-fomo-faster-objects-more-objects) by Edge Impulse, where Louis Moreau and Mat Kelcey explain in detail how it works. + +## The Object Detection Project Goal + +All Machine Learning projects need to start with a detailed goal. Let's assume we are in an industrial facility and must sort and count **wheels** and special **boxes**. + +![](images/imgs_object_detection_fomo/proj_goal.jpg){fig-align="center" width="6.5in"} + +In other words, we should perform a multi-label classification, where each image can have three classes: + +- Background (No objects) + +- Box + +- Wheel + +Here are some not labeled image samples that we should use to detect the objects (wheels and boxes): + +![](images/imgs_object_detection_fomo/samples.jpg){fig-align="center" width="6.5in"} + +We are interested in which object is in the image, its location (centroid), and how many we can find on it. The object's size is not detected with FOMO, as with MobileNet SSD or YOLO, where the Bounding Box is one of the model outputs. + +We will develop the project using the Nicla Vision for image capture and model inference. The ML project will be developed using the Edge Impulse Studio. But before starting the object detection project in the Studio, let's create a *raw dataset* (not labeled) with images that contain the objects to be detected. + +## Data Collection + +We can use the Edge Impulse Studio, the OpenMV IDE, your phone, or other devices for the image capture. Here, we will use again the OpenMV IDE for our purpose. + +### Collecting Dataset with OpenMV IDE + +First, create in your computer a folder where your data will be saved, for example, "data." Next, on the OpenMV IDE, go to Tools \> Dataset Editor and select New Dataset to start the dataset collection: + +![](images/imgs_object_detection_fomo/data_folder.jpg){fig-align="center" width="6.5in"} + +Edge impulse suggests that the objects should be of similar size and not overlapping for better performance. This is OK in an industrial facility, where the camera should be fixed, keeping the same distance from the objects to be detected. Despite that, we will also try with mixed sizes and positions to see the result. + +> We will not create separate folders for our images because each contains multiple labels. + +Connect the Nicla Vision to the OpenMV IDE and run the `dataset_capture_script.py`. 
Clicking on the Capture Image button will start capturing images: + +![](images/imgs_object_detection_fomo/img_5.jpg){fig-align="center" width="6.5in"} + +We suggest around 50 images mixing the objects and varying the number of each appearing on the scene. Try to capture different angles, backgrounds, and light conditions. + +> The stored images use a QVGA frame size 320x240 and RGB565 (color pixel format). + +After capturing your dataset, close the Dataset Editor Tool on the `Tools > Dataset Editor`. + +## Edge Impulse Studio + +### Setup the project + +Go to [Edge Impulse Studio,](https://www.edgeimpulse.com/) enter your credentials at **Login** (or create an account), and start a new project. + +![](images/imgs_object_detection_fomo/img_6.png){fig-align="center" width="6.5in"} + +> Here, you can clone the project developed for this hands-on: [NICLA_Vision_Object_Detection](https://studio.edgeimpulse.com/public/292737/latest). + +On your Project Dashboard, go down and on **Project info** and select **Bounding boxes (object detection)** and Nicla Vision as your Target Device: + +![](images/imgs_object_detection_fomo/img_7.png){fig-align="center" width="6.5in"} + +### Uploading the unlabeled data + +On Studio, go to the `Data acquisition` tab, and on the `UPLOAD DATA` section, upload from your computer files captured. + +![](images/imgs_object_detection_fomo/img_8.png){fig-align="center" width="6.5in"} + +> You can leave for the Studio to split your data automatically between Train and Test or do it manually. + +![](images/imgs_object_detection_fomo/img_9.png){fig-align="center" width="6.5in"} + +All the not labeled images (51) were uploaded but they still need to be labeled appropriately before using them as a dataset in the project. The Studio has a tool for that purpose, which you can find in the link `Labeling queue (51)`. + +There are two ways you can use to perform AI-assisted labeling on the Edge Impulse Studio (free version): + +- Using yolov5 +- Tracking objects between frames + +> Edge Impulse launched an [auto-labeling feature](https://docs.edgeimpulse.com/docs/edge-impulse-studio/data-acquisition/auto-labeler) for Enterprise customers, easing labeling tasks in object detection projects. + +Ordinary objects can quickly be identified and labeled using an existing library of pre-trained object detection models from YOLOv5 (trained with the COCO dataset). But since, in our case, the objects are not part of COCO datasets, we should select the option of `tracking objects`. With this option, once you draw bounding boxes and label the images in one frame, the objects will be tracked automatically from frame to frame, *partially* labeling the new ones (not all are correctly labeled). + +> You can use the [EI uploader](https://docs.edgeimpulse.com/docs/tools/edge-impulse-cli/cli-uploader#bounding-boxes) to import your data if you already have a labeled dataset containing bounding boxes. + +### Labeling the Dataset + +Starting with the first image of your unlabeled data, use your mouse to drag a box around an object to add a label. Then click **Save labels** to advance to the next item. + +![](images/imgs_object_detection_fomo/img_10.png){fig-align="center" width="6.5in"} + +Continue with this process until the queue is empty. At the end, all images should have the objects labeled as those samples below: + +![](images/imgs_object_detection_fomo/img_11.jpg){fig-align="center" width="6.5in"} + +Next, review the labeled samples on the `Data acquisition` tab. 
If one of the labels is wrong, you can edit it using the *`three dots`* menu after the sample name:
+
+![](images/imgs_object_detection_fomo/img_12.png){fig-align="center" width="6.5in"}
+
+You will be guided to replace the wrong label, correcting the dataset.
+
+![](images/imgs_object_detection_fomo/img_13.jpg){fig-align="center" width="6.5in"}
+
+## The Impulse Design
+
+In this phase, you should define how to:
+
+- **Pre-process** the data, which consists of resizing the individual images from `320 x 240` to `96 x 96` and squashing them (squared form, without cropping). Afterwards, the images are converted from RGB to Grayscale.
+
+- **Design a Model**, in this case, "Object Detection."
+
+![](images/imgs_object_detection_fomo/img_14.png){fig-align="center" width="6.5in"}
+
+### Preprocessing all the dataset
+
+In this section, select **Color depth** as `Grayscale`, which is suitable for use with FOMO models, and `Save parameters`.
+
+![](images/imgs_object_detection_fomo/img_15.png){fig-align="center" width="6.5in"}
+
+The Studio moves automatically to the next section, `Generate features`, where all samples will be pre-processed, resulting in a dataset with individual 96x96x1 images, or 9,216 features.
+
+![](images/imgs_object_detection_fomo/img_16.png){fig-align="center" width="6.5in"}
+
+The feature explorer shows that all samples present a good separation after the feature generation.
+
+> One of the samples (46) is apparently in the wrong space, but clicking on it confirms that the labeling is correct.
+
+## Model Design, Training, and Test
+
+We will use FOMO, an object detection model based on MobileNetV2 (alpha 0.35), designed to coarsely segment an image into a grid of **background** vs **objects of interest** (here, *boxes* and *wheels*).
+
+FOMO is an innovative machine learning model for object detection, which can use up to 30 times less energy and memory than traditional models like MobileNet SSD and YOLOv5. FOMO can operate on microcontrollers with less than 200 KB of RAM. The main reason this is possible is that while other models calculate the object's size by drawing a square around it (bounding box), FOMO ignores the size of the object, providing only the information about where the object is located in the image, by means of its centroid coordinates.
+
+**How does FOMO work?**
+
+FOMO takes the image in grayscale and divides it into blocks of pixels using a factor of 8. For a 96x96 input, the grid would be 12x12 (96/8 = 12). Next, FOMO runs a classifier through each pixel block to calculate the probability that there is a box or a wheel in each of them and, subsequently, determines the regions that have the highest probability of containing the object (if a pixel block has no objects, it is classified as *background*). From the overlap of the final region, FOMO provides the coordinates (relative to the image dimensions) of the centroid of this region. A simplified sketch of this idea is shown right after the training hyperparameters below.
+
+![](images/imgs_object_detection_fomo/img_17.png){fig-align="center" width="6.5in"}
+
+For training, we should select a pre-trained model. Let's use the **`FOMO (Faster Objects, More Objects) MobileNetV2 0.35`** model. It uses around 250 KB of RAM and 80 KB of ROM (Flash), which suits our board well, since it has 1 MB of RAM and 2 MB of internal Flash.
+
+![](images/imgs_object_detection_fomo/img_18.png){fig-align="center" width="6.5in"}
+
+Regarding the training hyper-parameters, the model will be trained with:
+
+- Epochs: 60
+- Batch size: 32
+- Learning Rate: 0.001
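+
+To make the per-cell classification idea more concrete, below is a minimal, *illustrative* Keras sketch of a FOMO-style network (this is not Edge Impulse's actual implementation; the cut-point layer name, head width, and loss handling are assumptions made for demonstration). The backbone is MobileNetV2 with alpha 0.35, truncated at a layer with 1/8 of the input resolution, so a 96x96 grayscale input produces a 12x12 grid; a 1x1 convolution then classifies each grid cell as background, box, or wheel:
+
+``` python
+# Illustrative FOMO-style model: MobileNetV2 (alpha=0.35) backbone cut at 1/8
+# resolution plus a per-cell classifier head. Layer names and training details
+# are simplifications, not the Edge Impulse source code.
+import numpy as np
+import tensorflow as tf
+
+NUM_CLASSES = 3  # background, box, wheel
+
+def build_fomo_like_model(input_shape=(96, 96, 1), num_classes=NUM_CLASSES):
+    backbone = tf.keras.applications.MobileNetV2(
+        input_shape=input_shape, alpha=0.35,
+        include_top=False, weights=None)  # trained from scratch (1-channel input)
+    # 'block_6_expand_relu' yields a 12x12 feature map for a 96x96 input (1/8 resolution)
+    cut = backbone.get_layer("block_6_expand_relu").output
+    x = tf.keras.layers.Conv2D(32, 1, activation="relu")(cut)
+    # One softmax per grid cell: output shape (12, 12, num_classes)
+    out = tf.keras.layers.Conv2D(num_classes, 1, activation="softmax")(x)
+    return tf.keras.Model(backbone.input, out)
+
+model = build_fomo_like_model()
+model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
+              loss="categorical_crossentropy")  # labels: one-hot grids, shape (12, 12, 3)
+
+# At inference time, cells above a confidence threshold are grouped and their
+# centroids reported; a crude version of that idea:
+probs = model.predict(np.zeros((1, 96, 96, 1), dtype="float32"))[0]  # (12, 12, 3)
+object_cells = np.argwhere(probs[:, :, 1:].max(axis=-1) > 0.5)       # non-background cells
+print("grid cells above threshold:", object_cells)
+```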
+
+For training, we should select a pre-trained model. Let's use the **`FOMO (Faster Objects, More Objects) MobileNetV2 0.35`**. This model uses around 250 KB of RAM and 80 KB of ROM (Flash), which suits our board well, since it has 1 MB of RAM and ROM.
+
+![](images/imgs_object_detection_fomo/img_18.png){fig-align="center" width="6.5in"}
+
+Regarding the training hyper-parameters, the model will be trained with:
+
+- Epochs: 60
+- Batch size: 32
+- Learning rate: 0.001
+
+For validation during training, 20% of the dataset (*validation_dataset*) will be set aside. For the remaining 80% (*train_dataset*), we will apply Data Augmentation, which randomly flips the images, changes their size and brightness, and crops them, artificially increasing the number of samples in the training dataset.
+
+As a result, the model ends with an F1 score of practically 1.00, with a similar result when using the Test data.
+
+> Note that FOMO automatically added a third label, *background*, to the two previously defined (*box* and *wheel*).
+
+![](images/imgs_object_detection_fomo/img_19.png){fig-align="center" width="6.5in"}
+
+> In object detection tasks, accuracy is generally not the primary [evaluation metric](https://learnopencv.com/mean-average-precision-map-object-detection-model-evaluation-metric/). Object detection involves classifying objects and providing bounding boxes around them, making it a more complex problem than simple classification. The issue here is that we do not have bounding boxes, only centroids. In short, using accuracy as a metric could be misleading and may not provide a complete understanding of how well the model is performing. Because of that, we will use the F1 score.
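+
+For reference, the F1 score is the harmonic mean of precision and recall. A quick sketch of the arithmetic, with made-up numbers rather than the Studio's output:
+
+``` python
+# F1 score = harmonic mean of precision and recall (illustrative numbers only).
+precision = 0.98  # fraction of predicted centroids that match a labeled object
+recall = 0.97     # fraction of labeled objects that were detected
+f1 = 2 * precision * recall / (precision + recall)
+print(round(f1, 3))  # 0.975
+```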
+
+### Test model with "Live Classification"
+
+Since Edge Impulse officially supports the Nicla Vision, let's connect it to the Studio. For that, follow these steps:
+
+- Download the [latest EI Firmware](https://cdn.edgeimpulse.com/firmware/arduino-nicla-vision.zip) and unzip it.
+
+- Open the unzipped folder on your computer and select the uploader corresponding to your OS:
+
+![](images_2/media/image17.png){fig-align="center" width="4.416666666666667in"}
+
+- Put the Nicla Vision in Boot Mode by pressing the reset button twice.
+
+- Execute the specific script for your OS to upload the binary (`arduino-nicla-vision.bin`) to your board.
+
+Go to the `Live classification` section in EI Studio, and, using *WebUSB*, connect your Nicla Vision:
+
+![](images/imgs_object_detection_fomo/img_20.png){fig-align="center" width="6.5in"}
+
+Once connected, you can use the Nicla to capture actual images to be tested by the trained model on Edge Impulse Studio.
+
+![](images/imgs_object_detection_fomo/img_21.png){fig-align="center" width="6.5in"}
+
+Note that the model can produce false positives and negatives. These can be minimized by defining a proper `Confidence Threshold` (use the `Three dots` menu for the set-up). Try 0.8 or higher.
+
+## Deploying the Model
+
+Select OpenMV Firmware on the Deploy Tab and press \[Build\].
+
+![](images/imgs_object_detection_fomo/img_22.png){fig-align="center" width="6.5in"}
+
+When you try to connect the Nicla to the OpenMV IDE again, it will try to update its firmware. Choose the `Load a specific firmware` option instead.
+
+![](images/imgs_object_detection_fomo/img_24.png){fig-align="center"}
+
+You will find a ZIP file on your computer from the Studio. Open it:
+
+![](images/imgs_object_detection_fomo/img_23.png){fig-align="center" width="6.5in"}
+
+Load the `.bin` file to your board:
+
+![](images/imgs_object_detection_fomo/img_25.png){fig-align="center" width="6.5in"}
+
+After the download is finished, a pop-up message will be displayed. Press `OK`, and open the script **ei_object_detection.py** downloaded from the Studio.
+
+Before running the script, let's change a few lines. Note that you can leave the window definition as 240 x 240 and the camera capturing images as QVGA/RGB. The captured image will be pre-processed by the firmware deployed from Edge Impulse:
+
+``` python
+# Edge Impulse - OpenMV Object Detection Example
+
+import sensor, image, time, os, tf, math, uos, gc
+
+sensor.reset()                       # Reset and initialize the sensor.
+sensor.set_pixformat(sensor.RGB565)  # Set pixel format to RGB565 (or GRAYSCALE)
+sensor.set_framesize(sensor.QVGA)    # Set frame size to QVGA (320x240)
+sensor.set_windowing((240, 240))     # Set 240x240 window.
+sensor.skip_frames(time=2000)        # Let the camera adjust.
+
+net = None
+labels = None
+```
+
+Redefine the minimum confidence, for example, to 0.8 to minimize false positives and negatives.
+
+``` python
+min_confidence = 0.8
+```
+
+If necessary, change the colors of the circles that will be used to display the detected objects' centroids, for better contrast.
+
+``` python
+try:
+    # Load the built-in model
+    labels, net = tf.load_builtin_model('trained')
+except Exception as e:
+    raise Exception(e)
+
+colors = [ # Add more colors if you are detecting more than 7 types of classes at once.
+    (255, 255,   0), # background: yellow (not used)
+    (  0, 255,   0), # box: green
+    (255,   0,   0), # wheel: red
+    (  0,   0, 255), # not used
+    (255,   0, 255), # not used
+    (  0, 255, 255), # not used
+    (255, 255, 255), # not used
+]
+```
+
+Keep the remaining code as it is and press the `green Play button` to run the code:
+
+![](images/imgs_object_detection_fomo/img_26.png){fig-align="center" width="6.5in"}
+
+In the camera view, we can see the objects with their centroids marked by fixed 12-pixel circles (each circle has a distinct color, depending on its class). On the Serial Terminal, the model shows the labels detected and their position in the image window (240x240).
+
+> Be aware that the coordinate origin is in the upper left corner.
+
+![](images/imgs_object_detection_fomo/img_27.jpg){fig-align="center" width="624"}
+
+Note that the frame rate is around 8 fps (similar to what we got with the Image Classification project). This happens because FOMO is cleverly built on top of a CNN classifier, not an object detection model like SSD MobileNet. For example, when running a MobileNetV2 SSD FPN-Lite 320x320 model on a Raspberry Pi 4, the latency is around 5 times higher (around 1.5 fps).
+
+Here is a short video showing the inference results: {{< video https://youtu.be/JbpoqRp3BbM width="480" height="270" center >}}
+
+## Conclusion
+
+FOMO is a significant leap in the image processing space, as Louis Moreau and Mat Kelcey put it during its launch in 2022:
+
+> FOMO is a ground-breaking algorithm that brings real-time object detection, tracking, and counting to microcontrollers for the first time.
+
+Multiple possibilities exist for exploring object detection (and, more precisely, object counting) on embedded devices. One idea is to use the Nicla for sensor fusion (camera + microphone) combined with object detection, which can be very useful in projects involving bees, for example.
+
+![](images/imgs_object_detection_fomo/img_28.jpg){fig-align="center" width="624"}
diff --git a/ondevice_learning.qmd b/ondevice_learning.qmd
index cc587117..4953b24a 100644
--- a/ondevice_learning.qmd
+++ b/ondevice_learning.qmd
@@ -1,6 +1,6 @@
 # On-Device Learning
 
-::: {.callout-tip collapse="true"}
+::: {.callout-tip}
 ## Learning Objectives
 
 * coming soon.
diff --git a/ops.qmd b/ops.qmd
index 9b60f7df..63a4f402 100644
--- a/ops.qmd
+++ b/ops.qmd
@@ -1,6 +1,6 @@
 # Embedded AIOps
 
-::: {.callout-tip collapse="true"}
+::: {.callout-tip}
 ## Learning Objectives
 
 * coming soon.
diff --git a/optimizations.qmd b/optimizations.qmd
index e0c112f4..9b457c3d 100644
--- a/optimizations.qmd
+++ b/optimizations.qmd
@@ -1,5 +1,12 @@
 # Model Optimizations
 
+::: {.callout-tip}
+## Learning Objectives
+
+* coming soon.
+
+:::
+
 ## Introduction
 
 When machine learning models are deployed on systems, especially on resource-constrained embedded systems, the optimization of models is a necessity. While machine learning inherently often demands substantial computational resources, the systems are inherently limited in memory, processing power, and energy. This chapter will dive into the art and science of optimizing machine learning models to ensure they are lightweight, efficient, and effective when deployed in TinyML scenarios.
diff --git a/privacy_security.qmd b/privacy_security.qmd
index 03756d87..f11980c8 100644
--- a/privacy_security.qmd
+++ b/privacy_security.qmd
@@ -1,6 +1,6 @@
 # Privacy and Security
 
-::: {.callout-tip collapse="true"}
+::: {.callout-tip}
 ## Learning Objectives
 
 * coming soon.
diff --git a/references.bib b/references.bib
index 0fdbfede..a25feb47 100644
--- a/references.bib
+++ b/references.bib
@@ -1,3 +1,137 @@
+@inproceedings{lin2014microsoft,
+  title={Microsoft coco: Common objects in context},
+  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
+  booktitle={Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13},
+  pages={740--755},
+  year={2014},
+  organization={Springer}
+}
+
+@article{banbury2020benchmarking,
+  title={Benchmarking tinyml systems: Challenges and direction},
+  author={Banbury, Colby R and Reddi, Vijay Janapa and Lam, Max and Fu, William and Fazel, Amin and Holleman, Jeremy and Huang, Xinyuan and Hurtado, Robert and Kanter, David and Lokhmotov, Anton and others},
+  journal={arXiv preprint arXiv:2003.04821},
+  year={2020}
+}
+
+@inproceedings{hendrycks2021natural,
+  title={Natural adversarial examples},
+  author={Hendrycks, Dan and Zhao, Kevin and Basart, Steven and Steinhardt, Jacob and Song, Dawn},
+  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+  pages={15262--15271},
+  year={2021}
+}
+
+@inproceedings{xie2020adversarial,
+  title={Adversarial examples improve image recognition},
+  author={Xie, Cihang and Tan, Mingxing and Gong, Boqing and Wang, Jiang and Yuille, Alan L and Le, Quoc V},
+  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
+  pages={819--828},
+  year={2020}
+}
+
+@inproceedings{koh2021wilds,
+  title={Wilds: A benchmark of in-the-wild distribution shifts},
+  author={Koh, Pang Wei and Sagawa, Shiori and Marklund, Henrik and Xie, Sang Michael and Zhang, Marvin and Balsubramani, Akshay and Hu, Weihua and Yasunaga, Michihiro and Phillips, Richard Lanas and Gao, Irena and others},
+  booktitle={International Conference on Machine Learning},
+  pages={5637--5664},
+  year={2021},
+  organization={PMLR}
+}
+
+@inproceedings{antol2015vqa,
+  title={Vqa: Visual question answering},
+  author={Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C Lawrence and Parikh, Devi},
+  booktitle={Proceedings of the IEEE international conference on computer vision},
+  pages={2425--2433},
+  year={2015}
+}
+
+@inproceedings{chu2021discovering,
+  title={Discovering multi-hardware mobile 
models via architecture search}, + author={Chu, Grace and Arikan, Okan and Bender, Gabriel and Wang, Weijun and Brighton, Achille and Kindermans, Pieter-Jan and Liu, Hanxiao and Akin, Berkin and Gupta, Suyog and Howard, Andrew}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={3022--3031}, + year={2021} +} + +@article{david2021tensorflow, + title={Tensorflow lite micro: Embedded machine learning for tinyml systems}, + author={David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, + journal={Proceedings of Machine Learning and Systems}, + volume={3}, + pages={800--811}, + year={2021} +} + +@article{warden2018speech, + title={Speech commands: A dataset for limited-vocabulary speech recognition}, + author={Warden, Pete}, + journal={arXiv preprint arXiv:1804.03209}, + year={2018} +} + +@inproceedings{adolf2016fathom, + title={Fathom: Reference workloads for modern deep learning methods}, + author={Adolf, Robert and Rama, Saketh and Reagen, Brandon and Wei, Gu-Yeon and Brooks, David}, + booktitle={2016 IEEE International Symposium on Workload Characterization (IISWC)}, + pages={1--10}, + year={2016}, + organization={IEEE} +} + +@article{coleman2017dawnbench, + title={Dawnbench: An end-to-end deep learning benchmark and competition}, + author={Coleman, Cody and Narayanan, Deepak and Kang, Daniel and Zhao, Tian and Zhang, Jian and Nardi, Luigi and Bailis, Peter and Olukotun, Kunle and R{\'e}, Chris and Zaharia, Matei}, + journal={Training}, + volume={100}, + number={101}, + pages={102}, + year={2017} +} + +@article{mattson2020mlperf, + title={Mlperf training benchmark}, + author={Mattson, Peter and Cheng, Christine and Diamos, Gregory and Coleman, Cody and Micikevicius, Paulius and Patterson, David and Tang, Hanlin and Wei, Gu-Yeon and Bailis, Peter and Bittorf, Victor and others}, + journal={Proceedings of Machine Learning and Systems}, + volume={2}, + pages={336--349}, + year={2020} +} + +@article{brown2020language, + title={Language models are few-shot learners}, + author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others}, + journal={Advances in neural information processing systems}, + volume={33}, + pages={1877--1901}, + year={2020} +} + +@article{devlin2018bert, + title={Bert: Pre-training of deep bidirectional transformers for language understanding}, + author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, + journal={arXiv preprint arXiv:1810.04805}, + year={2018} +} + +@inproceedings{ignatov2018ai, + title={Ai benchmark: Running deep neural networks on android smartphones}, + author={Ignatov, Andrey and Timofte, Radu and Chou, William and Wang, Ke and Wu, Max and Hartley, Tim and Van Gool, Luc}, + booktitle={Proceedings of the European Conference on Computer Vision (ECCV) Workshops}, + pages={0--0}, + year={2018} +} + +@inproceedings{reddi2020mlperf, + title={Mlperf inference benchmark}, + author={Reddi, Vijay Janapa and Cheng, Christine and Kanter, David and Mattson, Peter and Schmuelling, Guenther and Wu, Carole-Jean and Anderson, Brian and Breughe, Maximilien and Charlebois, Mark and Chou, William and others}, + booktitle={2020 ACM/IEEE 47th Annual International Symposium on Computer Architecture (ISCA)}, + 
pages={446--459}, + year={2020}, + organization={IEEE} +} + @misc{Thefutur92:online, author = {ARM.com}, title = {The future is being built on Arm: Market diversification continues to drive strong royalty and licensing growth as ecosystem reaches quarter of a trillion chips milestone – Arm®}, @@ -7,6 +141,153 @@ @misc{Thefutur92:online note = {(Accessed on 09/16/2023)} } +@inproceedings{deng2009imagenet, + title={Imagenet: A large-scale hierarchical image database}, + author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, + booktitle={2009 IEEE conference on computer vision and pattern recognition}, + pages={248--255}, + year={2009}, + organization={Ieee} +} + +@article{david2021tensorflow, + title={Tensorflow lite micro: Embedded machine learning for tinyml systems}, + author={David, Robert and Duke, Jared and Jain, Advait and Janapa Reddi, Vijay and Jeffries, Nat and Li, Jian and Kreeger, Nick and Nappier, Ian and Natraj, Meghna and Wang, Tiezhen and others}, + journal={Proceedings of Machine Learning and Systems}, + volume={3}, + pages={800--811}, + year={2021} +} + + +@article{al2016theano, + title={Theano: A Python framework for fast computation of mathematical expressions}, + author={Al-Rfou, Rami and Alain, Guillaume and Almahairi, Amjad and Angermueller, Christof and Bahdanau, Dzmitry and Ballas, Nicolas and Bastien, Fr{\'e}d{\'e}ric and Bayer, Justin and Belikov, Anatoly and Belopolsky, Alexander and others}, + journal={arXiv e-prints}, + pages={arXiv--1605}, + year={2016} +} + + + +@inproceedings{jia2014caffe, + title={Caffe: Convolutional architecture for fast feature embedding}, + author={Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and Darrell, Trevor}, + booktitle={Proceedings of the 22nd ACM international conference on Multimedia}, + pages={675--678}, + year={2014} +} + +@article{brown2020language, + title={Language models are few-shot learners}, + author={Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others}, + journal={Advances in neural information processing systems}, + volume={33}, + pages={1877--1901}, + year={2020} +} + +@inproceedings{he2016deep, + title={Deep residual learning for image recognition}, + author={He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, + booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, + pages={770--778}, + year={2016} +} + + +@article{krizhevsky2012imagenet, + title={Imagenet classification with deep convolutional neural networks}, + author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, + journal={Advances in neural information processing systems}, + volume={25}, + year={2012} +} + +@article{paszke2019pytorch, + title={Pytorch: An imperative style, high-performance deep learning library}, + author={Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others}, + journal={Advances in neural information processing systems}, + volume={32}, + year={2019} +} + +@inproceedings{seide2016cntk, + title={CNTK: Microsoft's open-source deep-learning toolkit}, + author={Seide, Frank and Agarwal, Amit}, + booktitle={Proceedings of the 22nd ACM SIGKDD international conference on 
knowledge discovery and data mining}, + pages={2135--2135}, + year={2016} +} + +@inproceedings{kung1979systolic, + title={Systolic arrays (for VLSI)}, + author={Kung, Hsiang Tsung and Leiserson, Charles E}, + booktitle={Sparse Matrix Proceedings 1978}, + volume={1}, + pages={256--282}, + year={1979}, + organization={Society for industrial and applied mathematics Philadelphia, PA, USA} +} + + +@article{li2014communication, + title={Communication efficient distributed machine learning with the parameter server}, + author={Li, Mu and Andersen, David G and Smola, Alexander J and Yu, Kai}, + journal={Advances in Neural Information Processing Systems}, + volume={27}, + year={2014} +} + +@inproceedings{abadi2016tensorflow, + title={$\{$TensorFlow$\}$: a system for $\{$Large-Scale$\}$ machine learning}, + author={Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and others}, + booktitle={12th USENIX symposium on operating systems design and implementation (OSDI 16)}, + pages={265--283}, + year={2016} +} + +@article{dean2012large, + title={Large scale distributed deep networks}, + author={Dean, Jeffrey and Corrado, Greg and Monga, Rajat and Chen, Kai and Devin, Matthieu and Mao, Mark and Ranzato, Marc'aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and others}, + journal={Advances in neural information processing systems}, + volume={25}, + year={2012} +} + +@inproceedings{tokui2015chainer, + title={Chainer: a next-generation open source framework for deep learning}, + author={Tokui, Seiya and Oono, Kenta and Hido, Shohei and Clayton, Justin}, + booktitle={Proceedings of workshop on machine learning systems (LearningSys) in the twenty-ninth annual conference on neural information processing systems (NIPS)}, + volume={5}, + pages={1--6}, + year={2015} +} + +@article{chollet2018keras, + title={Keras: The python deep learning library}, + author={Chollet, Fran{\c{c}}ois and others}, + journal={Astrophysics source code library}, + pages={ascl--1806}, + year={2018} +} + +@article{lai2018cmsis, + title={Cmsis-nn: Efficient neural network kernels for arm cortex-m cpus}, + author={Lai, Liangzhen and Suda, Naveen and Chandra, Vikas}, + journal={arXiv preprint arXiv:1801.06601}, + year={2018} +} + +@article{lin2020mcunet, + title={Mcunet: Tiny deep learning on iot devices}, + author={Lin, Ji and Chen, Wei-Ming and Lin, Yujun and Gan, Chuang and Han, Song and others}, + journal={Advances in Neural Information Processing Systems}, + volume={33}, + pages={11711--11722}, + year={2020} +} + @article{ramcharan2017deep, title={Deep learning for image-based cassava disease detection}, author={Ramcharan, Amanda and Baranowski, Kelsee and McCloskey, Peter and Ahmed, Babuali and Legg, James and Hughes, David P}, @@ -170,6 +451,26 @@ @inproceedings{jouppi2017datacenter year={2017} } +@misc{mcmahan2023communicationefficient, + title={Communication-Efficient Learning of Deep Networks from Decentralized Data}, + author={H. 
Brendan McMahan and Eider Moore and Daniel Ramage and Seth Hampson and Blaise Agüera y Arcas}, + year={2023}, + eprint={1602.05629}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} + +@article{li2017learning, + title={Learning without forgetting}, + author={Li, Zhizhong and Hoiem, Derek}, + journal={IEEE transactions on pattern analysis and machine intelligence}, + volume={40}, + number={12}, + pages={2935--2947}, + year={2017}, + publisher={IEEE} +} + @article{krizhevsky2012imagenet, title={Imagenet classification with deep convolutional neural networks}, author={Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E}, @@ -622,3 +923,62 @@ @misc{liao_can_2023 file = {arXiv Fulltext PDF:/Users/jeffreyma/Zotero/storage/V6P3XB5H/Liao et al. - 2023 - Can Unstructured Pruning Reduce the Depth in Deep .pdf:application/pdf;arXiv.org Snapshot:/Users/jeffreyma/Zotero/storage/WSQ4ZUH4/2308.html:text/html}, } +@article{kiela2021dynabench, + title={Dynabench: Rethinking benchmarking in NLP}, + author={Kiela, Douwe and Bartolo, Max and Nie, Yixin and Kaushik, Divyansh and Geiger, Atticus and Wu, Zhengxuan and Vidgen, Bertie and Prasad, Grusha and Singh, Amanpreet and Ringshia, Pratik and others}, + journal={arXiv preprint arXiv:2104.14337}, + year={2021} +} + +@article{beyer2020we, + title={Are we done with imagenet?}, + author={Beyer, Lucas and H{\'e}naff, Olivier J and Kolesnikov, Alexander and Zhai, Xiaohua and Oord, A{\"a}ron van den}, + journal={arXiv preprint arXiv:2006.07159}, + year={2020} +} +@article{gaviria2022dollar, + title={The Dollar Street Dataset: Images Representing the Geographic and Socioeconomic Diversity of the World}, + author={Gaviria Rojas, William and Diamos, Sudnya and Kini, Keertan and Kanter, David and Janapa Reddi, Vijay and Coleman, Cody}, + journal={Advances in Neural Information Processing Systems}, + volume={35}, + pages={12979--12990}, + year={2022} +} +@article{xu2023demystifying, + title={Demystifying CLIP Data}, + author={Xu, Hu and Xie, Saining and Tan, Xiaoqing Ellen and Huang, Po-Yao and Howes, Russell and Sharma, Vasu and Li, Shang-Wen and Ghosh, Gargi and Zettlemoyer, Luke and Feichtenhofer, Christoph}, + journal={arXiv preprint arXiv:2309.16671}, + year={2023} +} +@inproceedings{coleman2022similarity, + title={Similarity search for efficient active learning and search of rare concepts}, + author={Coleman, Cody and Chou, Edward and Katz-Samuels, Julian and Culatana, Sean and Bailis, Peter and Berg, Alexander C and Nowak, Robert and Sumbaly, Roshan and Zaharia, Matei and Yalniz, I Zeki}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={36}, + number={6}, + pages={6402--6410}, + year={2022} +} +@inproceedings{ribeiro2016should, + title={" Why should i trust you?" 
Explaining the predictions of any classifier}, + author={Ribeiro, Marco Tulio and Singh, Sameer and Guestrin, Carlos}, + booktitle={Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining}, + pages={1135--1144}, + year={2016} +} +@article{lundberg2017unified, + title={A unified approach to interpreting model predictions}, + author={Lundberg, Scott M and Lee, Su-In}, + journal={Advances in neural information processing systems}, + volume={30}, + year={2017} +} +@inproceedings{coleman2022similarity, + title={Similarity search for efficient active learning and search of rare concepts}, + author={Coleman, Cody and Chou, Edward and Katz-Samuels, Julian and Culatana, Sean and Bailis, Peter and Berg, Alexander C and Nowak, Robert and Sumbaly, Roshan and Zaharia, Matei and Yalniz, I Zeki}, + booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, + volume={36}, + number={6}, + pages={6402--6410}, + year={2022} +} diff --git a/responsible_ai.qmd b/responsible_ai.qmd index 6fdaed16..5b23c984 100644 --- a/responsible_ai.qmd +++ b/responsible_ai.qmd @@ -1,6 +1,6 @@ # Responsible AI -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * coming soon. diff --git a/robust_ai.qmd b/robust_ai.qmd index 75c4f173..c5f63358 100644 --- a/robust_ai.qmd +++ b/robust_ai.qmd @@ -2,7 +2,7 @@ Robust AI focuses on ensuring that AI systems operate reliably and safely in real-world environments. As AI systems are deployed in critical applications, from healthcare to autonomous vehicles, it's essential that they can handle unexpected situations, adversarial attacks, and hardware or software faults. This chapter delves into the various challenges and solutions associated with building robust AI systems, especially for TinyML. -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * coming soon. diff --git a/training.qmd b/training.qmd index 1027a9c0..90f5cefe 100644 --- a/training.qmd +++ b/training.qmd @@ -1,6 +1,6 @@ # AI Training -::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * coming soon. diff --git a/workflow.qmd b/workflow.qmd index 3c4d3c22..1031b77a 100644 --- a/workflow.qmd +++ b/workflow.qmd @@ -1,10 +1,12 @@ # AI Workflow +![_DALL·E 3 Prompt: Illustration in a rectangular format of a stylized flowchart representing the AI workflow chapter. Starting from the left, the stages include 'Data Collection' represented by a database icon, 'Data Preprocessing' with a filter icon, 'Model Design' with a brain icon, 'Training' with a weight icon, 'Evaluation' with a checkmark, and 'Deployment' with a rocket on the far right. Arrows connect each stage, guiding the viewer horizontally through the AI processes, emphasizing the sequential and interconnected nature of these steps._](./images/cover_ai_workflow.png) + In this chapter, we'll explore the machine learning (ML) workflow, setting the stage for subsequent chapters that delve into the specifics. To ensure we don't lose sight of the bigger picture, this chapter offers a high-level overview of the steps involved in the ML workflow. The ML workflow is a structured approach that guides professionals and researchers through the process of developing, deploying, and maintaining ML models. This workflow is generally divided into several crucial stages, each contributing to the effective development of intelligent systems. 
-::: {.callout-tip collapse="true"} +::: {.callout-tip} ## Learning Objectives * Understand the ML workflow and gain insights into the structured approach and stages involved in developing, deploying, and maintaining machine learning models.