From a0816d9e89ac397f331bda4f31c0ad13beb1958f Mon Sep 17 00:00:00 2001
From: Shan Li <lishanok@google.com>
Date: Tue, 22 Aug 2023 15:11:52 -0700
Subject: [PATCH] No public description

PiperOrigin-RevId: 559237100
Change-Id: If11ab4c474473bfdd59c4e7e6999eaae22c8c0c0
---
 qkeras/qtools/divide_and_conquer.py | 517 ++++++++++++++++++++++++++++
 qkeras/qtools/qtools_util.py        |  38 +-
 tests/qtools_model_test.py          |  46 +++
 3 files changed, 600 insertions(+), 1 deletion(-)
 create mode 100644 qkeras/qtools/divide_and_conquer.py

diff --git a/qkeras/qtools/divide_and_conquer.py b/qkeras/qtools/divide_and_conquer.py
new file mode 100644
index 00000000..deb98e94
--- /dev/null
+++ b/qkeras/qtools/divide_and_conquer.py
@@ -0,0 +1,517 @@
+# Copyright 2019 Google LLC
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""divide_and_conquer hardware cost profiling.
+
+Given a target throughput and a ML model, this implementation determines
+the key HW design parameters (bitwidth, unroll factors) for ML area
+optimization in a pipelined architecture.
+
+It generates recommended design parameters to assist downstream HW synthesis
+design. With this, it provides accurate HW cost modeling for ML training
+and ML complexity evaluation such as AV2/ROOF_ML.
+"""
+
+import enum
+import logging
+from typing import List, Any, Union
+
+import numpy as np
+import tensorflow as tf
+
+from qkeras import quantizers
+from qkeras.qtools import qgraph
+from qkeras.qtools import qtools_util
+
+
+class CostMode(enum.Enum):
+    NAIVE = 1
+    ML_PE_AREA = 2
+    ML_PE_BW_AREA = 3
+
+
+# pylint: disable=invalid-name
+class DivideConquerGraph:
+  """This class creates model graph structure and methods to access layers."""
+
+  def __init__(self, model: tf.keras.Model,
+               source_quantizers: quantizers.BaseQuantizer = None):
+    self._model = model
+    self._source_quantizer_list = source_quantizers or [
+        quantizers.quantized_bits(8, 0, 1)]
+
+    (self._graph, self._source_quantizer_list) = qgraph.CreateGraph(
+        model, source_quantizers, "quantized_bits(8, 0, 1)")
+
+    # Propagate output quantizer info into the graph edges.
+    qgraph.GraphPropagateActivationsToEdges(self._graph)
+
+    # Create layer-to-index mapping dict.
+    self._layer_to_idx_dict = {}
+    for idx in self._graph._node.keys():
+      self._layer_to_idx_dict[self.idx_to_layer(idx)] = idx
+
+  def idx_to_layer(self, idx: int):
+    # Map layer index to the layer object.
+    return self._graph._node[idx]["layer"][0]
+
+  def layer_to_idx(self, layer: tf.keras.layers.Layer):
+    # Map a layer object to index.
+    return self._layer_to_idx_dict.get(layer, None)
+
+  def get_first_node(self):
+    # Get the source node of the graph.
+    return qgraph.SOURCE
+
+  def is_first_node(self, node: Union[int, tf.keras.layers.Layer]):
+    # Find whether a given node is the first node of the graph.
+    # Node could be either index value or layer object.
+    idx = node if isinstance(node, int) else self.layer_to_idx(node)
+    return idx == qgraph.SOURCE
+
+  def get_last_node(self):
+    # Find the last node of the graph.
+    return qgraph.SINK
+
+  def is_last_node(self, node: Union[int, tf.keras.layers.Layer]):
+    # Find whether a given node is the last node of the graph.
+    # Node could be either index value or layer object.
+    idx = node if isinstance(node, int) else self.layer_to_idx(node)
+    return idx == qgraph.SINK
+
+  def get_prev_nodes(self, node: Union[int, tf.keras.layers.Layer]):
+    # Find the predecessor nodes in the graph of the given node.
+    # Node could be either index value or layer object.
+    idx = node if isinstance(node, int) else self.layer_to_idx(node)
+    return list(self._graph.predecessors(idx))
+
+  def get_next_nodes(self, node: Union[int, tf.keras.layers.Layer]):
+    # Find the successor nodes in the graph of the given node.
+    # node could be either index value or layer object.
+    idx = node if isinstance(node, int) else self.layer_to_idx(node)
+    return list(self._graph.successors(idx))
+
+
+class Choice:
+  """This class stores a combination of HW design param values."""
+
+  def __init__(self, l: float = 0, k: float = 0, cin_unroll: int = 0,
+               cout_unroll: int = 0, kh_unroll: int = 0, kw_unroll: int = 0):
+    """Intializer for a combination of hardware design parameters.
+
+    Args:
+      l: Ratio between OutElementPerClk and ComputeOutElementPerClk
+      k: Ratio between InElementPerClk and ComputeInElementPerClk
+      cin_unroll: Unroll factors for input channel
+      cout_unroll: Unroll factors for output channel
+      kh_unroll: Unroll factors for kernel height
+      kw_unroll: Unroll factors for kernel width
+    """
+
+    self.k = k
+    self.l = l
+    self.cin_unroll = cin_unroll
+    self.cout_unroll = cout_unroll
+    self.kh_unroll = kh_unroll
+    self.kw_unroll = kw_unroll
+
+  def __str__(self):
+    return (f"Choice(k={self.k}, l={self.l}, cin_unroll={self.cin_unroll}, "
+            f"cout_unroll={self.cout_unroll} kh_unroll={self.kh_unroll}, "
+            f"kw_unroll={self.kw_unroll})")
+
+
+def get_valid_unrolls(layer: tf.keras.layers.Layer, cout_unroll: int,
+                      target_throughput: int):
+  """Get valid unroll values where resulting throughput>=Target throughput."""
+
+  input_channel = qtools_util.get_layer_info(layer, "input_channel")
+  output_channel = qtools_util.get_layer_info(layer, "output_channel")
+  kernel_height = qtools_util.get_layer_info(layer, "kernel_height")
+  kernel_width = qtools_util.get_layer_info(layer, "kernel_width")
+
+  # Cin_unroll needs to be a divisor of layer.input_channel
+  cin_unroll_list = qtools_util.find_divisors(input_channel)
+  # kw_unroll needs to be a divisor of layer.kernel_width
+  kw_unroll_list = qtools_util.find_divisors(kernel_width)
+  # kh_unroll needs to be a divisor of layer.kernel_height
+  kh_unroll_list = qtools_util.find_divisors(kernel_height)
+
+  valid_unrolls = []
+  for cin_unroll in cin_unroll_list:
+    for kw_unroll in kw_unroll_list:
+      for kh_unroll in kh_unroll_list:
+        # Caculate computation throughput.
+        pe_throughput = cin_unroll * cout_unroll * kh_unroll * kw_unroll / (
+            input_channel * output_channel * kernel_height * kernel_width)
+
+        if pe_throughput >= target_throughput:
+          # Save the valid combination of unroll factors to valid_unrolls.
+          valid_unrolls.append((cin_unroll, kh_unroll, kw_unroll))
+
+  return valid_unrolls
+
+
+def get_per_layer_cost(mac_bitwidth, cin_unroll, cout_unroll, kh_unroll,
+                       kw_unroll, InElementPerClk, OutElementPerClk,
+                       mode):
+  # Area for a single layer, includes both PE and memory Bandwidth
+  # TODO(lishanok@): needs a better cost modeling function. For now we simplify
+  # it to the number of multipliers + interface bitwidth.
+  assert mode == CostMode.NAIVE, "Only CostMode.NAIVE is supported for now."
+
+  pe_area = mac_bitwidth * cin_unroll * cout_unroll * kh_unroll * kw_unroll
+  memory_bw = InElementPerClk * OutElementPerClk
+  return pe_area + memory_bw
+
+
+def get_valid_candidates(input_value, output_to_input_ratio_max):
+  candidate_list = qtools_util.find_divisors(input_value)
+  # Add the other scenario where ComputeElementPerClk is multiple
+  # of ElementPerClk.
+  if output_to_input_ratio_max >= 2:
+    candidate_list += [input_value * x for x in list(
+        range(2, output_to_input_ratio_max+1))]
+
+  return candidate_list
+
+
+def is_bufferThru_greater_than_targetThru(
+    InElementPerClk: int, OutElementPerClk: int, input_channel: int,
+    output_channel: int, kernel_height: int, kernel_width: int,
+    is_upsampled: bool, target_throughput: float):
+  """Verify whether the resulting buffer throughput > target throughput."""
+
+  # Calculate throughput of input buffer.
+  InBuf_throughput = InElementPerClk / input_channel
+  # Calculate throughput of output buffer.
+  if is_upsampled:
+    OutBuf_throughput = OutElementPerClk / (
+        output_channel * kernel_height *kernel_width)
+  else:
+    OutBuf_throughput = OutElementPerClk / output_channel
+
+  logging.debug(
+      "...............InBuf_throughput: %.2f OutBuf_throughput: %.2f",
+      InBuf_throughput, OutBuf_throughput)
+
+  # Valid unroll values must meet buffer throughput requirements.
+  return (InBuf_throughput >= target_throughput and
+          OutBuf_throughput >= target_throughput)
+
+
+def set_best_global_cost_in_paths(
+    OutElementPerClk_list, paths, layer_idx, cur_layer_idx,
+    input_quantizer_bits, mode):
+  """Find the best global cost of the entire model and update the paths dict.
+
+  Args:
+    OutElementPerClk_list: list of OutElementPerClk for the current layer.
+    paths: Dict that contains the choices that each layer has.
+    layer_idx: The index value of the current layer's predecessor.
+    cur_layer_idx: current layer's index value.
+    input_quantizer_bits: Input quantizer bits to the model.
+    mode: mode to calculate cost per layer.
+
+  Returns:
+    None.
+  """
+
+  def calculate_cost(OutElementPerClk):
+    cur_layer_cost = get_per_layer_cost(
+        input_quantizer_bits, 0, 0, 0, 0, 0, OutElementPerClk, mode)
+    accumulative_cost = cur_layer_cost + paths[layer_idx][OutElementPerClk][2]
+    return (cur_layer_cost, accumulative_cost, OutElementPerClk)
+
+  cost_and_values = list(map(calculate_cost, OutElementPerClk_list))
+
+  layer_cost, min_accumulative_cost, best_OutElementPerClk = (
+      min(cost_and_values, key=lambda x: x[1]))
+
+  # For the initial node, we find the best path which contains a sentinel
+  # choice, cost with that path, and the chosen OutElementPerClk
+  # that will point to the corresponding choice of the following layer.
+  paths[cur_layer_idx] = {
+      best_OutElementPerClk: (Choice().__str__(), layer_cost,
+                              min_accumulative_cost, best_OutElementPerClk)}
+
+
+def backtrack(graph, paths):
+  """Backtracking of the best path from the first layer to the last."""
+  best_path = {}
+  # Get the second node from the graph as the first node is a sentinel node.
+  layer_idx = graph.get_first_node()
+
+  logging.debug("=======================")
+  logging.debug("Trimmed Paths:")
+  logging.debug("paths: %s", paths)
+  logging.debug("=======================")
+
+  # Find the best choice of the first layer.
+  # TODO(lishanok@): extend code to non-sequential model where there are
+  # multiple input layers
+  best_OutElementPerClk = list(paths[layer_idx].keys())[0]
+  best_entry = paths[layer_idx][best_OutElementPerClk]
+  best_path[layer_idx] = best_entry
+  best_OutElementPerClk = best_entry[3]
+  best_accumlative_cost = best_entry[2]
+
+  layer_idx = graph.get_next_nodes(layer_idx)[0]
+  # Given the best choice of 1st layer, find the best choice for all following
+  # layers by backtracking.
+  while not graph.is_last_node(layer_idx):
+    # Find current layer's best choice from the ptr (ie. best_OutElementPerClk)
+    # stored in the best choice of the previous layer.
+    best_entry = paths[layer_idx][best_OutElementPerClk]
+    best_path[layer_idx] = best_entry
+    # Update the ptr to the next layer.
+    best_OutElementPerClk = best_entry[3]
+
+    # get the next node from the graph
+    # TODO(lishanok@): extend the code to non-sequential model where there are
+    # multiple next layers.
+    layer_idx = graph.get_next_nodes(layer_idx)[0]
+
+  # best_path stores the best hw param combination and cost for each layer.
+  return best_path, best_accumlative_cost
+
+
+def update_cur_best_choices(
+    cur_best_choices: List[Any], OutElementPerClk: int,
+    prev_OutElementPerClk: int, cur_layer_cost: float,
+    accumulative_cost: float, choice: Choice):
+  """Update the cur_best_choices dict.
+
+  At each layer, different choices of unroll factors will generate a
+  prev_OutElementPerClk value. Some of the choices might generate the same
+  prev_OutElementPerClk. So for each pre_OutElementPerClk, we only store
+  the best choice which has the min cost.
+  """
+
+  entry = cur_best_choices.get(prev_OutElementPerClk, None)
+  existing_accumulative_cost = entry[2] if entry else np.inf
+  logging.debug("...............cost of cur_best_choices [%d]: %.2f",
+                prev_OutElementPerClk, existing_accumulative_cost)
+  if accumulative_cost < existing_accumulative_cost:
+    # Stores the best choice and its cost for the given
+    # prev_OutElementPerClk. We also store the ptr to next layer's
+    # OutElementPerClk for future backtracking purpose.
+    cur_best_choices[prev_OutElementPerClk] = (
+        choice.__str__(), cur_layer_cost, accumulative_cost, OutElementPerClk)
+    logging.debug(
+        "...............Find better cost! Update cur_best_choices[%d]: %s",
+        prev_OutElementPerClk, cur_best_choices[prev_OutElementPerClk])
+
+
+def calc_hw_params(graph, target_OutElementPerClk, target_throughput,
+                   input_quantizer_bits,
+                   compute_to_memory_max_ratio=4,
+                   memory_to_unroll_max_ratio=4,
+                   mode=CostMode.NAIVE):
+  """Calculate HW params that minimizes total cost.
+
+  Args:
+    graph: DivideConquerGraph Object. Model graph.
+    target_OutElementPerClk: Int. Target number of elements per clock
+      cycle that the hardware needs to output.
+    target_throughput: Float. Target number of inferences per clock
+      cycle that the hardware needs to make.
+    input_quantizer_bits: Int. Model's input quantizer bits.
+    compute_to_memory_max_ratio: Int. Max allowed ratio between
+      ComputOutElement and OutElement
+    memory_to_unroll_max_ratio: Int. Max allowed ratio between
+      InElementPerClk and CinUnroll
+    mode: CostMode. The mode to calculate per layer cost. Default is NAIVE.
+
+  Returns:
+    best_path: Dict. Stores the best hw param value at each layer and their
+      irrespective cost.
+    best_cost: Float. The best global cost of the entire model.
+  """
+
+  # Paths stores the best choices for every layer.
+  # For the layer_idx, for each OutElementPerClk, we can calculate the best hw
+  # param choice. We store all these best choices, each choice will
+  # correspond to one OutElementPerClk key. Path therefore has the format:
+  # {layer: {OutElementPerClk: (choice, cost, downstream_OutElementPerClk)}}
+  paths = {}
+
+  # We start the computation from the last node.
+  layer_idx = graph.get_last_node()
+
+  # Store the hw choices for the last node (a fake node) for the sake
+  # of completion.
+  paths[layer_idx] = {target_OutElementPerClk: (
+      Choice().__str__(), 0, 0, -1)}
+
+  logging.debug("====== Extracting HW params combinations per layer =====")
+
+  # The following code calculates cost backward, from last layer to the first.
+  while  graph.get_prev_nodes(layer_idx):
+    # Find precessor of the layer.
+    # TODO(lishanok@): extend this code to multiple prev layers.
+    cur_layer_idx = graph.get_prev_nodes(layer_idx)[0]
+    cur_layer = graph.idx_to_layer(cur_layer_idx)
+    logging.debug("processing layer_idx:%d layer:%s",
+                  cur_layer_idx, getattr(cur_layer, "name", None))
+
+    # Previous layer will generate a list of candidates for OutElementPerClk
+    # values for the current layer.
+    OutElementPerClk_list = list(paths[layer_idx].keys())
+    logging.debug("OutElementPerClk_list:%s", OutElementPerClk_list)
+
+    # TODO(lishanok@): need to extend to multiple input layers, i.e., more
+    # than 1 layer will reach graph's first node. We should only exit if all
+    # input layers are processed.
+    if graph.is_first_node(cur_layer_idx):
+      # Computation reaches the 1st node of the graph. We can now find the best
+      # path of all OutElementPerClk choices at the first layer.
+      set_best_global_cost_in_paths(OutElementPerClk_list, paths, layer_idx,
+                                    cur_layer_idx, input_quantizer_bits, mode)
+      break
+
+    # Get layer-related information
+    input_channel = qtools_util.get_layer_info(cur_layer, "input_channel")
+    output_channel = qtools_util.get_layer_info(cur_layer, "output_channel")
+    kernel_height = qtools_util.get_layer_info(cur_layer, "kernel_height")
+    kernel_width = qtools_util.get_layer_info(cur_layer, "kernel_width")
+    quantizer_bits = qtools_util.get_layer_info(cur_layer, "quantizer_bits")
+
+    logging.debug("input_channel: %d, output_channel: %d, kernel_height: %d, "
+                  "kernel_width: %d, quantizer_bits: %d", input_channel,
+                  output_channel, kernel_width, kernel_width, quantizer_bits)
+
+    cur_best_choices = {}
+    for OutElementPerClk in OutElementPerClk_list:
+      logging.debug("...OutElementPerClk: %d", OutElementPerClk)
+
+      # For each of the possible OutElementPerClk values provided by the next
+      # layer, we derive possible HW params choices of the current layer.
+      for ComputeOutElementPerClk in get_valid_candidates(
+          OutElementPerClk, compute_to_memory_max_ratio):
+        logging.debug("......ComputeOutElementPerClk: %d",
+                      ComputeOutElementPerClk)
+        l = OutElementPerClk / ComputeOutElementPerClk
+        cout_unroll = ComputeOutElementPerClk
+
+        # Find valid unroll values that meet pe throughput requirement.
+        valid_unrolls = get_valid_unrolls(cur_layer, cout_unroll,
+                                          target_throughput)
+        if len(valid_unrolls) == 0:
+          # Skip if no valid unroll values are found.
+          continue
+
+        for (cin_unroll, kh_unroll, kw_unroll) in valid_unrolls:
+          # Check throughput requirement of each combination of unroll values.
+          logging.debug(".........cin_unroll: %d, kh_unroll: %d, kw_unroll: %d",
+                        cin_unroll, kh_unroll, kw_unroll)
+
+          for InElementPerClk in get_valid_candidates(
+              cin_unroll, memory_to_unroll_max_ratio):
+            # With given cin_unroll, check throughput requirement of each
+            # possible candidate of InElementPerClk.
+
+            # InElementPerClk*k=ComputeInElementPerClk/(kh_unroll * kw_unroll)
+            # ==> InElementPerClk=cin_unroll/k
+            logging.debug("............InElementPerClk: %d", InElementPerClk)
+            k = cin_unroll / InElementPerClk
+            prev_OutElementPerClk = InElementPerClk
+
+            is_upsampled = qtools_util.is_upsampled(cur_layer)
+            if is_bufferThru_greater_than_targetThru(
+                InElementPerClk, OutElementPerClk, input_channel,
+                output_channel, kernel_height, kernel_width, is_upsampled,
+                target_throughput):
+              # If valid unroll values meet buffer throughput requirements,
+              # comput cost.
+              # cost = current layer's cost + total of downstream layers' cost.
+              # Since we derive cost iteratively starting from the last layer,
+              # paths already store the total cost of the downstream layers.
+              cur_layer_cost = get_per_layer_cost(
+                  quantizer_bits, cin_unroll, cout_unroll, kh_unroll,
+                  kw_unroll, InElementPerClk, OutElementPerClk, mode)
+              accumulative_cost = (
+                  cur_layer_cost + paths[layer_idx][OutElementPerClk][1])
+
+              logging.debug("...............Buf throughput is good! "
+                            "Accumulative_cost: %.2f", accumulative_cost)
+
+              # Each choice is a hw param combination.
+              choice = Choice(l, k, cin_unroll, cout_unroll, kh_unroll,
+                              kw_unroll)
+
+              update_cur_best_choices(cur_best_choices, OutElementPerClk,
+                                      prev_OutElementPerClk, cur_layer_cost,
+                                      accumulative_cost, choice)
+
+    logging.debug("=======================")
+
+    # Store the best choices of hw params for the current layer. Proceed to
+    # the previous layer.
+    paths[cur_layer_idx] = cur_best_choices
+    layer_idx = cur_layer_idx
+
+  return backtrack(graph, paths)
+
+
+def estimate_model_cost(
+    model: tf.keras.Model,
+    input_quantizer_bits: int = 8,
+    target_OutElementPerClk: int = 10,
+    target_throughput: float = 1.0,
+    compute_to_memory_max_ratio: int = 4,
+    memory_to_unroll_max_ratio: int = 4,
+    mode: CostMode = CostMode.NAIVE):
+  """Main function to divide and conquer cost modeling.
+
+  Args:
+    model: QKeras model.
+    input_quantizer_bits: Model's input quantizer bits.
+    target_OutElementPerClk: Target number of elements per clock
+      cycle that the hardware needs to output.
+    target_throughput: Target number of inferences per clock
+      cycle that the hardware needs to make.
+    compute_to_memory_max_ratio: Max allowed ratio between
+      ComputOutElement and OutElement
+    memory_to_unroll_max_ratio: Max allowed ratio between
+      InElementPerClk and CinUnroll
+    mode: The mode to calculate per layer cost.
+
+  Returns:
+    best_path: Dict. Stores the best hw param value at each layer and their
+      irrespective cost.
+    best_cost: Float. The best global cost of the entire model.
+  """
+
+  logging.info("Estimating model design params and cost...")
+  # Generate graph
+  graph = DivideConquerGraph(model)
+  # Call the main function to generate optimal HW configs for all layers
+  best_path, best_cost = calc_hw_params(
+      graph=graph, target_OutElementPerClk=target_OutElementPerClk,
+      target_throughput=target_throughput,
+      input_quantizer_bits=input_quantizer_bits,
+      compute_to_memory_max_ratio=(
+          compute_to_memory_max_ratio),
+      memory_to_unroll_max_ratio=(
+          memory_to_unroll_max_ratio),
+      mode=mode
+  )
+
+  logging.info("best_design_params: %s", best_path)
+  logging.info("best_cost: %.2f", best_cost)
+
+  return (best_path, best_cost)
diff --git a/qkeras/qtools/qtools_util.py b/qkeras/qtools/qtools_util.py
index 5ea3aafa..e9250c33 100644
--- a/qkeras/qtools/qtools_util.py
+++ b/qkeras/qtools/qtools_util.py
@@ -23,7 +23,7 @@
 import sys
 import numpy as np
 import tensorflow.keras.backend as K
-
+import tensorflow as tf
 from qkeras.qtools import quantized_operators
 
 
@@ -337,3 +337,39 @@ def adjust_accumulator_for_auto_po2(
         fused_kernel_accumulator.output, bias_quantizer)
 
   return fused_accumulator
+
+
+def find_divisors(num):
+  return [i for i in range(1, num + 1) if num % i == 0]
+
+
+def get_layer_info(layer: tf.keras.layers.Layer, attr_name: str):
+
+  layer_type = layer.__class__.__name__
+  supported_layer_types = ["QConv2D"]
+  assert layer_type in supported_layer_types, (
+      f"For now only {supported_layer_types} layers are supported. "
+      f"Found {layer_type} instead.")
+
+  # Get layer info such as input/output channels, kernel size and quantizers.
+  input_channel = layer.input_shape[-1]
+  output_channel = layer.output_shape[-1]
+
+  kernel_height, kernel_width = layer.kernel_size if hasattr(
+      layer, "kernel_size") else (None, None)
+
+  quantizer_bits = layer.kernel_quantizer.bits
+  layer_dict = {
+      "layer_type": layer_type,
+      "input_channel": input_channel,
+      "output_channel": output_channel,
+      "kernel_height": kernel_height,
+      "kernel_width": kernel_width,
+      "quantizer_bits": quantizer_bits
+  }
+  return layer_dict.get(attr_name, None)
+
+
+def is_upsampled(layer: tf.keras.layers.Layer):
+  # Evaluate if a given layer is doing upsampling.
+  return "UpSampling" in layer.__class__.__name__
diff --git a/tests/qtools_model_test.py b/tests/qtools_model_test.py
index 77739889..a1861f17 100644
--- a/tests/qtools_model_test.py
+++ b/tests/qtools_model_test.py
@@ -38,6 +38,7 @@
 from qkeras.utils import model_save_quantized_weights
 from qkeras.qtools.quantized_operators import adder_impl
 from qkeras.qtools.quantized_operators import quantizer_impl
+from qkeras.qtools import divide_and_conquer
 
 
 def qdense_model_fork():
@@ -946,5 +947,50 @@ def test_qdepthwiseconv2d():
   assert dtype_dict["pw_conv"]["accumulator"]["int_bits"] == 11
 
 
+def test_divide_and_conquer_sequential_conv2d():
+  # These following values are verified manually to be globally optimal.
+
+  # The test has two purposes:
+  # 1) check if the code runs ok;
+  # 2) for a simple conv2d model, the output is as expected.
+
+  # We will need to add more tests with more complex graph architecture
+  # in the future as our solution grows.
+
+  xin = x = tf.keras.layers.Input(shape=(16, 16, 1), name="input_layer")
+  x = QConv2D(
+      kernel_size=3,
+      filters=3,
+      use_bias=False,
+      kernel_quantizer=quantizers.quantized_bits(4, 0, alpha=1.0),
+      name="conv_1",
+  )(x)
+  x = QConv2D(
+      kernel_size=3,
+      filters=5,
+      use_bias=False,
+      kernel_quantizer=quantizers.quantized_bits(4, 0, alpha=1.0),
+      name="conv_2",
+  )(x)
+
+  # Create a model
+  model = tf.keras.Model(inputs=xin, outputs=x)
+
+  best_path, best_cost = divide_and_conquer.estimate_model_cost(
+      model,
+      input_quantizer_bits=8,
+      target_OutElementPerClk=10,
+      target_throughput=1.0,
+      compute_to_memory_max_ratio=1,
+      memory_to_unroll_max_ratio=1,
+      mode=divide_and_conquer.CostMode.NAIVE,
+  )
+
+  assert best_path[1][2] == 681
+  assert best_path[1][3] == 3
+  assert best_path[2][3] == 10
+  assert best_cost == 681
+
+
 if __name__ == "__main__":
   pytest.main([__file__])