From 7f8e9e8fa91d0a65e6a3b47a925eb2379ca0f9f8 Mon Sep 17 00:00:00 2001
From: Liu Liu
Date: Fri, 22 Dec 2023 16:33:16 -0500
Subject: [PATCH] Add dry_run support so we can compile further without executing.

---
 lib/nnc/ccv_cnnp_model.c                 |  9 ++-
 lib/nnc/ccv_nnc.h                        | 22 ++++++
 lib/nnc/ccv_nnc_dynamic_graph_evaluate.c | 87 ++++++++++++++++++++++++
 lib/nnc/ccv_nnc_easy.h                   | 29 --------
 lib/nnc/ccv_nnc_symbolic_graph_compile.c | 23 ++++---
 5 files changed, 130 insertions(+), 40 deletions(-)

diff --git a/lib/nnc/ccv_cnnp_model.c b/lib/nnc/ccv_cnnp_model.c
index f9a2e1711..3c4080bc8 100644
--- a/lib/nnc/ccv_cnnp_model.c
+++ b/lib/nnc/ccv_cnnp_model.c
@@ -1734,7 +1734,7 @@ static void _ccv_cnnp_model_multistage_jit_0(ccv_cnnp_model_t* const model, cons
 	ccv_nnc_graph_autotune(compiled_data->graph, model->workspace_size, 0, TRAVERSE_FULL);
 }
 
-void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
+void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size)
 {
 	ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
 	assert(compiled_data);
@@ -1772,6 +1772,13 @@ void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evalu
 		};
 		ccv_cnnp_model_set_is_test(model, params.is_test, _ccv_cnnp_cmd_update_for_execs, &update);
 	}
+}
+
+void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
+{
+	ccv_cnnp_compiled_data_t* const compiled_data = model->compiled_data;
+	assert(compiled_data);
+	ccv_cnnp_model_dry_run(model, params, inputs, input_size, outputs, output_size);
 	if (compiled_data->graph_mode == CCV_CNNP_MODEL_GRAPH_MULTISTAGE_MODE_NO_GRAD)
 		ccv_nnc_graph_run_with_schedule(compiled_data->graph, 0, 0, tensor_tape, stream_context);
 	else {
diff --git a/lib/nnc/ccv_nnc.h b/lib/nnc/ccv_nnc.h
index 6797f6db7..41dcd8ac9 100644
--- a/lib/nnc/ccv_nnc.h
+++ b/lib/nnc/ccv_nnc.h
@@ -2917,6 +2917,18 @@ typedef struct ccv_cnnp_model_s ccv_cnnp_model_t;
  * @param stream_context Which stream this computation will be executed upon.
  */
 void ccv_nnc_dynamic_graph_evaluate(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context);
+/**
+ * Dry run a CNNP model on the dynamic graph with a set of inputs, up until the actual execution.
+ * @param dynamic_graph The dynamic graph.
+ * @param model The CNNP model to be evaluated against. Note that ccv_nnc_dynamic_graph_backward /
+ * ccv_nnc_dynamic_graph_apply_gradients / ccv_nnc_dynamic_graph_minimize all work with this
+ * model. It takes over the life-cycle of the model, and now you don't need to free it any more.
+ * @param is_test Whether we are in test mode or not.
+ * @param inputs The input variables.
+ * @param input_size The size of the input variables array.
+ * @param stream_context Which stream this computation will be executed upon.
+ */
+void ccv_nnc_dynamic_graph_dry_run(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context);
 /**
  * Set the maximum operator-level concurrency. This is a soft-limit, e.g. if you have operations on
  * different devices, they are concurrent.
@@ -3717,6 +3729,16 @@ typedef struct {
  * @param stream_context The stream where the evaluation can be executed upon.
  */
 void ccv_cnnp_model_evaluate(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context);
+/**
+ * Dry run the model with inputs / outputs. This runs the evaluation loop up until the actual execution.
+ * @param model The composed model.
+ * @param params The parameters for how evaluation should behave.
+ * @param inputs The input tensors.
+ * @param input_size The size of the input tensors array.
+ * @param outputs The actual outputs from the model.
+ * @param output_size The size of the outputs array.
+ */
+void ccv_cnnp_model_dry_run(ccv_cnnp_model_t* const model, const ccv_cnnp_evaluate_param_t params, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size);
 /**
  * Based on the input gradients, compute the output gradients (w.r.t. the inputs). This also adds parameter gradients.
  * @param model The composed model.
diff --git a/lib/nnc/ccv_nnc_dynamic_graph_evaluate.c b/lib/nnc/ccv_nnc_dynamic_graph_evaluate.c
index a228e24d2..60afcab24 100644
--- a/lib/nnc/ccv_nnc_dynamic_graph_evaluate.c
+++ b/lib/nnc/ccv_nnc_dynamic_graph_evaluate.c
@@ -136,6 +136,93 @@ static ccv_nnc_stateful_cmd_vtab_t ccv_cnnp_model_exec_isa = {
 	.apply_gradients = _ccv_cnnp_model_apply_gradients,
 };
 
+void ccv_nnc_dynamic_graph_dry_run(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context)
+{
+	assert(input_size > 0);
+	const int parallel_count = ccv_max(model->parallel_count, 1);
+	const int per_input_size = input_size / parallel_count;
+	assert(per_input_size > 0);
+	assert((input_size % parallel_count) == 0);
+	int i, j;
+	if (!model->graph)
+	{
+		ccv_nnc_tensor_param_t input_params[per_input_size];
+		for (i = 0; i < per_input_size; i++)
+			input_params[i] = inputs[i]->info;
+		ccv_cnnp_model_compile(model, input_params, per_input_size, CMD_NOOP(), CMD_NOOP());
+	} else {
+		assert(per_input_size == model->input_size);
+		ccv_nnc_tensor_param_t input_params[per_input_size];
+		int flag = 0;
+		for (i = 0; i < per_input_size; i++)
+		{
+			input_params[i] = inputs[i]->info;
+			const ccv_nnc_tensor_param_t params = ccv_nnc_tensor_symbol_params(model->graph, model->inputs[i]);
+			// If these two parameters don't match, recompile the graph.
+			if (memcmp(&params, &input_params[i], sizeof(params)) != 0)
+				flag = 1;
+		}
+		if (flag) // Recompile the graph.
+			ccv_cnnp_model_compile(model, input_params, per_input_size, ccv_cnnp_model_minimizer(model), CMD_NOOP());
+	}
+	ccv_nnc_tensor_t* input_tensors[input_size];
+	for (i = 0; i < input_size; i++)
+	{
+		// Cannot have the parameter be a partial tensor view for model evaluation.
+		input_tensors[i] = inputs[i] ? ccv_nnc_tensor_from_variable(dynamic_graph, inputs[i], stream_context) : 0;
+		if (input_tensors[i])
+			{ assert(CCV_IS_TENSOR_CONTIGUOUS(input_tensors[i])); }
+	}
+	const int per_output_size = ccv_cnnp_model_output_size(model);
+	ccv_nnc_tensor_param_t output_params[ccv_max(1, per_output_size)];
+	const int output_size = per_output_size * parallel_count;
+	ccv_nnc_tensor_variable_t outputs[output_size];
+	ccv_nnc_tensor_t* output_tensors[output_size];
+	for (i = 0; i < parallel_count; i++)
+	{
+		for (j = 0; j < per_output_size; j++)
+			output_params[j] = ccv_nnc_tensor_auto;
+		ccv_cnnp_model_tensor_auto(model, output_params, per_output_size);
+		for (j = 0; j < per_output_size; j++)
+			if (!ccv_nnc_is_tensor_auto(output_params[j]))
+			{
+				outputs[i * per_output_size + j] = ccv_nnc_tensor_variable_new(dynamic_graph, output_params[j]);
+				output_tensors[i * per_output_size + j] = ccv_nnc_tensor_from_variable(dynamic_graph, outputs[i * per_output_size + j], stream_context);
+			} else {
+				outputs[i * per_output_size + j] = 0;
+				output_tensors[i * per_output_size + j] = 0;
+			}
+	}
+	if (dynamic_graph->no_grad)
+	{
+		ccv_cnnp_model_dry_run(model, (ccv_cnnp_evaluate_param_t){
+			.requires_grad = 0,
+			.disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL,
+			.is_test = is_test,
+		}, input_tensors, input_size, output_tensors, output_size);
+	} else {
+		uint64_t disable_outgrad = 0;
+		int count = 0;
+		for (i = 0; i < per_input_size; i++)
+			if (!inputs[i] || inputs[i]->type == CCV_NNC_TENSOR_CONSTANT)
+			{
+				disable_outgrad |= ((uint64_t)1 << i);
+				++count;
+			}
+		if (count == per_input_size)
+			disable_outgrad = CCV_CNNP_DISABLE_OUTGRAD_ALL;
+		ccv_cnnp_model_dry_run(model, (ccv_cnnp_evaluate_param_t){
+			.requires_grad = 1,
+			.disable_outgrad = disable_outgrad,
+			.is_test = is_test,
+		}, input_tensors, input_size, output_tensors, output_size);
+	}
+	// Free the allocated variables.
+	for (i = 0; i < output_size; i++)
+		if (outputs[i])
+			ccv_nnc_tensor_variable_free(dynamic_graph, outputs[i]);
+}
+
 void ccv_nnc_dynamic_graph_evaluate(ccv_nnc_dynamic_graph_t* const dynamic_graph, ccv_cnnp_model_t* const model, const int is_test, const ccv_nnc_tensor_variable_t* const inputs, const int input_size, ccv_nnc_tensor_variable_t* const outputs, const int output_size, ccv_nnc_tensor_tape_t* const tensor_tape, ccv_nnc_stream_context_t* const stream_context)
 {
 	ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, (ccv_nnc_cmd_vtab_t*)&ccv_cnnp_model_exec_isa, (ccv_nnc_cmd_param_t){}, 0);
diff --git a/lib/nnc/ccv_nnc_easy.h b/lib/nnc/ccv_nnc_easy.h
index 1c8f42992..1a1fd5322 100644
--- a/lib/nnc/ccv_nnc_easy.h
+++ b/lib/nnc/ccv_nnc_easy.h
@@ -250,35 +250,6 @@ static inline size_t ccv_nnc_tensor_data_size(const ccv_nnc_tensor_param_t param
 	return ((data_size + 63) & -64);
 }
 
-static inline size_t ccv_nnc_tensor_decompressed_data_size_without_padding(const ccv_nnc_tensor_param_t params)
-{
-	const ssize_t count = (ssize_t)ccv_nnc_tensor_count(params);
-	ssize_t data_size;
-	if (CCV_GET_DATA_TYPE(params.datatype) == CCV_QX)
-	{
-		// Our QX right now only does palettization. Hence, we need to get the palette datatype.
-		const int palette_datatype = (params.datatype & 0xff) << 12;
-		data_size = CCV_GET_DATA_TYPE_SIZE(palette_datatype) * count;
-	} else
-		data_size = CCV_GET_DATA_TYPE_SIZE(params.datatype) * count;
-	return data_size;
-}
-
-static inline size_t ccv_nnc_tensor_decompressed_data_size(const ccv_nnc_tensor_param_t params)
-{
-	ssize_t data_size = ccv_nnc_tensor_decompressed_data_size_without_padding(params);
-#ifdef HAVE_CUDA // For CUDA, we align to 128-bytes.
-	if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
-		return ((data_size + 127) & -128);
-	else
-#elif defined(HAVE_MPS) // For MPS, we have to align to PAGE_SIZE.
-	if (CCV_TENSOR_GET_MEMORY(params.type) == CCV_TENSOR_GPU_MEMORY)
-		return ((data_size + PAGE_SIZE - 1) & -PAGE_SIZE);
-	else
-#endif
-	return ((data_size + 63) & -64);
-}
-
 static inline void ccv_nnc_tensor_view_get_dim(const ccv_nnc_tensor_view_t* const tv, int dim[CCV_NNC_MAX_DIM_ALLOC])
 {
 	int x;
diff --git a/lib/nnc/ccv_nnc_symbolic_graph_compile.c b/lib/nnc/ccv_nnc_symbolic_graph_compile.c
index 566d1a495..4c5d97572 100644
--- a/lib/nnc/ccv_nnc_symbolic_graph_compile.c
+++ b/lib/nnc/ccv_nnc_symbolic_graph_compile.c
@@ -4214,7 +4214,7 @@ int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, cons
 				mv = (ccv_nnc_tensor_multiview_t*)(mv->it ? mv->it : CCV_NNC_MULTIVIEW_DATA(mv)[0]);
 				tensor = (ccv_nnc_tensor_t*)mv;
 			}
-			tensor_arena->vt_sizes[i] = ccv_nnc_tensor_decompressed_data_size(tensor->info);
+			tensor_arena->vt_sizes[i] = ccv_nnc_tensor_data_size(tensor->info);
 		}
 	}
 	int flag = 0;
@@ -4222,7 +4222,10 @@ int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, cons
 		if (tensor_arena->vt_tensors[i] && !tensor_arena->vt_alias_refs[i])
 		{
 			ccv_nnc_tensor_symbol_info_t* const symbol_info = (ccv_nnc_tensor_symbol_info_t*)ccv_array_get(graph->tensor_symbol_info, i);
-			flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(symbol_info->info));
+			ccv_nnc_tensor_param_t params = symbol_info->info;
+			params.datatype = tensor_arena->vt_tensors[i]->info.datatype;
+			params.reserved = tensor_arena->vt_tensors[i]->info.reserved;
+			flag = (tensor_arena->vt_sizes[i] < ccv_nnc_tensor_data_size(params));
 		}
 	if (flag)
 		return -1;
@@ -4236,16 +4239,16 @@ int ccv_nnc_tensor_arena_reinit(ccv_nnc_tensor_arena_t* const tensor_arena, cons
 			assert(!tensor_arena->vt_alias_refs[i]);
 			_ccv_nnc_multiview_update_params((ccv_nnc_tensor_multiview_t*)tensor, symbol_info->info);
 		} else if (!tensor_arena->vt_alias_refs[i]) {
-			ccv_nnc_tensor_param_t params = tensor->info;
-			tensor->info = symbol_info->info;
-			tensor->info.datatype = params.datatype;
-			tensor->info.reserved = params.reserved;
+			ccv_nnc_tensor_param_t params = symbol_info->info;
+			params.datatype = tensor->info.datatype;
+			params.reserved = tensor->info.reserved;
+			tensor->info = params;
 		} else {
 			off_t off = ccv_nnc_tensor_view_offset(tensor->info.datatype, symbol_info->stride, symbol_info->ofs);
-			ccv_nnc_tensor_param_t params = tensor->info;
-			tensor->info = symbol_info->info;
-			tensor->info.datatype = params.datatype;
-			tensor->info.reserved = params.reserved;
+			ccv_nnc_tensor_param_t params = symbol_info->info;
+			params.datatype = tensor->info.datatype;
+			params.reserved = tensor->info.reserved;
+			tensor->info = params;
 			const int alias_ref = tensor_arena->vt_alias_refs[i] - 1;
 			ccv_nnc_tensor_data(tensor->info, tensor_arena->vt_tensors[alias_ref]->data.u8, off + tensor_arena->vt_tensors[alias_ref]->dataof, &tensor->data, &tensor->dataof);
 			if (CCV_IS_TENSOR_VIEW(tensor))
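
As a rough illustration of how the dynamic-graph entry point added above might be called (this sketch is not part of the patch; the helper name warm_up_model, the single NHWC 32F input, and the shape are assumptions for illustration), a caller could force compilation ahead of time like this:

#include <nnc/ccv_nnc.h>
#include <nnc/ccv_nnc_easy.h>

// Hypothetical helper: make `model` compile against a concrete input shape
// without running the forward pass, using ccv_nnc_dynamic_graph_dry_run.
static void warm_up_model(ccv_nnc_dynamic_graph_t* const graph, ccv_cnnp_model_t* const model, const int batch_size, const int dim)
{
	// The dry run only needs the input variable's parameters (shape / datatype)
	// to drive compilation; no command is executed.
	ccv_nnc_tensor_variable_t x = ccv_nnc_tensor_variable_new(graph, CPU_TENSOR_NHWC(32F, batch_size, dim));
	ccv_nnc_tensor_variable_t inputs[1] = { x };
	// is_test = 1, stream_context = 0 (default stream). Note the dynamic graph
	// takes over the model's life-cycle, so the model is not freed here.
	ccv_nnc_dynamic_graph_dry_run(graph, model, 1, inputs, 1, 0);
	ccv_nnc_tensor_variable_free(graph, x);
}

On the model side, ccv_cnnp_model_dry_run mirrors ccv_cnnp_model_evaluate minus the graph execution, so an already-compiled model can be prepared the same way with raw tensors.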