diff --git a/.github/workflows/black.yaml b/.github/workflows/black.yaml new file mode 100644 index 000000000..5903fb668 --- /dev/null +++ b/.github/workflows/black.yaml @@ -0,0 +1,12 @@ +name: Lint + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: psf/black@stable + with: + options: "--extend-exclude 'evaluations/'" diff --git a/compiler/annotations_utils/util_cmd_invocations.py b/compiler/annotations_utils/util_cmd_invocations.py index 26dd8cb6b..7770de644 100644 --- a/compiler/annotations_utils/util_cmd_invocations.py +++ b/compiler/annotations_utils/util_cmd_invocations.py @@ -1,24 +1,43 @@ from pash_annotations.datatypes.BasicDatatypes import Flag, ArgStringType, Operand from pash_annotations.datatypes.BasicDatatypesWithIO import OptionWithIO from pash_annotations.datatypes.CommandInvocationInitial import CommandInvocationInitial -from pash_annotations.annotation_generation.datatypes.InputOutputInfo import InputOutputInfo -from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ParallelizabilityInfo -from pash_annotations.annotation_generation.datatypes.CommandProperties import CommandProperties -from pash_annotations.annotation_generation.AnnotationGeneration import get_input_output_info_from_cmd_invocation, \ - get_parallelizability_info_from_cmd_invocation -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.annotation_generation.datatypes.InputOutputInfo import ( + InputOutputInfo, +) +from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ( + ParallelizabilityInfo, +) +from pash_annotations.annotation_generation.datatypes.CommandProperties import ( + CommandProperties, +) +from pash_annotations.annotation_generation.AnnotationGeneration import ( + get_input_output_info_from_cmd_invocation, + get_parallelizability_info_from_cmd_invocation, +) +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.arg import Arg # for typing from pash_annotations.datatypes.CommandInvocationPrefix import CommandInvocationPrefix -from shell_ast.ast_util import string_to_argument, redir_stdout_to_file, redir_file_to_stdin, make_command +from shell_ast.ast_util import ( + string_to_argument, + redir_stdout_to_file, + redir_file_to_stdin, + make_command, +) + def get_command_invocation_prefix_from_dfg_node(dfg_node): - return CommandInvocationPrefix(cmd_name = dfg_node.com_name, - flag_option_list = dfg_node.flag_option_list, - positional_config_list = dfg_node.positional_config_list) + return CommandInvocationPrefix( + cmd_name=dfg_node.com_name, + flag_option_list=dfg_node.flag_option_list, + positional_config_list=dfg_node.positional_config_list, + ) + # TODO: ideally methods in the respective classes but requires refactoring of parsing infrastructure # TODO: isn't this `to_ast`? @@ -48,19 +67,22 @@ def to_node_cmd_inv_with_io_vars(cmd_inv, edges, redirs, assignments): node = make_command(cmd_asts, redirections=new_redirs, assignments=assignments) return node + def to_ast_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [string_to_argument(flagoption.get_name())] - elif isinstance(flagoption, OptionWithIO): # retype to IOVar + elif isinstance(flagoption, OptionWithIO): # retype to IOVar opt_name_ast = string_to_argument(flagoption.get_name()) opt_arg_ast = translate_io_var_if_applicable(flagoption.get_arg(), edges) return [opt_name_ast, opt_arg_ast] + def to_ast_operand(operand, edges): if isinstance(operand, Operand): return translate_io_var_if_applicable(operand.get_name(), edges) return translate_io_var_if_applicable(operand, edges) + def translate_io_var_if_applicable(pot_io_var, edges): # TODO: this is currently a hack but eventually every possible type gets their own to_ast-function if isinstance(pot_io_var, int): @@ -68,7 +90,7 @@ def translate_io_var_if_applicable(pot_io_var, edges): elif isinstance(pot_io_var, ArgStringType): return to_ast_arg_string_type(pot_io_var) elif isinstance(pot_io_var, CommandInvocationWithIOVars): - assert(False) + assert False # only happens as r-wrapped node return to_node_cmd_inv_with_io_vars(pot_io_var, edges, [], []) elif isinstance(pot_io_var, Arg): @@ -76,27 +98,39 @@ def translate_io_var_if_applicable(pot_io_var, edges): else: raise Exception("Unhandled type for operand in to_ast!") + def to_ast_arg_string_type(arg_string_type): - return arg_string_type.get_name().arg_char_list # is of type Arg + return arg_string_type.get_name().arg_char_list # is of type Arg + # assumes io_var is an edge id def dereference_io_var(io_var, edges): fid, _, _ = edges[io_var] return fid.to_ast() -def get_input_output_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> InputOutputInfo: + +def get_input_output_info_from_cmd_invocation_util( + cmd_invocationInitial: CommandInvocationInitial, +) -> InputOutputInfo: return get_input_output_info_from_cmd_invocation(cmd_invocationInitial) -def get_parallelizability_info_from_cmd_invocation_util(cmd_invocationInitial : CommandInvocationInitial) -> ParallelizabilityInfo: + +def get_parallelizability_info_from_cmd_invocation_util( + cmd_invocationInitial: CommandInvocationInitial, +) -> ParallelizabilityInfo: return get_parallelizability_info_from_cmd_invocation(cmd_invocationInitial) + def construct_property_container_from_list_of_properties(list_properties): return CommandProperties(dict(list_properties)) + # this function is needed to wrap a node in `r_wrap` -def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv, edges): +def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping( + cmd_inv, edges +): # we already expand here - whole_cmd = Arg.string_to_arg("\'") + whole_cmd = Arg.string_to_arg("'") arg_cmd_name = Arg.string_to_arg(cmd_inv.cmd_name) arg_flagoptions = [] for flagoption in cmd_inv.flag_option_list: @@ -107,9 +141,10 @@ def to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wra all_cmd_parts_arg.extend(arg_operands) for part in all_cmd_parts_arg: whole_cmd.concatenate(part) - whole_cmd.concatenate(Arg.string_to_arg("\'")) + whole_cmd.concatenate(Arg.string_to_arg("'")) return whole_cmd + def to_arg_flagoption(flagoption, edges): if isinstance(flagoption, Flag): return [Arg.string_to_arg(flagoption.get_name())] @@ -118,11 +153,13 @@ def to_arg_flagoption(flagoption, edges): opt_arg_arg = translate_io_var_to_arg_if_applicable(flagoption.get_arg(), edges) return [opt_name_arg, opt_arg_arg] + def to_arg_operand(operand, edges): if isinstance(operand, Operand): return translate_io_var_to_arg_if_applicable(operand.get_name(), edges) return translate_io_var_to_arg_if_applicable(operand, edges) + def translate_io_var_to_arg_if_applicable(pot_io_var, edges): if isinstance(pot_io_var, int): return Arg(dereference_io_var(pot_io_var, edges)) diff --git a/compiler/annotations_utils/util_file_descriptors.py b/compiler/annotations_utils/util_file_descriptors.py index fb17438b0..4495af9af 100644 --- a/compiler/annotations_utils/util_file_descriptors.py +++ b/compiler/annotations_utils/util_file_descriptors.py @@ -1,18 +1,21 @@ from util import log from definitions.ir.resource import FileResource, Resource, FileDescriptorResource -from pash_annotations.datatypes.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo +from pash_annotations.datatypes.BasicDatatypesWithIO import ( + FileNameWithIOInfo, + StdDescriptorWithIOInfo, +) def resource_from_file_descriptor(file_descriptor) -> Resource: if isinstance(file_descriptor, FileNameWithIOInfo): arg = file_descriptor.get_name() - log(f'filedes name: {file_descriptor.get_name()}') - log(f'filedes name type: {type(file_descriptor.get_name())}') - log(f'arg: {arg}') + log(f"filedes name: {file_descriptor.get_name()}") + log(f"filedes name type: {type(file_descriptor.get_name())}") + log(f"arg: {arg}") return FileResource(file_descriptor.get_name()) elif isinstance(file_descriptor, StdDescriptorWithIOInfo): resource = ("fd", file_descriptor.get_type().value) return FileDescriptorResource(resource) else: - assert(False) + assert False # unreachable diff --git a/compiler/annotations_utils/util_parsing.py b/compiler/annotations_utils/util_parsing.py index f4655b9fa..074b94004 100644 --- a/compiler/annotations_utils/util_parsing.py +++ b/compiler/annotations_utils/util_parsing.py @@ -3,9 +3,20 @@ from definitions.ir.arg import Arg from pash_annotations.datatypes.CommandInvocationInitial import CommandInvocationInitial -from pash_annotations.datatypes.BasicDatatypes import Option, ArgStringType, Flag, Operand -from pash_annotations.parser.parser import parse, get_set_of_all_flags, get_dict_flag_to_primary_repr, get_set_of_all_options, \ - get_dict_option_to_primary_repr, are_all_individually_flags +from pash_annotations.datatypes.BasicDatatypes import ( + Option, + ArgStringType, + Flag, + Operand, +) +from pash_annotations.parser.parser import ( + parse, + get_set_of_all_flags, + get_dict_flag_to_primary_repr, + get_set_of_all_options, + get_dict_option_to_primary_repr, + are_all_individually_flags, +) from pash_annotations.parser.util_parser import get_json_data @@ -18,13 +29,19 @@ def merge_to_single_string_with_space(list_str): else: return " ".join(list_str) + def get_command_invocation(command, options) -> CommandInvocationInitial: command_as_string: str = format_arg_chars(command) - options_and_operands_as_string: str = merge_to_single_string_with_space([format_arg_chars(option) for option in options]) - command_invocation_as_string: str = f'{command_as_string} {options_and_operands_as_string}' + options_and_operands_as_string: str = merge_to_single_string_with_space( + [format_arg_chars(option) for option in options] + ) + command_invocation_as_string: str = ( + f"{command_as_string} {options_and_operands_as_string}" + ) command_invocation: CommandInvocationInitial = parse(command_invocation_as_string) return command_invocation + def get_ast_for_flagoption(flagoption): result = string_to_argument(flagoption.get_name()) if isinstance(flagoption, Option): @@ -32,26 +49,31 @@ def get_ast_for_flagoption(flagoption): assert False return result + def get_ast_for_argstringtype(arg): return string_to_argument(arg.get_name()) + # TODO: this is a hack to fix the wrong parsing of " def fix_parsing_newline(arg): - if arg.get_name() == '\\n': + if arg.get_name() == "\\n": return ArgStringType(r'"\n"') else: return arg -def parse_arg_list_to_command_invocation(command, flags_options_operands) -> CommandInvocationInitial: - +def parse_arg_list_to_command_invocation( + command, flags_options_operands +) -> CommandInvocationInitial: cmd_name = format_arg_chars(command) json_data = get_json_data(cmd_name) set_of_all_flags: Set[str] = get_set_of_all_flags(json_data) dict_flag_to_primary_repr: dict[str, str] = get_dict_flag_to_primary_repr(json_data) set_of_all_options: Set[str] = get_set_of_all_options(json_data) - dict_option_to_primary_repr: dict[str, str] = get_dict_option_to_primary_repr(json_data) + dict_option_to_primary_repr: dict[str, str] = get_dict_option_to_primary_repr( + json_data + ) # we keep the Arg for everything but flag and option names # parse list of command invocation terms @@ -61,20 +83,30 @@ def parse_arg_list_to_command_invocation(command, flags_options_operands) -> Com potential_flag_or_option_arg = flags_options_operands[i] potential_flag_or_option_name = format_arg_chars(potential_flag_or_option_arg) if potential_flag_or_option_name in set_of_all_flags: - flag_name_as_string: str = dict_flag_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) + flag_name_as_string: str = dict_flag_to_primary_repr.get( + potential_flag_or_option_name, potential_flag_or_option_name + ) flag: Flag = Flag(flag_name_as_string) flag_option_list.append(flag) - elif (potential_flag_or_option_name in set_of_all_options) and ((i+1) < len(flags_options_operands)): - option_name_as_string: str = dict_option_to_primary_repr.get(potential_flag_or_option_name, potential_flag_or_option_name) - option_arg_as_arg: Arg = Arg(flags_options_operands[i+1]) + elif (potential_flag_or_option_name in set_of_all_options) and ( + (i + 1) < len(flags_options_operands) + ): + option_name_as_string: str = dict_option_to_primary_repr.get( + potential_flag_or_option_name, potential_flag_or_option_name + ) + option_arg_as_arg: Arg = Arg(flags_options_operands[i + 1]) option = Option(option_name_as_string, option_arg_as_arg) flag_option_list.append(option) i += 1 # since we consumed another term for the argument - elif potential_flag_or_option_name == "-": # switch to operand mode (interpreted as hyphen-stdin) + elif ( + potential_flag_or_option_name == "-" + ): # switch to operand mode (interpreted as hyphen-stdin) break - elif are_all_individually_flags(potential_flag_or_option_name, set_of_all_flags): + elif are_all_individually_flags( + potential_flag_or_option_name, set_of_all_flags + ): for split_el in list(potential_flag_or_option_name[1:]): - flag: Flag = Flag(f'-{split_el}') + flag: Flag = Flag(f"-{split_el}") flag_option_list.append(flag) else: break # next one is Operand, and we keep these in separate list @@ -85,7 +117,9 @@ def parse_arg_list_to_command_invocation(command, flags_options_operands) -> Com # if parsed_elements_list[i] == '--': # i += 1 - operand_list = [Operand(Arg(operand_arg)) for operand_arg in flags_options_operands[i:]] + operand_list = [ + Operand(Arg(operand_arg)) for operand_arg in flags_options_operands[i:] + ] # log("type of operand_list[0].get_name()", type(operand_list[0].get_name())) can only be used if there are operands return CommandInvocationInitial(cmd_name, flag_option_list, operand_list) diff --git a/compiler/ast_to_ir.py b/compiler/ast_to_ir.py index 2fda09d92..8d6f755a4 100644 --- a/compiler/ast_to_ir.py +++ b/compiler/ast_to_ir.py @@ -8,7 +8,7 @@ from util import * from parse import from_ast_objects_to_shell -## TODO: Separate the ir stuff to the bare minimum and +## TODO: Separate the ir stuff to the bare minimum and ## try to move this to the shell_ast folder. ## @@ -24,25 +24,52 @@ ## without knowing about previous or later subtrees that can be ## distributed. Is that reasonable? compile_cases = { - "Pipe": (lambda fileIdGen, config: - lambda ast_node: compile_node_pipe(ast_node, fileIdGen, config)), - "Command": (lambda fileIdGen, config: - lambda ast_node: compile_node_command(ast_node, fileIdGen, config)), - "And": (lambda fileIdGen, config: - lambda ast_node: compile_node_and_or_semi(ast_node, fileIdGen, config)), - "Or": (lambda fileIdGen, config: - lambda ast_node: compile_node_and_or_semi(ast_node, fileIdGen, config)), - "Semi": (lambda fileIdGen, config: - lambda ast_node: compile_node_and_or_semi(ast_node, fileIdGen, config)), - "Redir": (lambda fileIdGen, config: - lambda ast_node: compile_node_redir_subshell(ast_node, fileIdGen, config)), - "Subshell": (lambda fileIdGen, config: - lambda ast_node: compile_node_redir_subshell(ast_node, fileIdGen, config)), - "Background": (lambda fileIdGen, config: - lambda ast_node: compile_node_background(ast_node, fileIdGen, config)), - "For": (lambda fileIdGen, config: - lambda ast_node: compile_node_for(ast_node, fileIdGen, config)) - } + "Pipe": ( + lambda fileIdGen, config: lambda ast_node: compile_node_pipe( + ast_node, fileIdGen, config + ) + ), + "Command": ( + lambda fileIdGen, config: lambda ast_node: compile_node_command( + ast_node, fileIdGen, config + ) + ), + "And": ( + lambda fileIdGen, config: lambda ast_node: compile_node_and_or_semi( + ast_node, fileIdGen, config + ) + ), + "Or": ( + lambda fileIdGen, config: lambda ast_node: compile_node_and_or_semi( + ast_node, fileIdGen, config + ) + ), + "Semi": ( + lambda fileIdGen, config: lambda ast_node: compile_node_and_or_semi( + ast_node, fileIdGen, config + ) + ), + "Redir": ( + lambda fileIdGen, config: lambda ast_node: compile_node_redir_subshell( + ast_node, fileIdGen, config + ) + ), + "Subshell": ( + lambda fileIdGen, config: lambda ast_node: compile_node_redir_subshell( + ast_node, fileIdGen, config + ) + ), + "Background": ( + lambda fileIdGen, config: lambda ast_node: compile_node_background( + ast_node, fileIdGen, config + ) + ), + "For": ( + lambda fileIdGen, config: lambda ast_node: compile_node_for( + ast_node, fileIdGen, config + ) + ), +} def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): @@ -51,12 +78,12 @@ def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): for i, ast_object in enumerate(ast_objects): # log("Compiling AST {}".format(i)) # log(ast_object) - assert(isinstance(ast_object, AstNode)) + assert isinstance(ast_object, AstNode) ## Compile subtrees of the AST to out intermediate representation - ## KK 2023-05-25: Would we ever want to pass this state to the expansion + ## KK 2023-05-25: Would we ever want to pass this state to the expansion ## of the next object? I don't think so. - exp_state = ExpansionState(config['shell_variables']) + exp_state = ExpansionState(config["shell_variables"]) expanded_ast = expand_command(ast_object, exp_state) # log("Expanded:", expanded_ast) compiled_ast = compile_node(expanded_ast, fileIdGen, config) @@ -67,9 +94,8 @@ def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): ## If the accumulator contains an IR (meaning that the ## previous commands where run in background), union it with ## the current returned ast. - if (not acc_ir is None): - - if (isinstance(compiled_ast, IR)): + if not acc_ir is None: + if isinstance(compiled_ast, IR): acc_ir.background_union(compiled_ast) else: ## TODO: Make this union the compiled_ast with the @@ -82,21 +108,19 @@ def compile_asts(ast_objects: "list[AstNode]", fileIdGen, config): ## If the current compiled ast not in background (and so ## the union isn't in background too), stop accumulating - if (not acc_ir is None - and not acc_ir.is_in_background()): + if not acc_ir is None and not acc_ir.is_in_background(): compiled_asts.append(acc_ir) acc_ir = None else: ## If the compiled ast is in background, start ## accumulating it - if (isinstance(compiled_ast, IR) - and compiled_ast.is_in_background()): + if isinstance(compiled_ast, IR) and compiled_ast.is_in_background(): acc_ir = compiled_ast else: compiled_asts.append(compiled_ast) ## The final accumulator - if (not acc_ir is None): + if not acc_ir is None: compiled_asts.append(acc_ir) return compiled_asts @@ -106,9 +130,11 @@ def compile_node(ast_object, fileIdGen, config): global compile_cases return ast_match(ast_object, compile_cases, fileIdGen, config) + def compile_node_pipe(ast_node, fileIdGen, config): - compiled_pipe_nodes = combine_pipe([compile_node(pipe_item, fileIdGen, config) - for pipe_item in ast_node.items]) + compiled_pipe_nodes = combine_pipe( + [compile_node(pipe_item, fileIdGen, config) for pipe_item in ast_node.items] + ) ## Note: When calling combine_pipe_nodes (which ## optimistically distributes all the children of a @@ -124,27 +150,29 @@ def compile_node_pipe(ast_node, fileIdGen, config): compiled_ast = compiled_ir return compiled_ast + ## This combines all the children of the Pipeline to an IR. def combine_pipe(ast_nodes): ## Initialize the IR with the first node in the Pipe - if (isinstance(ast_nodes[0], IR)): + if isinstance(ast_nodes[0], IR): combined_nodes = ast_nodes[0] else: ## If any part of the pipe is not an IR, the compilation must fail. log("Node: {} is not pure".format(ast_nodes[0])) - raise Exception('Not pure node in pipe') + raise Exception("Not pure node in pipe") ## Combine the rest of the nodes for ast_node in ast_nodes[1:]: - if (isinstance(ast_node, IR)): + if isinstance(ast_node, IR): combined_nodes.pipe_append(ast_node) else: ## If any part of the pipe is not an IR, the compilation must fail. log("Node: {} is not pure".format(ast_nodes)) - raise Exception('Not pure node in pipe') + raise Exception("Not pure node in pipe") return [combined_nodes] + def compile_node_command(ast_node, fileIdGen, config): ## Compile assignments and redirection list compiled_assignments = compile_assignments(ast_node.assignments, fileIdGen, config) @@ -160,10 +188,9 @@ def compile_node_command(ast_node, fileIdGen, config): try: ## If the command is not compileable to a DFG the following call will fail - ir = compile_command_to_DFG(fileIdGen, - command_name, - options, - redirections=compiled_redirections) + ir = compile_command_to_DFG( + fileIdGen, command_name, options, redirections=compiled_redirections + ) compiled_ast = ir except ValueError as err: log("Command not compiled to DFG:", err) @@ -171,37 +198,52 @@ def compile_node_command(ast_node, fileIdGen, config): ## Is there any case where a non-compiled command is fine? # log(traceback.format_exc()) compiled_arguments = compile_command_arguments(arguments, fileIdGen, config) - compiled_ast = make_kv(type(ast_node).NodeName, - [ast_node.line_number, compiled_assignments, - compiled_arguments, compiled_redirections]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ + ast_node.line_number, + compiled_assignments, + compiled_arguments, + compiled_redirections, + ], + ) return compiled_ast + def compile_node_and_or_semi(ast_node, fileIdGen, config): - compiled_ast = make_kv(type(ast_node).NodeName, - [compile_node(ast_node.left_operand, fileIdGen, config), - compile_node(ast_node.right_operand, fileIdGen, config)]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ + compile_node(ast_node.left_operand, fileIdGen, config), + compile_node(ast_node.right_operand, fileIdGen, config), + ], + ) return compiled_ast + def compile_node_redir_subshell(ast_node, fileIdGen, config): compiled_node = compile_node(ast_node.node, fileIdGen, config) - if (isinstance(compiled_node, IR)): + if isinstance(compiled_node, IR): ## TODO: I should use the redir list to redirect the files of ## the IR accordingly compiled_ast = compiled_node else: - compiled_ast = make_kv(type(ast_node).NodeName, [ast_node.line_number, - compiled_node, ast_node.redir_list]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ast_node.line_number, compiled_node, ast_node.redir_list], + ) return compiled_ast + def compile_node_background(ast_node, fileIdGen, config): compiled_node = compile_node(ast_node.node, fileIdGen, config) ## TODO: I should use the redir list to redirect the files of ## the IR accordingly - if (isinstance(compiled_node, IR)): + if isinstance(compiled_node, IR): ## TODO: Redirect the stdout, stdin accordingly compiled_node.set_background(True) compiled_ast = compiled_node @@ -218,14 +260,19 @@ def compile_node_background(ast_node, fileIdGen, config): return compiled_ast + def compile_node_for(ast_node, fileIdGen, config): ## TODO: Investigate what kind of check could we do to make a for ## loop parallel - compiled_ast = make_kv(type(ast_node).NodeName, - [ast_node.line_number, - compile_command_argument(ast_node.argument, fileIdGen, config), - compile_node(ast_node.body, fileIdGen, config), - ast_node.variable]) + compiled_ast = make_kv( + type(ast_node).NodeName, + [ + ast_node.line_number, + compile_command_argument(ast_node.argument, fileIdGen, config), + compile_node(ast_node.body, fileIdGen, config), + ast_node.variable, + ], + ) return compiled_ast @@ -238,15 +285,16 @@ def compile_node_for(ast_node, fileIdGen, config): ## 2. Second it raises an error if we cannot expand an argument. def should_expand_arg_char(arg_char): key, val = get_kv(arg_char) - if (key in ['V']): # Variable + if key in ["V"]: # Variable return True - elif (key == 'Q'): + elif key == "Q": return should_expand_argument(val) - elif (key == 'B'): + elif key == "B": log("Cannot expand:", arg_char) raise NotImplementedError() return False + def should_expand_argument(argument): return any([should_expand_arg_char(arg_char) for arg_char in argument]) @@ -255,21 +303,26 @@ def should_expand_argument(argument): def execute_shell_asts(asts): output_script = from_ast_objects_to_shell(asts) # log(output_script) - exec_obj = subprocess.run(["/usr/bin/env", "bash"], input=output_script, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - universal_newlines=True) + exec_obj = subprocess.run( + ["/usr/bin/env", "bash"], + input=output_script, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ) exec_obj.check_returncode() # log(exec_obj.stdout) return exec_obj.stdout + ## TODO: Properly parse the output of the shell script def parse_string_to_arguments(arg_char_string): # log(arg_char_string) return string_to_arguments(arg_char_string) + ## TODO: Use "pash_input_args" when expanding in place of normal arguments. def naive_expand(argument, config): - ## config contains a dictionary with: ## - all variables, their types, and values in 'shell_variables' ## - the name of a file that contains them in 'shell_variables_file_path' @@ -277,7 +330,7 @@ def naive_expand(argument, config): # log(config['shell_variables_file_path']) ## Create an AST node that "echo"s the argument - echo_asts = make_echo_ast(argument, config['shell_variables_file_path']) + echo_asts = make_echo_ast(argument, config["shell_variables_file_path"]) ## Execute the echo AST by unparsing it to shell ## and calling bash @@ -293,7 +346,6 @@ def naive_expand(argument, config): return expanded_arguments - ## This function expands an arg_char. ## At the moment it is pretty inefficient as it serves as a prototype. ## @@ -301,17 +353,17 @@ def naive_expand(argument, config): ## might have assignments of its own, therefore requiring that we use them to properly expand. def expand_command_argument(argument, config): new_arguments = [argument] - if(should_expand_argument(argument)): + if should_expand_argument(argument): new_arguments = naive_expand(argument, config) return new_arguments + ## This function compiles an arg char by recursing if it contains quotes or command substitution. ## ## It is currently being extended to also expand any arguments that are safe to expand. def compile_arg_char(arg_char: ArgChar, fileIdGen, config): ## Compile the arg char - if isinstance(arg_char, CArgChar) \ - or isinstance(arg_char, EArgChar): + if isinstance(arg_char, CArgChar) or isinstance(arg_char, EArgChar): # Single character or escape return arg_char elif isinstance(arg_char, BArgChar): @@ -326,32 +378,42 @@ def compile_arg_char(arg_char: ArgChar, fileIdGen, config): arg_char.arg = compile_command_argument(arg_char.arg, fileIdGen, config) return arg_char else: - log(f'Unknown arg_char: {arg_char}') + log(f"Unknown arg_char: {arg_char}") ## TODO: Complete this return arg_char + def compile_command_argument(argument, fileIdGen, config): compiled_argument = [compile_arg_char(char, fileIdGen, config) for char in argument] return compiled_argument + def compile_command_arguments(arguments, fileIdGen, config): - compiled_arguments = [compile_command_argument(arg, fileIdGen, config) for arg in arguments] + compiled_arguments = [ + compile_command_argument(arg, fileIdGen, config) for arg in arguments + ] return compiled_arguments + ## Compiles the value assigned to a variable using the command argument rules. ## TODO: Is that the correct way to handle them? def compile_assignments(assignments, fileIdGen, config): - compiled_assignments = [[assignment[0], compile_command_argument(assignment[1], fileIdGen, config)] - for assignment in assignments] + compiled_assignments = [ + [assignment[0], compile_command_argument(assignment[1], fileIdGen, config)] + for assignment in assignments + ] return compiled_assignments + def compile_redirection(redirection, fileIdGen, config): file_arg = compile_command_argument(redirection.arg, fileIdGen, config) redirection.arg = file_arg return redirection + def compile_redirections(redirections, fileIdGen, config): - compiled_redirections = [compile_redirection(redirection, fileIdGen, config) - for redirection in redirections] + compiled_redirections = [ + compile_redirection(redirection, fileIdGen, config) + for redirection in redirections + ] return compiled_redirections - diff --git a/compiler/config.py b/compiler/config.py index c6a9c662b..e8276bd9a 100644 --- a/compiler/config.py +++ b/compiler/config.py @@ -7,22 +7,34 @@ from util import * ## Global -__version__ = "0.12.2" # FIXME add libdash version -GIT_TOP_CMD = [ 'git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree'] -if 'PASH_TOP' in os.environ: - PASH_TOP = os.environ['PASH_TOP'] +__version__ = "0.12.2" # FIXME add libdash version +GIT_TOP_CMD = [ + "git", + "rev-parse", + "--show-toplevel", + "--show-superproject-working-tree", +] +if "PASH_TOP" in os.environ: + PASH_TOP = os.environ["PASH_TOP"] else: - PASH_TOP = subprocess.run(GIT_TOP_CMD, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True).stdout.rstrip() + PASH_TOP = subprocess.run( + GIT_TOP_CMD, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + ).stdout.rstrip() PYTHON_VERSION = "python3" PLANNER_EXECUTABLE = os.path.join(PASH_TOP, "compiler/pash_compiler.py") RUNTIME_EXECUTABLE = os.path.join(PASH_TOP, "compiler/pash_runtime.sh") SAVE_ARGS_EXECUTABLE = os.path.join(PASH_TOP, "runtime/save_args.sh") -SAVE_SHELL_STATE_EXECUTABLE = os.path.join(PASH_TOP, "compiler/orchestrator_runtime/save_shell_state.sh") +SAVE_SHELL_STATE_EXECUTABLE = os.path.join( + PASH_TOP, "compiler/orchestrator_runtime/save_shell_state.sh" +) ## Ensure that PASH_TMP_PREFIX is set by pa.sh -assert(not os.getenv('PASH_TMP_PREFIX') is None) -PASH_TMP_PREFIX = os.getenv('PASH_TMP_PREFIX') +assert not os.getenv("PASH_TMP_PREFIX") is None +PASH_TMP_PREFIX = os.getenv("PASH_TMP_PREFIX") SOCKET_BUF_SIZE = 8192 @@ -60,9 +72,11 @@ def set_config_globals_from_pash_args(given_pash_args): if given_pash_args.log_file == "": logging.basicConfig(format="%(message)s") else: - logging.basicConfig(format="%(message)s", - filename=f"{os.path.abspath(given_pash_args.log_file)}", - filemode="w") + logging.basicConfig( + format="%(message)s", + filename=f"{os.path.abspath(given_pash_args.log_file)}", + filemode="w", + ) # Set debug level if given_pash_args.debug == 1: @@ -70,162 +84,226 @@ def set_config_globals_from_pash_args(given_pash_args): elif given_pash_args.debug >= 2: logging.getLogger().setLevel(logging.DEBUG) + ## Increase the recursion limit (it seems that the parser/unparser needs it for bigger graphs) sys.setrecursionlimit(10000) + def load_config(config_file_path=""): global config pash_config = {} - CONFIG_KEY = 'distr_planner' + CONFIG_KEY = "distr_planner" - if(config_file_path == ""): - config_file_path = '{}/compiler/config.json'.format(PASH_TOP) + if config_file_path == "": + config_file_path = "{}/compiler/config.json".format(PASH_TOP) with open(config_file_path) as config_file: pash_config = json.load(config_file) if not pash_config: - raise Exception('No valid configuration could be loaded from {}'.format(config_file_path)) + raise Exception( + "No valid configuration could be loaded from {}".format(config_file_path) + ) if CONFIG_KEY not in pash_config: - raise Exception('Missing `{}` config in {}'.format(CONFIG_KEY, config_file_path)) + raise Exception( + "Missing `{}` config in {}".format(CONFIG_KEY, config_file_path) + ) config = pash_config + def getWidth(): cpus = os.cpu_count() return math.floor(cpus / 8) if cpus >= 16 else 2 + def add_general_config_arguments(parser): ## TODO: Delete that at some point, or make it have a different use (e.g., outputting time even without -d 1). - parser.add_argument("-t", "--output_time", #FIXME: --time - help="(obsolete, time is always logged now) output the time it took for every step", - action="store_true") - parser.add_argument("-d", "--debug", - type=int, - help="configure debug level; defaults to 0", - default=0) - parser.add_argument("--log_file", - help="configure where to write the log; defaults to stderr.", - default="") + parser.add_argument( + "-t", + "--output_time", # FIXME: --time + help="(obsolete, time is always logged now) output the time it took for every step", + action="store_true", + ) + parser.add_argument( + "-d", + "--debug", + type=int, + help="configure debug level; defaults to 0", + default=0, + ) + parser.add_argument( + "--log_file", + help="configure where to write the log; defaults to stderr.", + default="", + ) + ## These are arguments that are common to pash.py and pash_compiler.py def add_common_arguments(parser): add_general_config_arguments(parser) - parser.add_argument("-w", "--width", - type=int, - default=getWidth(), - help="set data-parallelism factor") - parser.add_argument("--no_optimize", - help="not apply transformations over the DFG", - action="store_true") - parser.add_argument("--dry_run_compiler", - help="not execute the compiled script, even if the compiler succeeded", - action="store_true") - parser.add_argument("--assert_compiler_success", - help="assert that the compiler succeeded (used to make tests more robust)", - action="store_true") - parser.add_argument("--avoid_pash_runtime_completion", - help="avoid the pash_runtime execution completion (only relevant when --debug > 0)", - action="store_true") - parser.add_argument("--profile_driven", - help="(experimental) use profiling information when optimizing", - action="store_true") - parser.add_argument("-p", "--output_optimized", # FIXME: --print - help="output the parallel shell script for inspection", - action="store_true") - parser.add_argument("--graphviz", - help="generates graphical representations of the dataflow graphs. The option argument corresponds to the format. PaSh stores them in a timestamped directory in the argument of --graphviz_dir", - choices=["no", "dot", "svg", "pdf", "png"], - default="no") - ## TODO: To discuss: Do we maybe want to have graphviz to always be included + parser.add_argument( + "-w", + "--width", + type=int, + default=getWidth(), + help="set data-parallelism factor", + ) + parser.add_argument( + "--no_optimize", + help="not apply transformations over the DFG", + action="store_true", + ) + parser.add_argument( + "--dry_run_compiler", + help="not execute the compiled script, even if the compiler succeeded", + action="store_true", + ) + parser.add_argument( + "--assert_compiler_success", + help="assert that the compiler succeeded (used to make tests more robust)", + action="store_true", + ) + parser.add_argument( + "--avoid_pash_runtime_completion", + help="avoid the pash_runtime execution completion (only relevant when --debug > 0)", + action="store_true", + ) + parser.add_argument( + "--profile_driven", + help="(experimental) use profiling information when optimizing", + action="store_true", + ) + parser.add_argument( + "-p", + "--output_optimized", # FIXME: --print + help="output the parallel shell script for inspection", + action="store_true", + ) + parser.add_argument( + "--graphviz", + help="generates graphical representations of the dataflow graphs. The option argument corresponds to the format. PaSh stores them in a timestamped directory in the argument of --graphviz_dir", + choices=["no", "dot", "svg", "pdf", "png"], + default="no", + ) + ## TODO: To discuss: Do we maybe want to have graphviz to always be included ## in the temp directory (under a graphviz subdirectory) instead of in its own? - ## kk: I think that ideally we want a log-directory where we can put logs, graphviz, + ## kk: I think that ideally we want a log-directory where we can put logs, graphviz, ## and other observability and monitoring info (instead of putting them in the temp). - parser.add_argument("--graphviz_dir", - help="the directory in which to store graphical representations", - default="/tmp") - parser.add_argument("--no_eager", - help="(experimental) disable eager nodes before merging nodes", - action="store_true") - parser.add_argument("--no_daemon", - help="(obsolete) does nothing -- Run the compiler everytime we need a compilation instead of using the daemon", - action="store_true", - default=False) - parser.add_argument("--parallel_pipelines", - help="Run multiple pipelines in parallel if they are safe to run", - action="store_true", - default=False) - parser.add_argument("--r_split_batch_size", - type=int, - help="configure the batch size of r_split (default: 1MB)", - default=1000000) - parser.add_argument("--r_split", - help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", - action="store_true") - parser.add_argument("--dgsh_tee", - help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", - action="store_true") - parser.add_argument("--speculative", - help="(experimental) use the speculative execution preprocessing and runtime (NOTE: this has nothing to do with --speculation, which is actually misnamed, and should be named concurrent compilation/execution and is now obsolete)", - action="store_true", - default=False) + parser.add_argument( + "--graphviz_dir", + help="the directory in which to store graphical representations", + default="/tmp", + ) + parser.add_argument( + "--no_eager", + help="(experimental) disable eager nodes before merging nodes", + action="store_true", + ) + parser.add_argument( + "--no_daemon", + help="(obsolete) does nothing -- Run the compiler everytime we need a compilation instead of using the daemon", + action="store_true", + default=False, + ) + parser.add_argument( + "--parallel_pipelines", + help="Run multiple pipelines in parallel if they are safe to run", + action="store_true", + default=False, + ) + parser.add_argument( + "--r_split_batch_size", + type=int, + help="configure the batch size of r_split (default: 1MB)", + default=1000000, + ) + parser.add_argument( + "--r_split", + help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", + action="store_true", + ) + parser.add_argument( + "--dgsh_tee", + help="(obsolete) does nothing -- only here for old interfaces (not used anywhere in the code)", + action="store_true", + ) + parser.add_argument( + "--speculative", + help="(experimental) use the speculative execution preprocessing and runtime (NOTE: this has nothing to do with --speculation, which is actually misnamed, and should be named concurrent compilation/execution and is now obsolete)", + action="store_true", + default=False, + ) ## This is misnamed, it should be named concurrent compilation/execution - parser.add_argument("--speculation", - help="(obsolete) does nothing -- run the original script during compilation; if compilation succeeds, abort the original and run only the parallel (quick_abort) (Default: no_spec)", - choices=['no_spec', 'quick_abort'], - default='no_spec') - parser.add_argument("--termination", - help="(experimental) determine the termination behavior of the DFG. Defaults to cleanup after the last process dies, but can drain all streams until depletion", - choices=['clean_up_graph', 'drain_stream'], - default="clean_up_graph") - parser.add_argument("--daemon_communicates_through_unix_pipes", - help="(experimental) the daemon communicates through unix pipes instead of sockets", - action="store_true") - parser.add_argument("--distributed_exec", - help="(experimental) execute the script in a distributed environment. Remote machines should be configured and ready", - action="store_true", - default=False) - parser.add_argument("--config_path", - help="determines the config file path. By default it is 'PASH_TOP/compiler/config.yaml'.", - default="") - parser.add_argument("--version", - action='version', - version='%(prog)s {version}'.format(version=__version__)) + parser.add_argument( + "--speculation", + help="(obsolete) does nothing -- run the original script during compilation; if compilation succeeds, abort the original and run only the parallel (quick_abort) (Default: no_spec)", + choices=["no_spec", "quick_abort"], + default="no_spec", + ) + parser.add_argument( + "--termination", + help="(experimental) determine the termination behavior of the DFG. Defaults to cleanup after the last process dies, but can drain all streams until depletion", + choices=["clean_up_graph", "drain_stream"], + default="clean_up_graph", + ) + parser.add_argument( + "--daemon_communicates_through_unix_pipes", + help="(experimental) the daemon communicates through unix pipes instead of sockets", + action="store_true", + ) + parser.add_argument( + "--distributed_exec", + help="(experimental) execute the script in a distributed environment. Remote machines should be configured and ready", + action="store_true", + default=False, + ) + parser.add_argument( + "--config_path", + help="determines the config file path. By default it is 'PASH_TOP/compiler/config.yaml'.", + default="", + ) + parser.add_argument( + "--version", + action="version", + version="%(prog)s {version}".format(version=__version__), + ) return + def pass_common_arguments(pash_arguments): arguments = [] - if (pash_arguments.no_optimize): + if pash_arguments.no_optimize: arguments.append("--no_optimize") - if (pash_arguments.dry_run_compiler): + if pash_arguments.dry_run_compiler: arguments.append("--dry_run_compiler") - if (pash_arguments.assert_compiler_success): + if pash_arguments.assert_compiler_success: arguments.append("--assert_compiler_success") - if (pash_arguments.avoid_pash_runtime_completion): + if pash_arguments.avoid_pash_runtime_completion: arguments.append("--avoid_pash_runtime_completion") - if (pash_arguments.profile_driven): + if pash_arguments.profile_driven: arguments.append("--profile_driven") - if (pash_arguments.output_time): + if pash_arguments.output_time: arguments.append("--output_time") - if (pash_arguments.output_optimized): + if pash_arguments.output_optimized: arguments.append("--output_optimized") arguments.append("--graphviz") arguments.append(pash_arguments.graphviz) arguments.append("--graphviz_dir") arguments.append(pash_arguments.graphviz_dir) - if(not pash_arguments.log_file == ""): + if not pash_arguments.log_file == "": arguments.append("--log_file") arguments.append(pash_arguments.log_file) - if (pash_arguments.no_eager): + if pash_arguments.no_eager: arguments.append("--no_eager") - if (pash_arguments.distributed_exec): + if pash_arguments.distributed_exec: arguments.append("--distributed_exec") - if (pash_arguments.speculative): + if pash_arguments.speculative: arguments.append("--speculative") - if (pash_arguments.parallel_pipelines): + if pash_arguments.parallel_pipelines: arguments.append("--parallel_pipelines") - if (pash_arguments.daemon_communicates_through_unix_pipes): + if pash_arguments.daemon_communicates_through_unix_pipes: arguments.append("--daemon_communicates_through_unix_pipes") arguments.append("--r_split_batch_size") arguments.append(str(pash_arguments.r_split_batch_size)) @@ -235,14 +313,15 @@ def pass_common_arguments(pash_arguments): arguments.append(pash_arguments.termination) arguments.append("--width") arguments.append(str(pash_arguments.width)) - if(not pash_arguments.config_path == ""): + if not pash_arguments.config_path == "": arguments.append("--config_path") arguments.append(pash_arguments.config_path) return arguments + def init_log_file(): global LOG_FILE - if(not LOG_FILE == ""): + if not LOG_FILE == "": with open(LOG_FILE, "w") as f: pass @@ -251,7 +330,8 @@ def init_log_file(): ## Set the shell variables ## + def set_vars_file(var_file_path: str, var_dict: dict): - global config - config['shell_variables'] = var_dict - config['shell_variables_file_path'] = var_file_path + global config + config["shell_variables"] = var_dict + config["shell_variables_file_path"] = var_file_path diff --git a/compiler/definitions/ir/aggregator_node.py b/compiler/definitions/ir/aggregator_node.py index 125ce46db..a99f1e7b5 100644 --- a/compiler/definitions/ir/aggregator_node.py +++ b/compiler/definitions/ir/aggregator_node.py @@ -1,54 +1,80 @@ from definitions.ir.dfg_node import * + # from definitions.ir.nodes.arg import Arg -from annotations_utils.util_cmd_invocations import get_command_invocation_prefix_from_dfg_node +from annotations_utils.util_cmd_invocations import ( + get_command_invocation_prefix_from_dfg_node, +) ## This class corresponds to a generic n-ary aggregator ## ## TODO: Do we need to do anything special for binary aggregators? class MapperAggregatorNode(DFGNode): - def __init__(self, old_node, input_ids, output_ids, name_string, new_options, flag_option_list): - + def __init__( + self, + old_node, + input_ids, + output_ids, + name_string, + new_options, + flag_option_list, + ): ## The name of the aggregator command name = Arg.string_to_arg(name_string) ## TODO: The category should also be acquired through annotations (and maybe should be asserted to be at most pure) - com_category="pure" + com_category = "pure" ## TODO: Not sure if redirections need to be copied to new function. com_redirs = [redir.to_ast() for redir in old_node.com_redirs] - super().__init__(input_ids, - output_ids, - name, - com_category, - com_options=new_options, # changed that all are already in there and not appended - flag_option_list=flag_option_list, - com_redirs=com_redirs, - com_assignments=old_node.com_assignments) + super().__init__( + input_ids, + output_ids, + name, + com_category, + com_options=new_options, # changed that all are already in there and not appended + flag_option_list=flag_option_list, + com_redirs=com_redirs, + com_assignments=old_node.com_assignments, + ) class AggregatorNode(MapperAggregatorNode): def __init__(self, old_node, input_ids, output_ids): - used_parallelizer = old_node.get_used_parallelizer() cmd_inv_pref = get_command_invocation_prefix_from_dfg_node(old_node) used_aggregator = used_parallelizer.get_actual_aggregator(cmd_inv_pref) - log(f'used_agg: {used_aggregator}') - log(f'old_node: {old_node}') + log(f"used_agg: {used_aggregator}") + log(f"old_node: {old_node}") ## Check if an aggregator can be instantiated from the node - if(used_aggregator is None): - log("Error: Node:", old_node, "does not contain information to instantiate an aggregator!") - raise Exception('No information to instantiate aggregator') + if used_aggregator is None: + log( + "Error: Node:", + old_node, + "does not contain information to instantiate an aggregator!", + ) + raise Exception("No information to instantiate aggregator") ## The name of the aggregator command agg_name_string = used_aggregator.cmd_name - all_options_incl_new = [Arg.string_to_arg(el.get_name()) for el in used_aggregator.flag_option_list + used_aggregator.positional_config_list] + all_options_incl_new = [ + Arg.string_to_arg(el.get_name()) + for el in used_aggregator.flag_option_list + + used_aggregator.positional_config_list + ] # TODO: zip is nicer - all_options_incl_new_right_format = [(i, all_options_incl_new[i]) for i in range(len(all_options_incl_new))] + all_options_incl_new_right_format = [ + (i, all_options_incl_new[i]) for i in range(len(all_options_incl_new)) + ] - super().__init__(old_node, input_ids, output_ids, agg_name_string, all_options_incl_new_right_format, - flag_option_list=used_aggregator.flag_option_list) + super().__init__( + old_node, + input_ids, + output_ids, + agg_name_string, + all_options_incl_new_right_format, + flag_option_list=used_aggregator.flag_option_list, + ) log("Generic Aggregator Created:", self) - diff --git a/compiler/definitions/ir/arg.py b/compiler/definitions/ir/arg.py index 41fcafc6a..9cf83037b 100644 --- a/compiler/definitions/ir/arg.py +++ b/compiler/definitions/ir/arg.py @@ -3,37 +3,43 @@ from shell_ast.ast_util import * from util import * + class Arg: arg_char_list: "list[ArgChar]" def __init__(self, arg_char_list: "list[ArgChar]"): - assert(not isinstance(arg_char_list, Arg)) + assert not isinstance(arg_char_list, Arg) for arg_char in arg_char_list: - assert(isinstance(arg_char, ArgChar)) + assert isinstance(arg_char, ArgChar) self.arg_char_list = arg_char_list def __repr__(self): return format_arg_chars(self.arg_char_list) def __eq__(self, other): - if(isinstance(other, Arg)): + if isinstance(other, Arg): return self.arg_char_list == other.arg_char_list - log("Warning: Comparing Arg:", self, "with a non Arg argument:", other, "of type:", type(other)) + log( + "Warning: Comparing Arg:", + self, + "with a non Arg argument:", + other, + "of type:", + type(other), + ) return False def opt_serialize(self): return self.__repr__() - + def to_ast(self): return self.arg_char_list def concatenate(self, other): - space = [CArgChar(32)] # space + space = [CArgChar(32)] # space self.arg_char_list.extend(space) self.arg_char_list.extend(other.arg_char_list) @staticmethod def string_to_arg(string: str) -> Arg: return Arg(string_to_carg_char_list(string)) - - diff --git a/compiler/definitions/ir/dfg_node.py b/compiler/definitions/ir/dfg_node.py index 7259a29af..304355d7c 100644 --- a/compiler/definitions/ir/dfg_node.py +++ b/compiler/definitions/ir/dfg_node.py @@ -2,9 +2,16 @@ from definitions.ir.redirection import * from definitions.ir.resource import * -from annotations_utils.util_cmd_invocations import to_node_cmd_inv_with_io_vars, construct_property_container_from_list_of_properties +from annotations_utils.util_cmd_invocations import ( + to_node_cmd_inv_with_io_vars, + construct_property_container_from_list_of_properties, +) + +from util import ( + return_empty_list_if_none_else_itself, + return_default_if_none_else_itself, +) -from util import return_empty_list_if_none_else_itself, return_default_if_none_else_itself ## Assumption: Everything related to a DFGNode must be already expanded. ## TODO: Ensure that this is true with assertions @@ -17,13 +24,14 @@ class DFGNode: ## com_assignments : list of assignments ## parallelizer_list : list of parallelizers for this DFGNode ## cmd_related_properties : dict to store properties like commutativity - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs = [], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None, - ): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default parameters! ## @KK: can this be deleted? Was there another id in the member attributes before? @@ -34,9 +42,15 @@ def __init__(self, self.com_redirs = [Redirection(redirection) for redirection in com_redirs] self.com_assignments = com_assignments - self.parallelizer_list = return_empty_list_if_none_else_itself(parallelizer_list) - default_cmd_properties = construct_property_container_from_list_of_properties([]) - self.cmd_related_properties = return_default_if_none_else_itself(cmd_related_properties, default_cmd_properties) + self.parallelizer_list = return_empty_list_if_none_else_itself( + parallelizer_list + ) + default_cmd_properties = construct_property_container_from_list_of_properties( + [] + ) + self.cmd_related_properties = return_default_if_none_else_itself( + cmd_related_properties, default_cmd_properties + ) self.cmd_invocation_with_io_vars = cmd_invocation_with_io_vars # log("Node created:", self.id, self) @@ -57,7 +71,6 @@ def get_dot_label(self) -> str: basename = os.path.basename(str(name)) return basename - def get_id(self): return self.id @@ -84,20 +97,19 @@ def get_configuration_inputs(self): return inputs.get_config_inputs() def is_commutative(self): - val = self.cmd_related_properties.get_property_value('is_commutative') + val = self.cmd_related_properties.get_property_value("is_commutative") if val is not None: return val else: return False - ## Auxiliary method that returns any necessary redirections, ## at the moment it doesn't look necessary. def _to_ast_aux_get_redirs(self): ## still used in to_ast ## TODO: Properly handle redirections ## - ## TODO: If one of the redirected outputs or inputs is changed in the IR + ## TODO: If one of the redirected outputs or inputs is changed in the IR ## (e.g. `cat < s1` was changed to read from an ephemeral file `cat < "#file5"`) ## this needs to be changed in the redirections too. Maybe we can modify redirections ## when replacing fid. @@ -111,7 +123,6 @@ def _to_ast_aux_get_redirs(self): ## where we recreate arguments and redirections). return [] - ## TODO: Improve this function to be separately implemented for different special nodes, ## such as cat, eager, split, etc... ## I do not think this separation is reasonable anymore since we remodelled nodes in a way that the back-translation is trivial @@ -120,7 +131,7 @@ def _to_ast_aux_get_redirs(self): ## hence assumes that non-streaming inputs/outputs will not change; with a special to_ast, we could circumvent this def to_ast(self, edges, drain_streams): ## TODO: We might not want to implement this at all actually - if (drain_streams): + if drain_streams: raise NotImplementedError() else: # commented since "see above" @@ -132,7 +143,9 @@ def to_ast(self, edges, drain_streams): redirs = self._to_ast_aux_get_redirs() assignments = self.com_assignments - node = to_node_cmd_inv_with_io_vars(self.cmd_invocation_with_io_vars, edges, redirs, assignments) + node = to_node_cmd_inv_with_io_vars( + self.cmd_invocation_with_io_vars, edges, redirs, assignments + ) # TODO: think about redirections # old code for this: # rest_argument_fids, new_redirs = create_command_arguments_redirs(com_name_ast, @@ -157,30 +170,34 @@ def apply_redirections(self, edges): unhandled_redirs = [] for redirection in self.com_redirs: ## Handle To redirections that have to do with stdout - if (redirection.is_to_file() and redirection.is_for_stdout()): + if redirection.is_to_file() and redirection.is_for_stdout(): # log(redirection) file_resource = FileResource(redirection.file_arg) success = False for i in range(len(self.get_output_list())): output_edge_id = self.get_output_list()[i] output_fid = edges[output_edge_id][0] - if(output_fid.has_file_descriptor_resource() - and output_fid.resource.is_stdout()): + if ( + output_fid.has_file_descriptor_resource() + and output_fid.resource.is_stdout() + ): success = True edges[output_edge_id][0].set_resource(file_resource) # self.outputs[i].set_resource(file_resource) - assert(success) - elif (redirection.is_from_file() and redirection.is_for_stdin()): + assert success + elif redirection.is_from_file() and redirection.is_for_stdin(): # log(redirection) file_resource = FileResource(redirection.file_arg) success = False for input_edge_id in self.get_input_list(): input_fid = edges[input_edge_id][0] - if(input_fid.has_file_descriptor_resource() - and input_fid.resource.is_stdin()): + if ( + input_fid.has_file_descriptor_resource() + and input_fid.resource.is_stdin() + ): success = True edges[input_edge_id][0].set_resource(file_resource) - assert(success) + assert success else: log("Warning -- Unhandled redirection:", redirection) unhandled_redirs.append(redirection) @@ -188,7 +205,6 @@ def apply_redirections(self, edges): ## Does it make any sense to keep them and have them in the Final AST. raise NotImplementedError() - ## This renames the from_id (wherever it exists in inputs or outputs) ## to the to_id. ## @@ -202,7 +218,7 @@ def replace_edge(self, from_id, to_id): def replace_edge_in_list(self, edge_ids, from_id, to_id): new_edge_ids = [] for id in edge_ids: - if(id == from_id): + if id == from_id: new_edge_id = to_id else: new_edge_id = id @@ -212,22 +228,30 @@ def replace_edge_in_list(self, edge_ids, from_id, to_id): def get_option_implemented_round_robin_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - if splitter.is_splitter_round_robin() and parallelizer.are_all_parts_implemented(): + if ( + splitter.is_splitter_round_robin() + and parallelizer.are_all_parts_implemented() + ): return parallelizer return None def get_option_implemented_round_robin_with_unwrap_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - if splitter.is_splitter_round_robin_with_unwrap_flag() and parallelizer.are_all_parts_implemented(): + if ( + splitter.is_splitter_round_robin_with_unwrap_flag() + and parallelizer.are_all_parts_implemented() + ): return parallelizer return None - def get_option_implemented_consecutive_chunks_parallelizer(self): for parallelizer in self.parallelizer_list: splitter = parallelizer.get_splitter() - if splitter.is_splitter_consec_chunks() and parallelizer.are_all_parts_implemented(): + if ( + splitter.is_splitter_consec_chunks() + and parallelizer.are_all_parts_implemented() + ): return parallelizer return None @@ -235,13 +259,15 @@ def get_option_implemented_consecutive_chunks_parallelizer(self): def make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_with_io_vars): return DFGNode(cmd_inv_with_io_vars) - def get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization(self): + def get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization( + self, + ): streaming_inputs = self.get_streaming_inputs() - assert (len(streaming_inputs) == 1) + assert len(streaming_inputs) == 1 streaming_input = streaming_inputs[0] configuration_inputs = self.get_configuration_inputs() - assert (len(configuration_inputs) == 0) + assert len(configuration_inputs) == 0 streaming_outputs = self.get_output_list() - assert (len(streaming_outputs) == 1) + assert len(streaming_outputs) == 1 streaming_output = streaming_outputs[0] return streaming_input, streaming_output, configuration_inputs diff --git a/compiler/definitions/ir/file_id.py b/compiler/definitions/ir/file_id.py index ecee07ec0..e3d8eef99 100644 --- a/compiler/definitions/ir/file_id.py +++ b/compiler/definitions/ir/file_id.py @@ -7,6 +7,7 @@ from definitions.ir.resource import * + ## Note: The NULL ident is considered to be the default unknown file id ## ## TODO: WARNING: We have to make sure that a resource in our IR can @@ -29,19 +30,19 @@ def __init__(self, ident, prefix="", resource=None): self.prefix = prefix ## TODO: Remove all union_find ## Initialize the parent - self.resource=resource + self.resource = resource def __repr__(self): - if(isinstance(self.resource, EphemeralResource)): + if isinstance(self.resource, EphemeralResource): output = self.get_fifo_suffix() else: output = "fid:{}:{}".format(self.ident, self.resource) return output def serialize(self): - if(isinstance(self.resource, TemporaryFileResource)): + if isinstance(self.resource, TemporaryFileResource): output = self.get_temporary_file_suffix() - elif(isinstance(self.resource, EphemeralResource)): + elif isinstance(self.resource, EphemeralResource): output = self.get_fifo_suffix() else: output = "{}".format(self.resource) @@ -73,17 +74,17 @@ def to_ast(self, stdin_dash=False): ## check if a file id refers to a pipe ## ## TODO: I am not sure about the FileDescriptor resource - if(isinstance(self.resource, TemporaryFileResource)): + if isinstance(self.resource, TemporaryFileResource): suffix = self.get_temporary_file_suffix() string = os.path.join(config.PASH_TMP_PREFIX, suffix) argument = string_to_argument(string) - elif(isinstance(self.resource, EphemeralResource)): + elif isinstance(self.resource, EphemeralResource): suffix = self.get_fifo_suffix() - string = os.path.join(config.PASH_TMP_PREFIX, suffix) + string = os.path.join(config.PASH_TMP_PREFIX, suffix) ## Quote the argument - argument = [make_kv('Q', string_to_argument(string))] - elif(isinstance(self.resource, FileDescriptorResource)): - if (self.resource.is_stdin() and stdin_dash): + argument = [make_kv("Q", string_to_argument(string))] + elif isinstance(self.resource, FileDescriptorResource): + if self.resource.is_stdin() and stdin_dash: argument = string_to_argument("-") else: raise NotImplementedError() @@ -97,7 +98,7 @@ def set_resource(self, resource): ## The file resource cannot be reset. A pointer can never point to ## more than one file resource. However, we can change an ephemeral ## resource or a file_descriptor resource. - assert(not self.has_file_resource()) + assert not self.has_file_resource() self.resource = resource def get_resource(self): @@ -105,19 +106,19 @@ def get_resource(self): ## Remove this def has_resource(self): - return (not self.resource is None) + return not self.resource is None def has_file_resource(self): - return (isinstance(self.resource, FileResource)) + return isinstance(self.resource, FileResource) def has_file_descriptor_resource(self): - return (isinstance(self.resource, FileDescriptorResource)) + return isinstance(self.resource, FileDescriptorResource) def has_remote_file_resource(self): return isinstance(self.resource, RemoteFileResource) def is_ephemeral(self): - return (isinstance(self.resource, EphemeralResource)) + return isinstance(self.resource, EphemeralResource) def make_temporary_file(self): self.resource = TemporaryFileResource() diff --git a/compiler/definitions/ir/nodes/cat.py b/compiler/definitions/ir/nodes/cat.py index 675b3880b..ced0cb2b7 100644 --- a/compiler/definitions/ir/nodes/cat.py +++ b/compiler/definitions/ir/nodes/cat.py @@ -1,6 +1,11 @@ -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import DFGNode + def make_cat_node(inputs, output): - cmd_inv_cat = CommandInvocationWithIOVars.make_cat_command_invocation_with_io_vars(inputs, output) + cmd_inv_cat = CommandInvocationWithIOVars.make_cat_command_invocation_with_io_vars( + inputs, output + ) return DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(cmd_inv_cat) diff --git a/compiler/definitions/ir/nodes/dfs_split_reader.py b/compiler/definitions/ir/nodes/dfs_split_reader.py index 63855e325..73343ae7d 100644 --- a/compiler/definitions/ir/nodes/dfs_split_reader.py +++ b/compiler/definitions/ir/nodes/dfs_split_reader.py @@ -1,28 +1,40 @@ import os from definitions.ir.dfg_node import * + class DFSSplitReader(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__( + self, + inputs, + outputs, + com_name, + com_category, + com_options=[], + com_redirs=[], + com_assignments=[], + ): + super().__init__( + inputs, + outputs, + com_name, + com_category, + com_options=com_options, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) + + def set_server_address(self, addr): # ex addr: 127.0.0.1:50051 + self.com_options.append((3, Arg.string_to_arg(f"--addr {addr}"))) - def set_server_address(self, addr): # ex addr: 127.0.0.1:50051 - self.com_options.append((3, Arg.string_to_arg(f"--addr {addr}"))) def make_dfs_split_reader_node(inputs, output, split_num, prefix): - split_reader_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dfs_split_reader_binary']) + split_reader_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["dfs_split_reader_binary"] + ) com_name = Arg.string_to_arg(split_reader_bin) com_category = "pure" options = [] options.append((1, Arg.string_to_arg(f"--prefix '{prefix}'"))) options.append((2, Arg.string_to_arg(f"--split {split_num}"))) - return DFSSplitReader(inputs, - [output], - com_name, - com_category, - options) + return DFSSplitReader(inputs, [output], com_name, com_category, options) diff --git a/compiler/definitions/ir/nodes/dgsh_tee.py b/compiler/definitions/ir/nodes/dgsh_tee.py index 16bd5efff..d74ab11dc 100644 --- a/compiler/definitions/ir/nodes/dgsh_tee.py +++ b/compiler/definitions/ir/nodes/dgsh_tee.py @@ -1,32 +1,43 @@ from pash_annotations.datatypes.AccessKind import make_stream_output, make_stream_input from pash_annotations.datatypes.BasicDatatypes import Flag, ArgStringType from pash_annotations.datatypes.BasicDatatypesWithIO import OptionWithIO -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from annotations_utils.util_cmd_invocations import to_ast_flagoption, to_ast_operand from definitions.ir.dfg_node import * + class DGSHTee(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], com_assignments=[] - ): + def __init__(self, cmd_invocation_with_io_vars, com_redirs=[], com_assignments=[]): # TODO []: default - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) + def make_dgsh_tee_node(input_id, output_id): - dgsh_tee_bin = os.path.join(config.PASH_TOP, config.config['runtime']['dgsh_tee_binary']) + dgsh_tee_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["dgsh_tee_binary"] + ) - access_map = {output_id: make_stream_output(), - input_id: make_stream_input()} + access_map = {output_id: make_stream_output(), input_id: make_stream_input()} - flag_option_list = [OptionWithIO("-i", input_id), - OptionWithIO("-o", output_id), - Flag("-I"), - Flag("-f"), - OptionWithIO("-b", ArgStringType(Arg.string_to_arg(str(config.config['runtime']['dgsh_buffer_size']))))] + flag_option_list = [ + OptionWithIO("-i", input_id), + OptionWithIO("-o", output_id), + Flag("-I"), + Flag("-f"), + OptionWithIO( + "-b", + ArgStringType( + Arg.string_to_arg(str(config.config["runtime"]["dgsh_buffer_size"])) + ), + ), + ] cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=dgsh_tee_bin, @@ -34,5 +45,6 @@ def make_dgsh_tee_node(input_id, output_id): operand_list=[], implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, - access_map=access_map) + access_map=access_map, + ) return DGSHTee(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/eager.py b/compiler/definitions/ir/nodes/eager.py index 73643768b..a56ac02bc 100644 --- a/compiler/definitions/ir/nodes/eager.py +++ b/compiler/definitions/ir/nodes/eager.py @@ -1,31 +1,41 @@ -from pash_annotations.datatypes.AccessKind import AccessKind, make_stream_output, make_stream_input, make_other_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.AccessKind import ( + AccessKind, + make_stream_output, + make_stream_input, + make_other_output, +) +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import * + class Eager(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], com_assignments=[] - ): + def __init__(self, cmd_invocation_with_io_vars, com_redirs=[], com_assignments=[]): # TODO []: default - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) def make_eager_node(input_id, output_id, intermediate_file_id, eager_exec_path): eager_name = eager_exec_path intermediate_file_id_id = intermediate_file_id.get_ident() operand_list = [input_id, output_id, intermediate_file_id_id] - access_map = {output_id: make_stream_output(), - input_id: make_stream_input(), - intermediate_file_id_id: make_other_output()} + access_map = { + output_id: make_stream_output(), + input_id: make_stream_input(), + intermediate_file_id_id: make_other_output(), + } cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=eager_name, flag_option_list=[], operand_list=operand_list, implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, - access_map=access_map) + access_map=access_map, + ) return Eager(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/hdfs_cat.py b/compiler/definitions/ir/nodes/hdfs_cat.py index 3fe81012c..3d4c6f5f4 100644 --- a/compiler/definitions/ir/nodes/hdfs_cat.py +++ b/compiler/definitions/ir/nodes/hdfs_cat.py @@ -1,11 +1,25 @@ from definitions.ir.dfg_node import * + class HDFSCat(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - assert(str(com_name) == "hdfs") - assert(str(com_options[0][1]) == "dfs" and str(com_options[1][1]) == "-cat") - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__( + self, + inputs, + outputs, + com_name, + com_category, + com_options=[], + com_redirs=[], + com_assignments=[], + ): + assert str(com_name) == "hdfs" + assert str(com_options[0][1]) == "dfs" and str(com_options[1][1]) == "-cat" + super().__init__( + inputs, + outputs, + com_name, + com_category, + com_options=com_options, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) diff --git a/compiler/definitions/ir/nodes/pash_split.py b/compiler/definitions/ir/nodes/pash_split.py index 621334807..d177dcf48 100644 --- a/compiler/definitions/ir/nodes/pash_split.py +++ b/compiler/definitions/ir/nodes/pash_split.py @@ -1,5 +1,7 @@ from pash_annotations.datatypes.AccessKind import make_stream_input, make_stream_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.file_id import * from definitions.ir.dfg_node import * @@ -7,22 +9,30 @@ import config import os + class Split(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default arguments! - super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) + def make_split_file(input_id, out_ids): - auto_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['auto_split_binary']) + auto_split_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["auto_split_binary"] + ) operand_list = [input_id] operand_list.extend(out_ids) access_map = {output_id: make_stream_output() for output_id in out_ids} @@ -33,5 +43,6 @@ def make_split_file(input_id, out_ids): operand_list=operand_list, implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=None, - access_map=access_map) + access_map=access_map, + ) return Split(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_merge.py b/compiler/definitions/ir/nodes/r_merge.py index 345c13e23..c4a982ca1 100644 --- a/compiler/definitions/ir/nodes/r_merge.py +++ b/compiler/definitions/ir/nodes/r_merge.py @@ -1,24 +1,34 @@ from pash_annotations.datatypes.AccessKind import make_stream_input, make_stream_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import * + class RMerge(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default arguments! - super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) + def make_r_merge_node(inputs, output): - r_merge_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_merge_binary']) + r_merge_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_merge_binary"] + ) # TODO: assume that the inputs and output is provided as operands access_map = {input_id: make_stream_input() for input_id in inputs} access_map[output] = make_stream_output() @@ -28,5 +38,6 @@ def make_r_merge_node(inputs, output): operand_list=inputs, implicit_use_of_streaming_input=None, implicit_use_of_streaming_output=output, - access_map=access_map) + access_map=access_map, + ) return RMerge(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_split.py b/compiler/definitions/ir/nodes/r_split.py index aefce4b7c..c5c2b7b78 100644 --- a/compiler/definitions/ir/nodes/r_split.py +++ b/compiler/definitions/ir/nodes/r_split.py @@ -1,8 +1,14 @@ import os -from pash_annotations.datatypes.AccessKind import AccessKind, make_stream_input, make_stream_output +from pash_annotations.datatypes.AccessKind import ( + AccessKind, + make_stream_input, + make_stream_output, +) from pash_annotations.datatypes.BasicDatatypes import Operand, Flag -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) import config @@ -10,40 +16,48 @@ from definitions.ir.file_id import * from shell_ast.ast_util import string_to_argument + class RSplit(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default arguments! - super().__init__(cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars=cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) def add_r_flag(self): self.cmd_invocation_with_io_vars.flag_option_list.append(Flag("-r")) def make_r_split(input_id, out_ids, r_split_batch_size): - r_split_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_split_binary']) - operand_list = [input_id, - Operand(Arg.string_to_arg(str(r_split_batch_size)))] + r_split_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_split_binary"] + ) + operand_list = [input_id, Operand(Arg.string_to_arg(str(r_split_batch_size)))] operand_list.extend(out_ids) access_map = {output_id: make_stream_output() for output_id in out_ids} access_map[input_id] = make_stream_input() cmd_inv_with_io_vars = CommandInvocationWithIOVars( - cmd_name=r_split_bin, - flag_option_list=[], - operand_list=operand_list, - implicit_use_of_streaming_input=None, - implicit_use_of_streaming_output=None, - access_map=access_map) + cmd_name=r_split_bin, + flag_option_list=[], + operand_list=operand_list, + implicit_use_of_streaming_input=None, + implicit_use_of_streaming_output=None, + access_map=access_map, + ) return RSplit(cmd_inv_with_io_vars) + def make_r_split_with_unwrap_flag(input_id, out_ids, r_split_batch_size): standard_r_split = make_r_split(input_id, out_ids, r_split_batch_size) standard_r_split.add_r_flag() diff --git a/compiler/definitions/ir/nodes/r_unwrap.py b/compiler/definitions/ir/nodes/r_unwrap.py index 931507220..b02d695af 100644 --- a/compiler/definitions/ir/nodes/r_unwrap.py +++ b/compiler/definitions/ir/nodes/r_unwrap.py @@ -1,32 +1,43 @@ from pash_annotations.datatypes.AccessKind import make_stream_input, make_stream_output -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from definitions.ir.dfg_node import * + class RUnwrap(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + ): # TODO []: default - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) + def make_unwrap_node(inputs, output): - assert(len(inputs) == 1) + assert len(inputs) == 1 input_id = inputs[0] access_map = {input_id: make_stream_input(), output: make_stream_output()} - r_unwrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_unwrap_binary']) + r_unwrap_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_unwrap_binary"] + ) cmd_inv_with_io_vars = CommandInvocationWithIOVars( cmd_name=r_unwrap_bin, flag_option_list=[], operand_list=[], implicit_use_of_streaming_input=input_id, implicit_use_of_streaming_output=output, - access_map=access_map) + access_map=access_map, + ) return RUnwrap(cmd_inv_with_io_vars) diff --git a/compiler/definitions/ir/nodes/r_wrap.py b/compiler/definitions/ir/nodes/r_wrap.py index 2a5f79ee9..afb93546d 100644 --- a/compiler/definitions/ir/nodes/r_wrap.py +++ b/compiler/definitions/ir/nodes/r_wrap.py @@ -1,26 +1,35 @@ from pash_annotations.datatypes.AccessKind import make_stream_output, make_stream_input from pash_annotations.datatypes.BasicDatatypes import ArgStringType -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) -from annotations_utils.util_cmd_invocations import to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping +from annotations_utils.util_cmd_invocations import ( + to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping, +) from definitions.ir.dfg_node import * from shell_ast.ast_util import * + class RWrap(DFGNode): - def __init__(self, - cmd_invocation_with_io_vars, - com_redirs=[], - com_assignments=[], - parallelizer_list=None, - cmd_related_properties=None, - wrapped_node_name=None): + def __init__( + self, + cmd_invocation_with_io_vars, + com_redirs=[], + com_assignments=[], + parallelizer_list=None, + cmd_related_properties=None, + wrapped_node_name=None, + ): # TODO []: default self.wrapped_node_name = wrapped_node_name - super().__init__(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties) + super().__init__( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) ## Get the label of the node. By default, it is simply the name def get_dot_label(self) -> str: @@ -29,31 +38,36 @@ def get_dot_label(self) -> str: basename = os.path.basename(str(name)) wrapped_node_name = self.wrapped_node_name - return f'{basename}({wrapped_node_name})' + return f"{basename}({wrapped_node_name})" + def wrap_node(node: DFGNode, edges): - r_wrap_bin = os.path.join(config.PASH_TOP, config.config['runtime']['r_wrap_binary']) + r_wrap_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["r_wrap_binary"] + ) ## At the moment we can only wrap a node that takes its input from stdin ## and outputs to stdout. Therefore the node needs to have only one input and one output. ## TO CHECK: with the remodelling also other cases should be handled inputs = node.get_input_list() - assert(len(inputs) == 1) + assert len(inputs) == 1 input_id = inputs[0] outputs = node.get_output_list() ## TODO: Would it make sense for outputs to be less than one? ## TODO: changed this from <= to == 1 to simplify reasoning later for now - assert(len(outputs) == 1) + assert len(outputs) == 1 output_id = outputs[0] access_map = {input_id: make_stream_input(), output_id: make_stream_output()} - #create bash -c argument + # create bash -c argument cmd_inv_with_io_vars: CommandInvocationWithIOVars = node.cmd_invocation_with_io_vars # do we need to copy here? currently, it seems fine cmd_inv_with_io_vars.remove_streaming_inputs() cmd_inv_with_io_vars.remove_streaming_outputs() # any non-streaming inputs or outputs are converted here already! - cmd = to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping(cmd_inv_with_io_vars, edges) + cmd = to_arg_from_cmd_inv_with_io_vars_without_streaming_inputs_or_outputs_for_wrapping( + cmd_inv_with_io_vars, edges + ) bash_command_arg = [Arg.string_to_arg("bash -c")] operand_list = bash_command_arg + [cmd] @@ -64,13 +78,16 @@ def wrap_node(node: DFGNode, edges): operand_list=operand_list, implicit_use_of_streaming_input=input_id, implicit_use_of_streaming_output=output_id, - access_map=access_map) + access_map=access_map, + ) ## TODO: It is not clear if it is safe to just pass redirections and assignments down the line as is redirs = node.com_redirs assignments = node.com_assignments - return RWrap(cmd_inv_with_io_vars, - com_redirs=redirs, - com_assignments=assignments, - wrapped_node_name=node.cmd_invocation_with_io_vars.cmd_name) + return RWrap( + cmd_inv_with_io_vars, + com_redirs=redirs, + com_assignments=assignments, + wrapped_node_name=node.cmd_invocation_with_io_vars.cmd_name, + ) diff --git a/compiler/definitions/ir/nodes/remote_pipe.py b/compiler/definitions/ir/nodes/remote_pipe.py index 7e35faf32..c60d78de0 100644 --- a/compiler/definitions/ir/nodes/remote_pipe.py +++ b/compiler/definitions/ir/nodes/remote_pipe.py @@ -1,12 +1,27 @@ from definitions.ir.dfg_node import * + class RemotePipe(DFGNode): - def __init__(self, inputs, outputs, com_name, com_category, - com_options = [], com_redirs = [], com_assignments=[]): - super().__init__(inputs, outputs, com_name, com_category, - com_options=com_options, - com_redirs=com_redirs, - com_assignments=com_assignments) + def __init__( + self, + inputs, + outputs, + com_name, + com_category, + com_options=[], + com_redirs=[], + com_assignments=[], + ): + super().__init__( + inputs, + outputs, + com_name, + com_category, + com_options=com_options, + com_redirs=com_redirs, + com_assignments=com_assignments, + ) + def make_remote_pipe(inputs, outputs, host_ip, port, is_remote_read, id): com_category = "pure" @@ -14,17 +29,17 @@ def make_remote_pipe(inputs, outputs, host_ip, port, is_remote_read, id): opt_count = 0 if is_remote_read: - remote_pipe_bin = os.path.join(config.PASH_TOP, config.config['runtime']['remote_read_binary']) + remote_pipe_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["remote_read_binary"] + ) else: - remote_pipe_bin = os.path.join(config.PASH_TOP, config.config['runtime']['remote_write_binary']) + remote_pipe_bin = os.path.join( + config.PASH_TOP, config.config["runtime"]["remote_write_binary"] + ) com_name = Arg.string_to_arg(remote_pipe_bin) options.append((opt_count, Arg.string_to_arg(f"--addr {host_ip}:{port}"))) options.append((opt_count + 1, Arg.string_to_arg(f"--id {id}"))) - return RemotePipe(inputs, - outputs, - com_name, - com_category, - com_options=options) + return RemotePipe(inputs, outputs, com_name, com_category, com_options=options) diff --git a/compiler/definitions/ir/redirection.py b/compiler/definitions/ir/redirection.py index 5a4a745df..bee70d714 100644 --- a/compiler/definitions/ir/redirection.py +++ b/compiler/definitions/ir/redirection.py @@ -1,7 +1,8 @@ from definitions.ir.arg import * from shell_ast.ast_util import * -class Redirection(): + +class Redirection: def __init__(self, redirection: RedirectionNode): if isinstance(redirection, FileRedirNode): self.redir_type = FileRedirNode.NodeName @@ -16,32 +17,29 @@ def __init__(self, redirection: RedirectionNode): # log(redirection) ## TODO: Support all redirections - assert(self.redir_type == 'File') - assert(self.redir_subtype in ['To', 'From']) + assert self.redir_type == "File" + assert self.redir_subtype in ["To", "From"] def __repr__(self): - return '({}, {}, {}, {})'.format(self.redir_type, - self.redir_subtype, - self.stream_id, - self.file_arg) + return "({}, {}, {}, {})".format( + self.redir_type, self.redir_subtype, self.stream_id, self.file_arg + ) def to_ast(self): - redir = make_kv(self.redir_type, - [self.redir_subtype, - self.stream_id, - self.file_arg.to_ast()]) + redir = make_kv( + self.redir_type, + [self.redir_subtype, self.stream_id, self.file_arg.to_ast()], + ) return redir def is_to_file(self): - return (self.redir_type == 'File' - and self.redir_subtype == 'To') + return self.redir_type == "File" and self.redir_subtype == "To" def is_for_stdout(self): - return (self.stream_id == 1) + return self.stream_id == 1 def is_from_file(self): - return (self.redir_type == 'File' - and self.redir_subtype == 'From') + return self.redir_type == "File" and self.redir_subtype == "From" def is_for_stdin(self): - return (self.stream_id == 0) + return self.stream_id == 0 diff --git a/compiler/definitions/ir/resource.py b/compiler/definitions/ir/resource.py index c6ad69c5e..4b7b9fe85 100644 --- a/compiler/definitions/ir/resource.py +++ b/compiler/definitions/ir/resource.py @@ -7,6 +7,7 @@ ## TODO: Resources should probably be more elaborate than just a ## string and a line range. They could be URLs, and possibly other things. + ## TODO: Think if we can have any optimizations if we know the size of a resource. class Resource: def __init__(self, uri): @@ -27,26 +28,25 @@ def __eq__(self, other): if isinstance(other, Resource): return self.uri == other.uri return False - + + class FileDescriptorResource(Resource): def __init__(self, fd): - assert(isinstance(fd, tuple) - and len(fd) == 2 - and fd[0] == 'fd') + assert isinstance(fd, tuple) and len(fd) == 2 and fd[0] == "fd" self.uri = fd def is_stdin(self): - return (self.uri == ('fd', 0)) + return self.uri == ("fd", 0) def is_stdout(self): - return (self.uri == ('fd', 1)) + return self.uri == ("fd", 1) class FileResource(Resource): ## The uri is the path of the file. def __init__(self, path): log("class of path", type(path)) - assert(isinstance(path, Arg)) + assert isinstance(path, Arg) ## TODO: Make sure that paths are normalized self.uri = path @@ -55,15 +55,18 @@ def __eq__(self, other): return self.uri == other.uri return False + class TemporaryFileResource(Resource): def __init__(self): self.uri = None + # A FIFO. class EphemeralResource(Resource): def __init__(self): self.uri = None + class RemoteFileResource(Resource): def __init__(self): raise NotImplementedError("RemoteFileResource is an interface") @@ -84,15 +87,16 @@ def _normalize_addr(self, addr): normalized_host = socket.gethostbyaddr(host)[2][0] return normalized_host + class HDFSFileResource(RemoteFileResource): ## The uri is the path of the file. def __init__(self, uri, resource_hosts): """ Params: - uri: Usually the path to the file. The path doesn't include the top directory - which is different between hosts. The str function adds the prefix $HDFS_DATANODE_DIR/ + uri: Usually the path to the file. The path doesn't include the top directory + which is different between hosts. The str function adds the prefix $HDFS_DATANODE_DIR/ which should be defined on host machine worker environment. - resource_hosts: the addresses of all the machines containing + resource_hosts: the addresses of all the machines containing the resource. """ self.uri = uri @@ -107,11 +111,12 @@ def is_available_on(self, host): return host in self.hosts def __repr__(self): - return f'hdfs://{self.uri}' + return f"hdfs://{self.uri}" def __str__(self): return f"$HDFS_DATANODE_DIR/{self.uri}" + # DFS logical split resource class DFSSplitResource(RemoteFileResource): def __init__(self, config, config_path, split_num, hosts): @@ -125,6 +130,6 @@ def is_available_on(self, host): def set_config_path(self, config_path): self.config_path = config_path - + def __str__(self): return self.config_path diff --git a/compiler/dspash/hdfs_file_data.py b/compiler/dspash/hdfs_file_data.py index cffb45677..5b8933d4b 100644 --- a/compiler/dspash/hdfs_file_data.py +++ b/compiler/dspash/hdfs_file_data.py @@ -38,13 +38,14 @@ def paths(self): ) return filepaths + class HDFSFileConfig: def __init__(self, filedata: FileData): - self.blocks : List[HDFSBlock] = [] + self.blocks: List[HDFSBlock] = [] for i, block_path in enumerate(filedata.paths()): hosts = list(map(lambda addr: addr.rsplit(":", 1)[0], filedata.machines[i])) self.blocks.append(HDFSBlock(block_path, hosts)) - + def _serialize(self): data = {"blocks": []} for path, hosts in self.blocks: @@ -57,7 +58,7 @@ def dumps(self): def dump(self, filepath): data = self._serialize() - with open(filepath, 'w') as f: + with open(filepath, "w") as f: json.dump(data, f) def __eq__(self, __o: object) -> bool: @@ -65,10 +66,13 @@ def __eq__(self, __o: object) -> bool: return False return self.blocks == __o.blocks + def get_hdfs_file_data(filename): info = FileData(filename) log = subprocess.check_output( - "hdfs fsck {0} -files -blocks -locations".format(filename), shell=True, stderr=subprocess.PIPE + "hdfs fsck {0} -files -blocks -locations".format(filename), + shell=True, + stderr=subprocess.PIPE, ) count = 0 for line in log.splitlines(): @@ -95,6 +99,7 @@ def get_hdfs_file_data(filename): assert info.size > 0 return info + def _getIPs(raw): rawparts = raw.split(" ") ips = [] @@ -103,6 +108,7 @@ def _getIPs(raw): ips.append(part[index + len("DatanodeInfoWithStorage") + 1 : part.find(",")]) return ips + if __name__ == "__main__": assert len(sys.argv) == 2 filename = sys.argv[1] diff --git a/compiler/dspash/hdfs_utils.py b/compiler/dspash/hdfs_utils.py index c86109702..94fccd60f 100644 --- a/compiler/dspash/hdfs_utils.py +++ b/compiler/dspash/hdfs_utils.py @@ -1,24 +1,30 @@ from dspash.hdfs_file_data import get_hdfs_file_data, FileData, HDFSFileConfig from typing import List, Tuple -def get_cmd_output(cmd:str): - ret = subprocess.check_output(cmd, shell=True, universal_newlines=True, stderr=subprocess.PIPE) + +def get_cmd_output(cmd: str): + ret = subprocess.check_output( + cmd, shell=True, universal_newlines=True, stderr=subprocess.PIPE + ) return ret.strip() -def _remove_prefix(s:str, prefix:str) -> str: + +def _remove_prefix(s: str, prefix: str) -> str: if s.startswith(prefix): - return s[len(prefix):] + return s[len(prefix) :] return s + def get_datanode_dir() -> str: data_dir = get_cmd_output("hdfs getconf -confKey dfs.datanode.data.dir") data_dir = _remove_prefix(data_dir, "file://") return data_dir + def get_file_data(filename: str) -> FileData: return get_hdfs_file_data(filename) + def get_file_config(filename: str) -> HDFSFileConfig: filedata = get_file_data(filename) return HDFSFileConfig(filedata) - diff --git a/compiler/dspash/ir_helper.py b/compiler/dspash/ir_helper.py index 7ce37d80e..f73b63600 100644 --- a/compiler/dspash/ir_helper.py +++ b/compiler/dspash/ir_helper.py @@ -6,6 +6,7 @@ from datetime import datetime from typing import List, Set, Tuple, Dict, Callable from uuid import uuid4 + sys.path.append("/pash/compiler") import config @@ -40,10 +41,11 @@ def read_graph(filename): ir, shell_vars = pickle.load(ir_file) return ir, shell_vars -def save_configs(graph:IR, dfs_configs_paths: Dict[HDFSFileConfig, str]): + +def save_configs(graph: IR, dfs_configs_paths: Dict[HDFSFileConfig, str]): for edge in graph.all_fids(): if isinstance(edge.get_resource(), DFSSplitResource): - resource : DFSSplitResource = edge.get_resource() + resource: DFSSplitResource = edge.get_resource() config: HDFSFileConfig = resource.config if config not in dfs_configs_paths: config_path = ptempfile() @@ -55,14 +57,15 @@ def save_configs(graph:IR, dfs_configs_paths: Dict[HDFSFileConfig, str]): resource.set_config_path(config_path) + def to_shell_file(graph: IR, args) -> str: filename = ptempfile() - + dirs = set() for edge in graph.all_fids(): directory = os.path.join(config.PASH_TMP_PREFIX, edge.prefix) dirs.add(directory) - + for directory in dirs: os.makedirs(directory, exist_ok=True) @@ -74,6 +77,7 @@ def to_shell_file(graph: IR, args) -> str: f.write(script) return filename + def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: """ Takes an optimized IR and splits it subgraphs. Every subgraph is a continues section between a splitter and a merger. @@ -99,7 +103,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: """ source_node_ids = graph.source_nodes() input_fifo_map = defaultdict(list) - + subgraphs = [] queue = deque([(source, IR({}, {})) for source in source_node_ids]) @@ -112,13 +116,13 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: input_fids = graph.get_node_input_fids(old_node_id) output_fids = graph.get_node_output_fids(old_node_id) - if(any(map(lambda fid:fid not in visited_edges, input_fids))): + if any(map(lambda fid: fid not in visited_edges, input_fids)): if subgraph.source_nodes(): subgraphs.append(subgraph) continue - + # Second condition makes sure we don't add empty graphs - if len(input_fids) > 1 and subgraph.source_nodes(): # merger node + if len(input_fids) > 1 and subgraph.source_nodes(): # merger node if subgraph not in subgraphs: subgraphs.append(subgraph) subgraph = IR({}, {}) @@ -127,7 +131,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: continue else: visited_nodes.add(old_node_id) - + node = graph.get_node(old_node_id).copy() node_id = node.get_id() @@ -141,7 +145,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: else: input_edge_id = input_fid.get_ident() subgraph.set_edge_to(input_edge_id, node_id) - # keep track + # keep track input_fifo_map[input_edge_id].append(subgraph) # Add edges coming out of the node @@ -152,7 +156,7 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: # Add edges coming into the node for input_fid in input_fids: if input_fid.get_ident() not in subgraph.edges: - subgraph.add_to_edge(input_fid, node_id) + subgraph.add_to_edge(input_fid, node_id) # Add the node subgraph.add_node(node) @@ -164,21 +168,28 @@ def split_ir(graph: IR) -> Tuple[List[IR], Dict[int, IR]]: subgraphs.append(subgraph) for next_id in next_ids: queue.append((next_id, IR({}, {}))) - + # print(list(map(lambda k : k.all_fids(), graphs))) return subgraphs, input_fifo_map -def add_stdout_fid(graph : IR, file_id_gen: FileIdGen) -> FileId: + +def add_stdout_fid(graph: IR, file_id_gen: FileIdGen) -> FileId: stdout = file_id_gen.next_file_id() - stdout.set_resource(FileDescriptorResource(('fd', 1))) + stdout.set_resource(FileDescriptorResource(("fd", 1))) graph.add_edge(stdout) return stdout -def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, input_fifo_map:Dict[int, IR], get_worker: Callable) -> (IR, Tuple): - """ Takes a list of subgraphs and assigns a worker to each subgraph and augment - the subgraphs with the necessary remote read/write nodes for data movement - between workers. This function also produces graph that should run in - the original shell in which pash was executed. This graph contains + +def assign_workers_to_subgraphs( + subgraphs: List[IR], + file_id_gen: FileIdGen, + input_fifo_map: Dict[int, IR], + get_worker: Callable, +) -> (IR, Tuple): + """Takes a list of subgraphs and assigns a worker to each subgraph and augment + the subgraphs with the necessary remote read/write nodes for data movement + between workers. This function also produces graph that should run in + the original shell in which pash was executed. This graph contains remote read/write nodes for stdin/stdout, named pipes, and files. Args: @@ -197,13 +208,15 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu # Replace output edges and corrosponding input edges with remote read/write for subgraph in subgraphs: - subgraph_critical_fids = list(filter(lambda fid: fid.has_remote_file_resource(), subgraph.all_fids())) + subgraph_critical_fids = list( + filter(lambda fid: fid.has_remote_file_resource(), subgraph.all_fids()) + ) worker = get_worker(subgraph_critical_fids) worker._running_processes += 1 worker_subgraph_pairs.append((worker, subgraph)) sink_nodes = subgraph.sink_nodes() - assert(len(sink_nodes) == 1) - + assert len(sink_nodes) == 1 + for out_edge in subgraph.get_node_output_fids(sink_nodes[0]): stdout = add_stdout_fid(subgraph, file_id_gen) out_edge_id = out_edge.get_ident() @@ -213,9 +226,16 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu subgraph.replace_edge(out_edge_id, ephemeral_edge) edge_uid = uuid4() # Add remote-write node at the end of the subgraph - remote_write = remote_pipe.make_remote_pipe([ephemeral_edge.get_ident()], [stdout.get_ident()], worker.host(), DISCOVERY_PORT, False, edge_uid) + remote_write = remote_pipe.make_remote_pipe( + [ephemeral_edge.get_ident()], + [stdout.get_ident()], + worker.host(), + DISCOVERY_PORT, + False, + edge_uid, + ) subgraph.add_node(remote_write) - + # Copy the old output edge resource new_edge = file_id_gen.next_file_id() new_edge.set_resource(out_edge.get_resource()) @@ -227,8 +247,15 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu else: matching_subgraph = main_graph matching_subgraph.add_edge(new_edge) - - remote_read = remote_pipe.make_remote_pipe([], [new_edge.get_ident()], worker.host(), DISCOVERY_PORT, True, edge_uid) + + remote_read = remote_pipe.make_remote_pipe( + [], + [new_edge.get_ident()], + worker.host(), + DISCOVERY_PORT, + True, + edge_uid, + ) matching_subgraph.add_node(remote_read) # Replace non ephemeral input edges with remote read/write @@ -236,7 +263,10 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu source_nodes = subgraph.source_nodes() for source in source_nodes: for in_edge in subgraph.get_node_input_fids(source): - if in_edge.has_file_resource() or in_edge.has_file_descriptor_resource(): + if ( + in_edge.has_file_resource() + or in_edge.has_file_descriptor_resource() + ): # setup stdout = add_stdout_fid(main_graph, file_id_gen) @@ -247,14 +277,28 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu # Add remote write to main subgraph edge_uid = uuid4() - remote_write = remote_pipe.make_remote_pipe([new_edge.get_ident()], [stdout.get_ident()], HOST, DISCOVERY_PORT, False, edge_uid) + remote_write = remote_pipe.make_remote_pipe( + [new_edge.get_ident()], + [stdout.get_ident()], + HOST, + DISCOVERY_PORT, + False, + edge_uid, + ) main_graph.add_node(remote_write) # Add remote read to current subgraph ephemeral_edge = file_id_gen.next_ephemeral_file_id() subgraph.replace_edge(in_edge.get_ident(), ephemeral_edge) - remote_read = remote_pipe.make_remote_pipe([], [ephemeral_edge.get_ident()], HOST, DISCOVERY_PORT, True, edge_uid) + remote_read = remote_pipe.make_remote_pipe( + [], + [ephemeral_edge.get_ident()], + HOST, + DISCOVERY_PORT, + True, + edge_uid, + ) subgraph.add_node(remote_read) else: # sometimes a command can have both a file resource and an ephemeral resources (example: spell oneliner) @@ -262,18 +306,19 @@ def assign_workers_to_subgraphs(subgraphs:List[IR], file_id_gen: FileIdGen, inpu return main_graph, worker_subgraph_pairs -def prepare_graph_for_remote_exec(filename:str, get_worker:Callable): + +def prepare_graph_for_remote_exec(filename: str, get_worker: Callable): """ Reads the complete ir from filename and splits it into subgraphs where ony the first subgraph represent a continues - segment (merger segment or branched segment) in the graph. + segment (merger segment or branched segment) in the graph. Note: All subgraphs(except first one) read and write from remote pipes. However, we had to add a fake stdout to avoid some problems when converting to shell code. - Returns: + Returns: worker_graph_pairs: List of (worker, subgraph) shell_vars: shell variables - main_graph: The ir we need to execute on the main shell. + main_graph: The ir we need to execute on the main shell. This graph contains edges to correctly redirect the following to remote workers - special pipes (stdin/stdout) - named pipes reading and writing @@ -282,5 +327,7 @@ def prepare_graph_for_remote_exec(filename:str, get_worker:Callable): ir, shell_vars = read_graph(filename) file_id_gen = ir.get_file_id_gen() subgraphs, mapping = split_ir(ir) - main_graph, worker_graph_pairs = assign_workers_to_subgraphs(subgraphs, file_id_gen, mapping, get_worker) + main_graph, worker_graph_pairs = assign_workers_to_subgraphs( + subgraphs, file_id_gen, mapping, get_worker + ) return worker_graph_pairs, shell_vars, main_graph diff --git a/compiler/dspash/socket_utils.py b/compiler/dspash/socket_utils.py index d3c736f3c..0598626fe 100644 --- a/compiler/dspash/socket_utils.py +++ b/compiler/dspash/socket_utils.py @@ -6,20 +6,23 @@ import pickle import struct + def send_msg(sock, msg): # Prefix each message with a 4-byte length (network byte order) - msg = struct.pack('>I', len(msg)) + msg + msg = struct.pack(">I", len(msg)) + msg sock.sendall(msg) + def recv_msg(sock): # Read message length and unpack it into an integer raw_msglen = recvall(sock, 4) if not raw_msglen: return None - msglen = struct.unpack('>I', raw_msglen)[0] + msglen = struct.unpack(">I", raw_msglen)[0] # Read the message data return recvall(sock, msglen) + def recvall(sock, n): # Helper function to recv n bytes or return None if EOF is hit data = bytearray() @@ -30,12 +33,15 @@ def recvall(sock, n): data.extend(packet) return data + def encode_request(obj: dict): return pickle.dumps(obj) + def decode_request(b: bytes): return pickle.loads(b) + ## TODO: SocketManager might need to handle errors more gracefully class SocketManager: def __init__(self, server_address): @@ -56,32 +62,31 @@ def __init__(self, server_address): # log("SocketManager: Created socket") self.sock.bind(server_address) - # log("SocketManager: Successfully bound to socket") + # log("SocketManager: Successfully bound to socket") ## TODO: Check if we need to configure the back# log - self.sock.listen() - # log("SocketManager: Listenting on socket") - + self.sock.listen() + # log("SocketManager: Listenting on socket") def get_next_cmd(self): connection, client_address = self.sock.accept() data = connection.recv(self.buf_size) ## TODO: This could be avoided for efficiency - str_data = data.decode('utf-8') + str_data = data.decode("utf-8") # log("Received data:", str_data) ## TODO: Lift this requirement if needed ## ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation) - assert(str_data.endswith("\n") or str_data == "") - + assert str_data.endswith("\n") or str_data == "" + return str_data, connection ## This method respond to the connection we last got input from ## In the case of the UnixPipes, we don't have any state management here ## since all reads/writes go to/from the same fifos def respond(self, message, connection): - bytes_message = message.encode('utf-8') + bytes_message = message.encode("utf-8") connection.sendall(bytes_message) connection.close() diff --git a/compiler/dspash/utils.py b/compiler/dspash/utils.py index 6402c94dd..a503e698b 100644 --- a/compiler/dspash/utils.py +++ b/compiler/dspash/utils.py @@ -3,16 +3,19 @@ import tempfile import uuid + def read_file(file, mode="r"): with open(file, mode) as f: data = f.read() return data + def write_file(file, data, mode="w"): with open(file, mode) as f: n = f.write(data) return n + def create_filename(dir, prefix="", temp=False): if temp: return tempfile.mkstemp(dir=dir, prefix=prefix) diff --git a/compiler/dspash/worker.py b/compiler/dspash/worker.py index 4b60ef766..1df79b5f2 100644 --- a/compiler/dspash/worker.py +++ b/compiler/dspash/worker.py @@ -11,7 +11,7 @@ import uuid import argparse -PASH_TOP = os.environ['PASH_TOP'] +PASH_TOP = os.environ["PASH_TOP"] sys.path.append(os.path.join(PASH_TOP, "compiler")) import config @@ -23,42 +23,43 @@ # from ... import config HOST = socket.gethostbyname(socket.gethostname()) -PORT = 65432 # Port to listen on (non-privileged ports are > 1023) +PORT = 65432 # Port to listen on (non-privileged ports are > 1023) def err_print(*args): print(*args, file=sys.stderr) -def send_success(conn, body, msg = ""): - request = { - 'status': 'OK', - 'body': body, - 'msg': msg - } + +def send_success(conn, body, msg=""): + request = {"status": "OK", "body": body, "msg": msg} send_msg(conn, encode_request(request)) + def parse_exec_request(request): - return request['cmd'] + return request["cmd"] + def parse_exec_graph(request): - return request['graph'], request['shell_variables'], request['functions'] + return request["graph"], request["shell_variables"], request["functions"] + def exec_graph(graph, shell_vars, functions): - config.config['shell_variables'] = shell_vars + config.config["shell_variables"] = shell_vars script_path = to_shell_file(graph, config.pash_args) e = os.environ.copy() - e['PASH_TOP'] = PASH_TOP + e["PASH_TOP"] = PASH_TOP # store functions - functions_file = create_filename(dir=config.PASH_TMP_PREFIX, prefix='pashFuncs') + functions_file = create_filename(dir=config.PASH_TMP_PREFIX, prefix="pashFuncs") write_file(functions_file, functions) cmd = f"source {functions_file}; source {script_path}" rc = subprocess.Popen(cmd, env=e, executable="/bin/bash", shell=True) return rc + class Worker: - def __init__(self, port = None): + def __init__(self, port=None): self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) if port == None: # pick a random port @@ -71,19 +72,20 @@ def run(self): connections = [] with self.s: self.s.listen() - while(True): + while True: conn, addr = self.s.accept() - print(f"got new connection") + print(f"got new connection") t = Thread(target=manage_connection, args=[conn, addr]) t.start() connections.append(t) for t in connections: t.join() + def manage_connection(conn, addr): rcs = [] with conn: - print('Connected by', addr) + print("Connected by", addr) dfs_configs_paths = {} while True: data = recv_msg(conn) @@ -92,7 +94,7 @@ def manage_connection(conn, addr): print("got new request") request = decode_request(data) - if request['type'] == 'Exec-Graph': + if request["type"] == "Exec-Graph": graph, shell_vars, functions = parse_exec_graph(request) save_configs(graph, dfs_configs_paths) exec_graph(graph, shell_vars, functions) @@ -104,12 +106,10 @@ def manage_connection(conn, addr): for rc in rcs: rc.wait() + def parse_args(): - parser = argparse.ArgumentParser(description='Process some integers.') - parser.add_argument("--port", - type=int, - help="port to use", - default=65432) + parser = argparse.ArgumentParser(description="Process some integers.") + parser.add_argument("--port", type=int, help="port to use", default=65432) config.add_common_arguments(parser) args = parser.parse_args() config.set_config_globals_from_pash_args(args) @@ -119,19 +119,22 @@ def parse_args(): config.load_config(args.config_path) return args + def init(): args = parse_args() config.LOGGING_PREFIX = f"Worker {config.pash_args.port}: " ## KK: 2023-02-21 Commenting this out, we need to figure out if the new annotations work with the distribution package # config.annotations = load_annotation_files( # config.config['distr_planner']['annotations_dir']) - pash_compiler.runtime_config = config.config['distr_planner'] + pash_compiler.runtime_config = config.config["distr_planner"] pash_compiler.termination = "" + def main(): init() worker = Worker(config.pash_args.port) worker.run() + if __name__ == "__main__": main() diff --git a/compiler/dspash/worker_manager.py b/compiler/dspash/worker_manager.py index 3bcfa1c50..0001a5af9 100644 --- a/compiler/dspash/worker_manager.py +++ b/compiler/dspash/worker_manager.py @@ -5,18 +5,27 @@ import pickle import json -from dspash.socket_utils import SocketManager, encode_request, decode_request, send_msg, recv_msg +from dspash.socket_utils import ( + SocketManager, + encode_request, + decode_request, + send_msg, + recv_msg, +) from util import log from dspash.ir_helper import prepare_graph_for_remote_exec, to_shell_file from dspash.utils import read_file -import config +import config import copy -PORT = 65425 # Port to listen on (non-privileged ports are > 1023) +PORT = 65425 # Port to listen on (non-privileged ports are > 1023) + class WorkerConnection: def __init__(self, host, port): - self._host = socket.gethostbyaddr(host)[2][0] # get ip address in case host needs resolving + self._host = socket.gethostbyaddr(host)[2][ + 0 + ] # get ip address in case host needs resolving self._port = port self._running_processes = 0 self._online = True @@ -26,7 +35,7 @@ def __init__(self, host, port): self._socket.connect((self._host, self._port)) except Exception as e: self._online = False - + def is_online(self): # TODO: create a ping to confirm is online return self._online @@ -42,17 +51,18 @@ def get_running_processes(self): return self._running_processes def send_graph_exec_request(self, graph, shell_vars, functions) -> bool: - request_dict = { 'type': 'Exec-Graph', - 'graph': graph, - 'functions': functions, - 'shell_variables': None # Doesn't seem needed for now - } + request_dict = { + "type": "Exec-Graph", + "graph": graph, + "functions": functions, + "shell_variables": None, # Doesn't seem needed for now + } request = encode_request(request_dict) - #TODO: do I need to open and close connection? + # TODO: do I need to open and close connection? send_msg(self._socket, request) # TODO wait until the command exec finishes and run this in parallel? response_data = recv_msg(self._socket) - if not response_data or decode_request(response_data)['status'] != "OK": + if not response_data or decode_request(response_data)["status"] != "OK": raise Exception(f"didn't recieved ack on request {response_data}") else: # self._running_processes += 1 #TODO: decrease in case of failure or process ended @@ -77,15 +87,16 @@ def __str__(self): def host(self): return self._host -class WorkersManager(): + +class WorkersManager: def __init__(self, workers: WorkerConnection = []): self.workers = workers self.host = socket.gethostbyname(socket.gethostname()) self.args = copy.copy(config.pash_args) # Required to create a correct multi sink graph - self.args.termination = "" + self.args.termination = "" - def get_worker(self, fids = None) -> WorkerConnection: + def get_worker(self, fids=None) -> WorkerConnection: if not fids: fids = [] @@ -93,12 +104,15 @@ def get_worker(self, fids = None) -> WorkerConnection: for worker in self.workers: if not worker.is_online(): continue - + # Skip if any provided fid isn't available on the worker machine if any(map(lambda fid: not fid.is_available_on(worker.host()), fids)): continue - if best_worker is None or best_worker.get_running_processes() > worker.get_running_processes(): + if ( + best_worker is None + or best_worker.get_running_processes() > worker.get_running_processes() + ): best_worker = worker if best_worker == None: @@ -110,31 +124,36 @@ def add_worker(self, host, port): self.workers.append(WorkerConnection(host, port)) def add_workers_from_cluster_config(self, config_path): - with open(config_path, 'r') as f: + with open(config_path, "r") as f: cluster_config = json.load(f) workers = cluster_config["workers"].values() for worker in workers: - host = worker['host'] - port = worker['port'] + host = worker["host"] + port = worker["port"] self.add_worker(host, port) - - + def run(self): workers_manager = self - workers_manager.add_workers_from_cluster_config(os.path.join(config.PASH_TOP, 'cluster.json')) + workers_manager.add_workers_from_cluster_config( + os.path.join(config.PASH_TOP, "cluster.json") + ) - dspash_socket = SocketManager(os.getenv('DSPASH_SOCKET')) + dspash_socket = SocketManager(os.getenv("DSPASH_SOCKET")) while True: request, conn = dspash_socket.get_next_cmd() if request.startswith("Done"): dspash_socket.close() break elif request.startswith("Exec-Graph"): - args = request.split(':', 1)[1].strip() + args = request.split(":", 1)[1].strip() filename, declared_functions_file = args.split() - worker_subgraph_pairs, shell_vars, main_graph = prepare_graph_for_remote_exec(filename, self.get_worker) + ( + worker_subgraph_pairs, + shell_vars, + main_graph, + ) = prepare_graph_for_remote_exec(filename, self.get_worker) script_fname = to_shell_file(main_graph, self.args) log("Master node graph stored in ", script_fname) @@ -148,9 +167,12 @@ def run(self): # Execute subgraphs on workers for worker, subgraph in worker_subgraph_pairs: - worker.send_graph_exec_request(subgraph, shell_vars, declared_functions) + worker.send_graph_exec_request( + subgraph, shell_vars, declared_functions + ) else: raise Exception(f"Unknown request: {request}") - + + if __name__ == "__main__": WorkersManager().run() diff --git a/compiler/env_var_names.py b/compiler/env_var_names.py index 81c45b289..5fe7ac597 100644 --- a/compiler/env_var_names.py +++ b/compiler/env_var_names.py @@ -1,10 +1,11 @@ - ## ## Variable names used in the pash runtime ## + def loop_iters_var() -> str: - return 'pash_loop_iters' + return "pash_loop_iters" + def loop_iter_var(loop_id: int) -> str: - return f'pash_loop_{loop_id}_iter' \ No newline at end of file + return f"pash_loop_{loop_id}_iter" diff --git a/compiler/ir.py b/compiler/ir.py index 211d1242b..386d4d20b 100644 --- a/compiler/ir.py +++ b/compiler/ir.py @@ -2,14 +2,29 @@ from pash_annotations.datatypes.CommandInvocationInitial import CommandInvocationInitial from pash_annotations.datatypes.BasicDatatypes import ArgStringType -from pash_annotations.datatypes.BasicDatatypesWithIO import FileNameWithIOInfo, StdDescriptorWithIOInfo, OptionWithIO -from pash_annotations.annotation_generation.datatypes.InputOutputInfo import InputOutputInfo -from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ParallelizabilityInfo -from pash_annotations.annotation_generation.datatypes.CommandProperties import CommandProperties -from pash_annotations.datatypes.CommandInvocationWithIOVars import CommandInvocationWithIOVars +from pash_annotations.datatypes.BasicDatatypesWithIO import ( + FileNameWithIOInfo, + StdDescriptorWithIOInfo, + OptionWithIO, +) +from pash_annotations.annotation_generation.datatypes.InputOutputInfo import ( + InputOutputInfo, +) +from pash_annotations.annotation_generation.datatypes.ParallelizabilityInfo import ( + ParallelizabilityInfo, +) +from pash_annotations.annotation_generation.datatypes.CommandProperties import ( + CommandProperties, +) +from pash_annotations.datatypes.CommandInvocationWithIOVars import ( + CommandInvocationWithIOVars, +) from annotations_utils.util_parsing import parse_arg_list_to_command_invocation -from annotations_utils.util_cmd_invocations import get_input_output_info_from_cmd_invocation_util, get_parallelizability_info_from_cmd_invocation_util +from annotations_utils.util_cmd_invocations import ( + get_input_output_info_from_cmd_invocation_util, + get_parallelizability_info_from_cmd_invocation_util, +) from annotations_utils.util_file_descriptors import resource_from_file_descriptor from definitions.ir.file_id import * @@ -26,23 +41,26 @@ import config + ## Creates a file id for a given resource def create_file_id_for_resource(resource, fileIdGen): file_id = create_split_file_id(fileIdGen) file_id.set_resource(resource) return file_id + ## Creates a file id that has a given maximum length def create_split_file_id(fileIdGen): file_id = fileIdGen.next_file_id() return file_id + class FileIdGen: - def __init__(self, next = 0, prefix = ""): + def __init__(self, next=0, prefix=""): self.next = next + 1 directory = f"{str(uuid.uuid4().hex)}" self.prefix = f"{directory}/{prefix}" - directory_path = os.path.join(config.PASH_TMP_PREFIX, self.prefix) + directory_path = os.path.join(config.PASH_TMP_PREFIX, self.prefix) os.makedirs(directory_path) def next_file_id(self): @@ -64,37 +82,44 @@ def bump_counter_to_value_of(self, OtherFileIdGen): # TODO: find a better solution to make unique numbers, currently: set to max-value + 1 self.next = OtherFileIdGen.next + 1 + ## Returns the resource or file descriptor related to this specific opt_or_fd -## NOTE: Assumes that everything is expanded. +## NOTE: Assumes that everything is expanded. def get_option_or_fd(opt_or_fd, options, fileIdGen): - if(isinstance(opt_or_fd, tuple) - and len(opt_or_fd) == 2 - and opt_or_fd[0] == "option"): + if ( + isinstance(opt_or_fd, tuple) + and len(opt_or_fd) == 2 + and opt_or_fd[0] == "option" + ): resource = FileResource(Arg(options[opt_or_fd[1]])) else: ## TODO: Make this be a subtype of Resource - if(opt_or_fd == "stdin"): + if opt_or_fd == "stdin": resource = ("fd", 0) - elif(opt_or_fd == "stdout"): + elif opt_or_fd == "stdout": resource = ("fd", 1) - elif(opt_or_fd == "stderr"): + elif opt_or_fd == "stderr": resource = ("fd", 2) else: raise NotImplementedError() resource = FileDescriptorResource(resource) - + fid = create_file_id_for_resource(resource, fileIdGen) return fid + ## Get the options as arguments def get_option(opt_or_fd, options, fileIdGen): - assert(isinstance(opt_or_fd, tuple) - and len(opt_or_fd) == 2 - and opt_or_fd[0] == "option") + assert ( + isinstance(opt_or_fd, tuple) + and len(opt_or_fd) == 2 + and opt_or_fd[0] == "option" + ) arg = Arg(options[opt_or_fd[1]]) return (opt_or_fd[1], arg) -## This function + +## This function def create_edges_from_opt_or_fd_list(opt_or_fd_list, edges_dict, options, fileIdGen): new_edge_list = [] for opt_or_fd in opt_or_fd_list: @@ -105,23 +130,37 @@ def create_edges_from_opt_or_fd_list(opt_or_fd_list, edges_dict, options, fileId return new_edge_list -def find_input_edges(positional_input_list, implicit_use_of_stdin, dfg_edges, fileIdGen) -> List[int]: - assert (not implicit_use_of_stdin or len(positional_input_list) == 0) +def find_input_edges( + positional_input_list, implicit_use_of_stdin, dfg_edges, fileIdGen +) -> List[int]: + assert not implicit_use_of_stdin or len(positional_input_list) == 0 if implicit_use_of_stdin: resources = [FileDescriptorResource(("fd", 0))] else: - resources = [resource_from_file_descriptor(input_el) for input_el in positional_input_list] - file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + resources = [ + resource_from_file_descriptor(input_el) + for input_el in positional_input_list + ] + file_ids = [ + create_file_id_for_resource(resource, fileIdGen) for resource in resources + ] return get_edge_list_from_file_id_list(dfg_edges, file_ids) -def find_output_edges(positional_output_list, implicit_use_of_stdout, dfg_edges, fileIdGen) -> List[int]: - assert (not implicit_use_of_stdout or len(positional_output_list) == 0) +def find_output_edges( + positional_output_list, implicit_use_of_stdout, dfg_edges, fileIdGen +) -> List[int]: + assert not implicit_use_of_stdout or len(positional_output_list) == 0 if implicit_use_of_stdout: resources = [FileDescriptorResource(("fd", 1))] else: - resources = [resource_from_file_descriptor(input_el) for input_el in positional_output_list] - file_ids = [create_file_id_for_resource(resource, fileIdGen) for resource in resources] + resources = [ + resource_from_file_descriptor(input_el) + for input_el in positional_output_list + ] + file_ids = [ + create_file_id_for_resource(resource, fileIdGen) for resource in resources + ] return get_edge_list_from_file_id_list(dfg_edges, file_ids) @@ -152,88 +191,124 @@ def add_var_for_descriptor(operand): for i in range(len(command_invocation_with_io.flag_option_list)): flagoption = command_invocation_with_io.flag_option_list[i] - if isinstance(flagoption, OptionWithIO) and not isinstance(flagoption.option_arg, ArgStringType): + if isinstance(flagoption, OptionWithIO) and not isinstance( + flagoption.option_arg, ArgStringType + ): fid_id = add_var_for_descriptor(flagoption.option_arg) new_option = OptionWithIOVar(flagoption.name, fid_id) new_flagoption_list.append(new_option) - else: # Flag + else: # Flag new_flagoption_list.append(flagoption) for i in range(len(command_invocation_with_io.operand_list)): operand = command_invocation_with_io.operand_list[i] - if isinstance(operand, FileNameWithIOInfo) or isinstance(operand, StdDescriptorWithIOInfo): + if isinstance(operand, FileNameWithIOInfo) or isinstance( + operand, StdDescriptorWithIOInfo + ): fid_id = add_var_for_descriptor(operand) new_operand_list.append(fid_id) else: new_operand_list.append(operand) if command_invocation_with_io.implicit_use_of_streaming_input: - new_implicit_use_of_streaming_input = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_input) + new_implicit_use_of_streaming_input = add_var_for_descriptor( + command_invocation_with_io.implicit_use_of_streaming_input + ) else: new_implicit_use_of_streaming_input = None if command_invocation_with_io.implicit_use_of_streaming_output: - new_implicit_use_of_streaming_output = add_var_for_descriptor(command_invocation_with_io.implicit_use_of_streaming_output) + new_implicit_use_of_streaming_output = add_var_for_descriptor( + command_invocation_with_io.implicit_use_of_streaming_output + ) else: new_implicit_use_of_streaming_output = None - command_invocation_with_io_vars = CommandInvocationWithIOVars(cmd_name=command_invocation_with_io.cmd_name, - flag_option_list=new_flagoption_list, - operand_list=new_operand_list, - implicit_use_of_streaming_input=new_implicit_use_of_streaming_input, - implicit_use_of_streaming_output=new_implicit_use_of_streaming_output, - access_map=access_map) + command_invocation_with_io_vars = CommandInvocationWithIOVars( + cmd_name=command_invocation_with_io.cmd_name, + flag_option_list=new_flagoption_list, + operand_list=new_operand_list, + implicit_use_of_streaming_input=new_implicit_use_of_streaming_input, + implicit_use_of_streaming_output=new_implicit_use_of_streaming_output, + access_map=access_map, + ) return command_invocation_with_io_vars, dfg_edges -def compile_command_to_DFG(fileIdGen, command, options, - redirections=[]): - command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation(command, options) - io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util(command_invocation) +def compile_command_to_DFG(fileIdGen, command, options, redirections=[]): + command_invocation: CommandInvocationInitial = parse_arg_list_to_command_invocation( + command, options + ) + io_info: InputOutputInfo = get_input_output_info_from_cmd_invocation_util( + command_invocation + ) if io_info is None: - raise Exception(f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful.") + raise Exception( + f"InputOutputInformation for {format_arg_chars(command)} not provided so considered side-effectful." + ) if io_info.has_other_outputs(): - raise Exception(f"Command {format_arg_chars(command)} has outputs other than streaming.") - para_info: ParallelizabilityInfo = get_parallelizability_info_from_cmd_invocation_util(command_invocation) + raise Exception( + f"Command {format_arg_chars(command)} has outputs other than streaming." + ) + para_info: ParallelizabilityInfo = ( + get_parallelizability_info_from_cmd_invocation_util(command_invocation) + ) if para_info is None: - para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False - command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation(command_invocation) + para_info = ( + ParallelizabilityInfo() + ) # defaults to no parallelizer's and all properties False + command_invocation_with_io = io_info.apply_input_output_info_to_command_invocation( + command_invocation + ) if para_info is None: - para_info = ParallelizabilityInfo() # defaults to no parallelizer's and all properties False - parallelizer_list, round_robin_compatible_with_cat, is_commutative = para_info.unpack_info() - property_dict = [{'round_robin_compatible_with_cat': round_robin_compatible_with_cat, - 'is_commutative': is_commutative}] + para_info = ( + ParallelizabilityInfo() + ) # defaults to no parallelizer's and all properties False + ( + parallelizer_list, + round_robin_compatible_with_cat, + is_commutative, + ) = para_info.unpack_info() + property_dict = [ + { + "round_robin_compatible_with_cat": round_robin_compatible_with_cat, + "is_commutative": is_commutative, + } + ] cmd_related_properties = CommandProperties(property_dict) ## TODO: Make an empty IR and add edges and nodes incrementally (using the methods defined in IR). ## Add all inputs and outputs to the DFG edges - cmd_invocation_with_io_vars, dfg_edges = add_file_id_vars(command_invocation_with_io, fileIdGen) + cmd_invocation_with_io_vars, dfg_edges = add_file_id_vars( + command_invocation_with_io, fileIdGen + ) com_redirs = redirections ## TODO: Add assignments com_assignments = [] ## Assume: Everything must be completely expanded ## TODO: Add an assertion about that. - dfg_node = DFGNode(cmd_invocation_with_io_vars, - com_redirs=com_redirs, - com_assignments=com_assignments, - parallelizer_list=parallelizer_list, - cmd_related_properties=cmd_related_properties - ) + dfg_node = DFGNode( + cmd_invocation_with_io_vars, + com_redirs=com_redirs, + com_assignments=com_assignments, + parallelizer_list=parallelizer_list, + cmd_related_properties=cmd_related_properties, + ) # log(f'Dfg node: {dfg_node}') node_id = dfg_node.get_id() ## Assign the from, to node in edges for fid_id in dfg_node.get_input_list(): fid, from_node, to_node = dfg_edges[fid_id] - assert(to_node is None) + assert to_node is None dfg_edges[fid_id] = (fid, from_node, node_id) - + for fid_id in dfg_node.get_output_list(): fid, from_node, to_node = dfg_edges[fid_id] - assert(from_node is None) + assert from_node is None dfg_edges[fid_id] = (fid, node_id, to_node) - - dfg_nodes = {node_id : dfg_node} + + dfg_nodes = {node_id: dfg_node} dfg = IR(dfg_nodes, dfg_edges) # log(f'IR: {dfg}') return dfg @@ -243,20 +318,17 @@ def compile_command_to_DFG(fileIdGen, command, options, ## Node builder functions ## + def make_tee(input, outputs): com_name = Arg.string_to_arg("tee") com_category = "pure" - return DFGNode([input], - outputs, - com_name, - com_category) + return DFGNode([input], outputs, com_name, com_category) ## Note: This might need more information. E.g. all the file ## descriptors of the IR, and in general any other local information ## that might be relevant. class IR: - ## TODO: Embed the fileIdGen as a field of the IR ## IR Assumptions: @@ -266,7 +338,7 @@ class IR: ## ## - If two nodes have the same file as output, then they both ## write to it concurrently. - def __init__(self, nodes, edges, background = False): + def __init__(self, nodes, edges, background=False): self.nodes = nodes self.edges = edges self.background = background @@ -277,28 +349,30 @@ def __init__(self, nodes, edges, background = False): self.apply_redirections() def __repr__(self): - output = "(|-{} IR: {} {}-|)".format(self.get_stdin(), list(self.nodes.values()), self.get_stdout()) + output = "(|-{} IR: {} {}-|)".format( + self.get_stdin(), list(self.nodes.values()), self.get_stdout() + ) return output ## Initialize all edges def apply_redirections(self): for _, node in self.nodes.items(): node.apply_redirections(self.edges) - + ## We need to merge common files after redirections have been applied. self.combine_common_files() - ## Refactor these to call .add_edge, and .set_edge_to/from + ## Refactor these to call .add_edge, and .set_edge_to/from ## Add an edge that points to a node def add_to_edge(self, to_edge, node_id): edge_id = to_edge.get_ident() - assert(not edge_id in self.edges) + assert not edge_id in self.edges self.edges[edge_id] = (to_edge, None, node_id) ## Add an edge that starts from a node def add_from_edge(self, node_id, from_edge): edge_id = from_edge.get_ident() - assert(not edge_id in self.edges) + assert not edge_id in self.edges self.edges[edge_id] = (from_edge, node_id, None) def set_edge_to(self, edge_id, to_node_id): @@ -310,19 +384,19 @@ def set_edge_from(self, edge_id, from_node_id): self.edges[edge_id] = (edge_fid, from_node_id, to_node) def get_edge_fid(self, fid_id): - if(fid_id in self.edges): + if fid_id in self.edges: return self.edges[fid_id][0] else: return None def get_edge_from(self, edge_id): - if(edge_id in self.edges): + if edge_id in self.edges: return self.edges[edge_id][1] else: return None def replace_edge(self, old_edge_id, new_edge_fid): - assert(new_edge_fid not in self.all_fids()) + assert new_edge_fid not in self.all_fids() new_edge_id = new_edge_fid.get_ident() old_fid, from_node, to_node = self.edges[old_edge_id] self.edges[new_edge_id] = (new_edge_fid, from_node, to_node) @@ -331,7 +405,7 @@ def replace_edge(self, old_edge_id, new_edge_fid): if to_node: self.get_node(to_node).replace_edge(old_edge_id, new_edge_id) del self.edges[old_edge_id] - + def get_stdin(self): stdin_id = self.get_stdin_id() stdin_fid = self.get_edge_fid(stdin_id) @@ -348,39 +422,43 @@ def get_stdin_id(self): stdin_id = None for edge_id, (edge_fid, _from, _to) in self.edges.items(): resource = edge_fid.get_resource() - if(resource.is_stdin()): - assert(stdin_id is None) + if resource.is_stdin(): + assert stdin_id is None stdin_id = edge_id - return stdin_id + return stdin_id def get_stdout_id(self): ## ASSERT: There must be only one stdout_id = None for edge_id, (edge_fid, _from, _to) in self.edges.items(): resource = edge_fid.get_resource() - if(resource.is_stdout()): + if resource.is_stdout(): # This is not true when using distributed_exec # assert(stdout_id is None) stdout_id = edge_id - return stdout_id + return stdout_id def serialize(self): output = "Nodes:\n" all_file_ids = "" for i, node in enumerate(self.nodes): - serialized_input_file_ids = " ".join([fid.serialize() - for fid in node.get_input_file_ids()]) - serialized_output_file_ids = " ".join([fid.serialize() - for fid in node.get_output_file_ids()]) + serialized_input_file_ids = " ".join( + [fid.serialize() for fid in node.get_input_file_ids()] + ) + serialized_output_file_ids = " ".join( + [fid.serialize() for fid in node.get_output_file_ids()] + ) all_file_ids += serialized_input_file_ids + " " all_file_ids += serialized_output_file_ids + " " - output += "{} in: {} out: {} command: {}\n".format(i, serialized_input_file_ids, - serialized_output_file_ids, - node.serialize()) + output += "{} in: {} out: {} command: {}\n".format( + i, + serialized_input_file_ids, + serialized_output_file_ids, + node.serialize(), + ) output = "File ids:\n{}\n".format(all_file_ids) + output return output - def to_ast(self, drain_streams) -> "list[AstNode]": asts = [] @@ -391,7 +469,7 @@ def to_ast(self, drain_streams) -> "list[AstNode]": ## Redirect stdin stdin_id = self.get_stdin_id() - if (not stdin_id is None): + if not stdin_id is None: ## Create a new ephemeral resource to redirect stdin to. fid = fileIdGen.next_file_id() fid.make_ephemeral() @@ -400,15 +478,21 @@ def to_ast(self, drain_streams) -> "list[AstNode]": _prev_fid, from_node, to_node = self.edges[stdin_id] self.edges[stdin_id] = (fid, from_node, to_node) ## Create a command that redirects stdin to this ephemeral fid - redirect_stdin_script = os.path.join(config.PASH_TOP, config.config['runtime']['redirect_stdin_binary']) - com_args = [string_to_argument('source'), string_to_argument(redirect_stdin_script), file_to_redirect_to] + redirect_stdin_script = os.path.join( + config.PASH_TOP, config.config["runtime"]["redirect_stdin_binary"] + ) + com_args = [ + string_to_argument("source"), + string_to_argument(redirect_stdin_script), + file_to_redirect_to, + ] com = make_command(com_args) asts.append(com) ## Make the dataflow graph ## ## TODO: Normally this should have all sink nodes at the end, but - ## for now we just have the stdout node in the end + ## for now we just have the stdout node in the end ## (since this is always the output in our benchmarks). # sink_node_ids = self.sink_nodes() ## @@ -418,15 +502,14 @@ def to_ast(self, drain_streams) -> "list[AstNode]": ## For now we just allow more than one output by waiting for one of them ## at random. stdout_edge_id = self.get_stdout_id() - if (not stdout_edge_id is None): + if not stdout_edge_id is None: sink_node_ids = [self.edges[stdout_edge_id][1]] else: sink_node_ids = self.sink_nodes() sink_node_ids = [sink_node_ids[0]] - for node_id, node in self.nodes.items(): - if(not node_id in sink_node_ids): + if not node_id in sink_node_ids: node_ast = node.to_ast(self.edges, drain_streams) asts.append(make_background(node_ast)) ## Gather all pids @@ -445,20 +528,20 @@ def to_ast(self, drain_streams) -> "list[AstNode]": ## TODO: Ideally we would like to make them as typed nodes already class_asts = [to_ast_node(ast_node_to_untyped_deep(ast)) for ast in asts] return class_asts - + def collect_pid_assignment(self): ## Creates: ## pids_to_kill="$! $pids_to_kill" - var_name = 'pids_to_kill' - rval = quote_arg([standard_var_ast('!'), - char_to_arg_char(' '), - standard_var_ast(var_name)]) + var_name = "pids_to_kill" + rval = quote_arg( + [standard_var_ast("!"), char_to_arg_char(" "), standard_var_ast(var_name)] + ) return make_assignment(var_name, [rval]) - + def init_pids_to_kill(self): ## Creates: ## pids_to_kill="" - var_name = 'pids_to_kill' + var_name = "pids_to_kill" rval = quote_arg([]) return make_assignment(var_name, [rval]) @@ -469,7 +552,7 @@ def set_ast(self, ast): def set_background(self, background): self.background = background - if (background): + if background: ## Since the IR is in the background, we don't have access to ## its stdin, stdout anymore self.stdin = [] @@ -479,8 +562,8 @@ def is_in_background(self): return self.background def pipe_append(self, other): - assert(self.valid()) - assert(other.valid()) + assert self.valid() + assert other.valid() ## This combines the two IRs by adding all of the nodes ## together, and by union-ing the stdout of the first with the @@ -491,12 +574,11 @@ def pipe_append(self, other): ## both self and other are not empty. my_out = self.get_stdout_id() other_in = other.get_stdin_id() - assert(not my_out is None) - assert(not other_in is None) - + assert not my_out is None + assert not other_in is None _other_in_fid, from_node, other_in_node_id = other.edges[other_in] - assert(from_node is None) + assert from_node is None ## ... = OtherInNode(..., other_in, ...) ## v ## ... = OtherInNode(..., my_out, ...) @@ -506,7 +588,7 @@ def pipe_append(self, other): ## Make the my_out id to be ephemeral file. my_out_fid, from_node, to_node = self.edges[my_out] - assert(to_node is None) + assert to_node is None my_out_fid.make_ephemeral() ## Add the other node in my edges @@ -516,9 +598,9 @@ def pipe_append(self, other): self.union(other) def background_union(self, other): - assert(self.valid()) - assert(other.valid()) - assert(self.is_in_background()) + assert self.valid() + assert other.valid() + assert self.is_in_background() ## This combines two IRs where at least the first one is in ## background. This means that the stdin only works with the second ## the second (or None if both are in background). Also if @@ -526,7 +608,7 @@ def background_union(self, other): ## If one of them is not in the background, then the whole ## thing isn't. - if (not other.is_in_background()): + if not other.is_in_background(): self.set_background(other.is_in_background()) self.union(other) @@ -545,7 +627,6 @@ def union(self, other): ## TODO: Handle connections of common files (pipes, etc) self.combine_common_files() - ## Combines (unions) files that refer to the same resource. ## ## WARNING: This assumes that comparing file names statically @@ -560,7 +641,6 @@ def union(self, other): ## the IR? Maybe it can be true if a command is run with ## variable assignments) def combine_common_files(self): - ## For now we just unify a file if it exists exactly twice, ## once at the input of a node and once at the output of ## another node. If a file exists in several input locations, @@ -572,19 +652,24 @@ def combine_common_files(self): ## of exactly one other node. # log("Combining files for:", self) for node_id1, _node1 in self.nodes.items(): - inputs_with_file_resource = [(id, fid) for id, fid in self.get_node_input_ids_fids(node_id1) - if fid.has_file_resource()] + inputs_with_file_resource = [ + (id, fid) + for id, fid in self.get_node_input_ids_fids(node_id1) + if fid.has_file_resource() + ] for id_in, fid_in in inputs_with_file_resource: in_resource = fid_in.get_resource() number_of_out_resources = 0 for node_id2, _node2 in self.nodes.items(): - outputs_with_file_resource = [(id, fid) for id, fid in self.get_node_output_ids_fids(node_id2) - if fid.has_file_resource()] + outputs_with_file_resource = [ + (id, fid) + for id, fid in self.get_node_output_ids_fids(node_id2) + if fid.has_file_resource() + ] for id_out, fid_out in outputs_with_file_resource: out_resource = fid_out.get_resource() ## Do not combine if the ids of the edges are already the same - if (not id_in == id_out - and in_resource == out_resource): + if not id_in == id_out and in_resource == out_resource: number_of_out_resources += 1 ## They point to the same File resource so we need to unify their fids self.nodes[node_id2].replace_edge(id_out, id_in) @@ -594,7 +679,7 @@ def combine_common_files(self): ## Exit with an error if a file is written by more than one node. ## ## TODO: Could this ever be improved for additional performance? - assert(number_of_out_resources <= 1) + assert number_of_out_resources <= 1 ## Returns all the file identifiers in the IR. def all_fids(self): @@ -603,23 +688,25 @@ def all_fids(self): ## Returns all input fids of the IR def all_input_fids(self): - all_input_fids = [fid for fid, from_node, _to_node in self.edges.values() - if from_node is None] + all_input_fids = [ + fid for fid, from_node, _to_node in self.edges.values() if from_node is None + ] return all_input_fids ## Returns all output fids of the IR def all_output_fids(self): - all_output_fids = [fid for fid, _from_node, to_node in self.edges.values() - if to_node is None] + all_output_fids = [ + fid for fid, _from_node, to_node in self.edges.values() if to_node is None + ] return all_output_fids ## Returns the sources of the IR. ## This includes both the nodes that have an incoming edge (file) that has no from_node, - ## but also nodes that have no incoming edge (generator nodes). + ## but also nodes that have no incoming edge (generator nodes). def source_nodes(self): sources = set() for _edge_fid, from_node, to_node in self.edges.values(): - if(from_node is None and not to_node is None): + if from_node is None and not to_node is None: sources.add(to_node) for node_id, node in self.nodes.items(): if len(node.get_input_list()) == 0: @@ -629,7 +716,7 @@ def source_nodes(self): def sink_nodes(self): sources = set() for _edge_fid, from_node, to_node in self.edges.values(): - if(to_node is None and not from_node is None): + if to_node is None and not from_node is None: sources.add(from_node) return list(sources) @@ -646,8 +733,8 @@ def get_next_nodes(self, node_id): next_nodes = [] for edge_id in output_edge_ids: _fid, from_node, to_node = self.edges[edge_id] - assert(from_node == node_id) - if(not to_node is None): + assert from_node == node_id + if not to_node is None: next_nodes.append(to_node) return next_nodes @@ -656,14 +743,17 @@ def get_previous_nodes(self, node_id): previous_nodes = [] for edge_id in input_edge_ids: _fid, from_node, to_node = self.edges[edge_id] - assert(to_node == node_id) - if(not from_node is None): + assert to_node == node_id + if not from_node is None: previous_nodes.append(from_node) return previous_nodes def get_node_input_ids_fids(self, node_id): node = self.get_node(node_id) - return [(input_edge_id, self.edges[input_edge_id][0]) for input_edge_id in node.get_input_list()] + return [ + (input_edge_id, self.edges[input_edge_id][0]) + for input_edge_id in node.get_input_list() + ] def get_node_input_ids(self, node_id): return [fid_id for fid_id, _fid in self.get_node_input_ids_fids(node_id)] @@ -673,7 +763,10 @@ def get_node_input_fids(self, node_id): def get_node_output_ids_fids(self, node_id): node = self.get_node(node_id) - return [(output_edge_id, self.edges[output_edge_id][0]) for output_edge_id in node.get_output_list()] + return [ + (output_edge_id, self.edges[output_edge_id][0]) + for output_edge_id in node.get_output_list() + ] def get_node_output_ids(self, node_id): return [fid_id for fid_id, _fid in self.get_node_output_ids_fids(node_id)] @@ -700,7 +793,6 @@ def remove_node(self, node_id): for out_id in node.get_output_list(): self.set_edge_from(out_id, None) - def add_node(self, node): node_id = node.get_id() self.nodes[node_id] = node @@ -722,29 +814,36 @@ def add_edges(self, edge_fids): def add_edge(self, edge_fid): fid_id = edge_fid.get_ident() - assert(not fid_id in self.edges) + assert not fid_id in self.edges self.edges[fid_id] = (edge_fid, None, None) ## Note: We assume that the lack of nodes is an adequate condition ## to check emptiness. def empty(self): - return (len(self.nodes) == 0) + return len(self.nodes) == 0 - def apply_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size): + def apply_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ): splitter = parallelizer.get_splitter() if splitter.is_splitter_round_robin(): - self.apply_round_robin_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size) + self.apply_round_robin_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ) elif splitter.is_splitter_round_robin_with_unwrap_flag(): - self.apply_round_robin_with_unwrap_flag_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size) + self.apply_round_robin_with_unwrap_flag_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ) elif splitter.is_splitter_consec_chunks(): - self.apply_consecutive_chunks_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out) + self.apply_consecutive_chunks_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out + ) else: raise Exception("Splitter not yet implemented") - def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size): + def apply_round_robin_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ): # TODO: this control flow should move done to aggregators once we implement them; # currently, this cannot be done since splitter etc. would be added... aggregator_spec = parallelizer.get_aggregator_spec() @@ -758,124 +857,211 @@ def apply_round_robin_parallelization_to_node(self, node_id, parallelizer, fileI node = self.get_node(node_id) # get info from node, and delete it from graph - streaming_input, streaming_output, configuration_inputs = \ + ( + streaming_input, + streaming_output, + configuration_inputs, + ) = ( node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + ) original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars - can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) if len(prev_nodes) == 1: - first_pred_node, first_pred_cmd_inv = \ - self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + ( + first_pred_node, + first_pred_cmd_inv, + ) = self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) if isinstance(first_pred_node, r_merge.RMerge): can_be_fused_with_prev = True # remove node to be parallelized - self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + self.remove_node( + node_id + ) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph if can_be_fused_with_prev: - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + self.remove_node( + prev_nodes[0] + ) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list - else: # cannot be fused so introduce splitter + else: # cannot be fused so introduce splitter # splitter - round_robin_splitter_generator = lambda input_id, output_ids: r_split.make_r_split(input_id, output_ids, r_split_batch_size) - out_split_ids = self.introduce_splitter(round_robin_splitter_generator, fan_out, fileIdGen, streaming_input) + round_robin_splitter_generator = ( + lambda input_id, output_ids: r_split.make_r_split( + input_id, output_ids, r_split_batch_size + ) + ) + out_split_ids = self.introduce_splitter( + round_robin_splitter_generator, fan_out, fileIdGen, streaming_input + ) in_mapper_ids = out_split_ids # mappers - out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, - parallelizer) - out_mapper_ids = [out_ids[0] for out_ids in out_mapper_ids] # since we get list of list back for potential aux info + out_mapper_ids = self.introduce_mappers( + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ) + out_mapper_ids = [ + out_ids[0] for out_ids in out_mapper_ids + ] # since we get list of list back for potential aux info # aggregator - self.introduce_aggregator_for_round_robin(out_mapper_ids, parallelizer, streaming_output) + self.introduce_aggregator_for_round_robin( + out_mapper_ids, parallelizer, streaming_output + ) - def apply_round_robin_with_unwrap_flag_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out, - r_split_batch_size): + def apply_round_robin_with_unwrap_flag_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ): # round robin with unwrap flag is an inferred parallelizer which ensures that # the command is commutative and has an aggregator for consecutive chunks; # thus we can check whether we can re-open a previous "RR"-parallelization ending with `r_merge` node = self.get_node(node_id) - streaming_input, streaming_output, configuration_inputs = \ + ( + streaming_input, + streaming_output, + configuration_inputs, + ) = ( node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + ) original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) if len(prev_nodes) == 1: - first_pred_node, first_pred_cmd_inv = \ - self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + ( + first_pred_node, + first_pred_cmd_inv, + ) = self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) if isinstance(first_pred_node, r_merge.RMerge): can_be_fused_with_prev = True # remove node to be parallelized - self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph - - if can_be_fused_with_prev: # and node.is_commutative(): implied by how this kind of splitter is inferred - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + self.remove_node( + node_id + ) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + + if ( + can_be_fused_with_prev + ): # and node.is_commutative(): implied by how this kind of splitter is inferred + self.remove_node( + prev_nodes[0] + ) # also sets respective edge to's and from's to None in_unwrap_ids = first_pred_cmd_inv.operand_list out_unwrap_ids = self.introduce_unwraps(fileIdGen, in_unwrap_ids) in_mapper_ids = out_unwrap_ids else: # splitter - round_robin_with_unwrap_flag_splitter_generator = lambda input_id, output_ids: r_split.make_r_split_with_unwrap_flag(input_id, output_ids, r_split_batch_size) - out_split_ids = self.introduce_splitter(round_robin_with_unwrap_flag_splitter_generator, fan_out, fileIdGen, streaming_input) + round_robin_with_unwrap_flag_splitter_generator = ( + lambda input_id, output_ids: r_split.make_r_split_with_unwrap_flag( + input_id, output_ids, r_split_batch_size + ) + ) + out_split_ids = self.introduce_splitter( + round_robin_with_unwrap_flag_splitter_generator, + fan_out, + fileIdGen, + streaming_input, + ) in_mapper_ids = out_split_ids # mappers - out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, - parallelizer) + out_mapper_ids = self.introduce_mappers( + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ) in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output - self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, - original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, - streaming_output) - - def apply_consecutive_chunks_parallelization_to_node(self, node_id, parallelizer, fileIdGen, fan_out): + self.introduce_aggregators_for_consec_chunks( + fileIdGen, + in_aggregator_ids, + original_cmd_invocation_with_io_vars, + out_aggregator_id, + parallelizer, + streaming_output, + ) + + def apply_consecutive_chunks_parallelization_to_node( + self, node_id, parallelizer, fileIdGen, fan_out + ): # check whether we can fuse with previous node's parallelization: # we can do so if the previous node's parallelization is the same, and the aggregator is concatenation # Assumption: it suffices to check that the previous node is an aggregator node of type concatenate # as this is unique for consecutive chunk parallelization (for now, this is true) node = self.get_node(node_id) - streaming_input, streaming_output, configuration_inputs = \ + ( + streaming_input, + streaming_output, + configuration_inputs, + ) = ( node.get_single_streaming_input_single_output_and_configuration_inputs_of_node_for_parallelization() + ) original_cmd_invocation_with_io_vars = node.cmd_invocation_with_io_vars can_be_fused_with_prev = False prev_nodes = self.get_previous_nodes(node_id) if len(prev_nodes) == 1: - first_pred_node, first_pred_cmd_inv = \ - self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) + ( + first_pred_node, + first_pred_cmd_inv, + ) = self.get_only_previous_node_and_only_previous_cmd_invocation(prev_nodes) if first_pred_cmd_inv.is_aggregator_concatenate(): can_be_fused_with_prev = True # remove node to be parallelized - self.remove_node(node_id) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph + self.remove_node( + node_id + ) # remove it here already as as we need to remove edge end points ow. to avoid disconnecting graph to avoid disconnecting graph if can_be_fused_with_prev: - self.remove_node(prev_nodes[0]) # also sets respective edge to's and from's to None + self.remove_node( + prev_nodes[0] + ) # also sets respective edge to's and from's to None in_mapper_ids = first_pred_cmd_inv.operand_list - else: # cannot be fused so introduce splitter + else: # cannot be fused so introduce splitter # splitter - consec_chunks_splitter_generator = lambda input_id, output_ids: pash_split.make_split_file(input_id, output_ids) - out_split_ids = self.introduce_splitter(consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input) + consec_chunks_splitter_generator = ( + lambda input_id, output_ids: pash_split.make_split_file( + input_id, output_ids + ) + ) + out_split_ids = self.introduce_splitter( + consec_chunks_splitter_generator, fan_out, fileIdGen, streaming_input + ) in_mapper_ids = out_split_ids # mappers - out_mapper_ids = self.introduce_mappers(fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, - parallelizer) + out_mapper_ids = self.introduce_mappers( + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ) # aggregators in_aggregator_ids = out_mapper_ids out_aggregator_id = streaming_output - self.introduce_aggregators_for_consec_chunks(fileIdGen, in_aggregator_ids, - original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, - streaming_output) + self.introduce_aggregators_for_consec_chunks( + fileIdGen, + in_aggregator_ids, + original_cmd_invocation_with_io_vars, + out_aggregator_id, + parallelizer, + streaming_output, + ) def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): - assert (len(prev_nodes) > 0) + assert len(prev_nodes) > 0 # get info about first one but also ensure that it is the only one if we fuse assert len(prev_nodes) == 1 first_pred_id = prev_nodes[0] @@ -883,7 +1069,9 @@ def get_only_previous_node_and_only_previous_cmd_invocation(self, prev_nodes): first_pred_cmd_inv = first_pred_node.cmd_invocation_with_io_vars return first_pred_node, first_pred_cmd_inv - def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_input): + def introduce_splitter( + self, splitter_generator, fan_out, fileIdGen, streaming_input + ): out_split_ids = self.generate_ephemeral_edges(fileIdGen, fan_out) splitter = splitter_generator(streaming_input, out_split_ids) self.set_edge_to(streaming_input, splitter.get_id()) @@ -892,23 +1080,38 @@ def introduce_splitter(self, splitter_generator, fan_out, fileIdGen, streaming_i self.add_node(splitter) return out_split_ids - def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invocation_with_io_vars, parallelizer): + def introduce_mappers( + self, + fan_out, + fileIdGen, + in_mapper_ids, + original_cmd_invocation_with_io_vars, + parallelizer, + ): # -> [[input, aux1, aux2], [...], [...], ...] num_aux_mapper_to_aggregator = parallelizer.info_mapper_aggregator out_mapper_ids = [] - for _ in range(0,fan_out): - out_mapper_ids.append(self.generate_ephemeral_edges(fileIdGen, num_aux_mapper_to_aggregator+1)) + for _ in range(0, fan_out): + out_mapper_ids.append( + self.generate_ephemeral_edges( + fileIdGen, num_aux_mapper_to_aggregator + 1 + ) + ) # TODO: Fix that we use different ones here! # list of output, aux_output_1, aux_output_2, ... zip_mapper_in_out_ids = zip(in_mapper_ids, out_mapper_ids) all_mappers = [] - for (in_id, out_ids) in zip_mapper_in_out_ids: + for in_id, out_ids in zip_mapper_in_out_ids: # BEGIN: these 4 lines could be refactored to be a function in graph such that # creating end point of edges and the creation of edges is not decoupled out_id = out_ids[0] aux_out_ids = out_ids[1:] - mapper_cmd_inv = parallelizer.get_actual_mapper(original_cmd_invocation_with_io_vars, in_id, out_id, aux_out_ids) - mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(mapper_cmd_inv) + mapper_cmd_inv = parallelizer.get_actual_mapper( + original_cmd_invocation_with_io_vars, in_id, out_id, aux_out_ids + ) + mapper = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + mapper_cmd_inv + ) self.set_edge_to(in_id, mapper.get_id()) self.set_edge_from(out_id, mapper.get_id()) for aux_out_id in aux_out_ids: @@ -926,27 +1129,49 @@ def introduce_mappers(self, fan_out, fileIdGen, in_mapper_ids, original_cmd_invo return out_mapper_ids def introduce_unwraps(self, fileIdGen, in_unwrap_ids): - unwrap_to_commutative_mappers_ids = self.generate_ephemeral_edges(fileIdGen, len(in_unwrap_ids)) + unwrap_to_commutative_mappers_ids = self.generate_ephemeral_edges( + fileIdGen, len(in_unwrap_ids) + ) in_out_unwrap_ids = zip(in_unwrap_ids, unwrap_to_commutative_mappers_ids) for in_unwrap, out_unwrap in in_out_unwrap_ids: unwrap = r_unwrap.make_unwrap_node([in_unwrap], out_unwrap) self.add_node(unwrap) - self.set_edge_to(in_unwrap, unwrap.get_id()) # from are still (wrapped) mappers - self.set_edge_from(out_unwrap, unwrap.get_id()) # to will be set to mappers of current node + self.set_edge_to( + in_unwrap, unwrap.get_id() + ) # from are still (wrapped) mappers + self.set_edge_from( + out_unwrap, unwrap.get_id() + ) # to will be set to mappers of current node in_mapper_ids = unwrap_to_commutative_mappers_ids return in_mapper_ids - def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, - original_cmd_invocation_with_io_vars, out_aggregator_id, parallelizer, - streaming_output): + def introduce_aggregators_for_consec_chunks( + self, + fileIdGen, + in_aggregator_ids, + original_cmd_invocation_with_io_vars, + out_aggregator_id, + parallelizer, + streaming_output, + ): # in_aggregator_ids: [[input, aux1, aux2, ...], [...], [...], ...] if parallelizer.info_mapper_aggregator == 0: - in_aggregator_ids = [in_ids[0] for in_ids in in_aggregator_ids] # since we get list of list back for potential aux info + in_aggregator_ids = [ + in_ids[0] for in_ids in in_aggregator_ids + ] # since we get list of list back for potential aux info aggregator_spec = parallelizer.get_aggregator_spec() - if aggregator_spec.is_aggregator_spec_concatenate() or aggregator_spec.is_aggregator_spec_custom_n_ary(): - aggregator_cmd_inv = parallelizer.get_actual_aggregator(original_cmd_invocation_with_io_vars, - in_aggregator_ids, out_aggregator_id) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + if ( + aggregator_spec.is_aggregator_spec_concatenate() + or aggregator_spec.is_aggregator_spec_custom_n_ary() + ): + aggregator_cmd_inv = parallelizer.get_actual_aggregator( + original_cmd_invocation_with_io_vars, + in_aggregator_ids, + out_aggregator_id, + ) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + aggregator_cmd_inv + ) for in_aggregator_id in in_aggregator_ids: self.set_edge_to(in_aggregator_id, aggregator.get_id()) self.set_edge_from(streaming_output, aggregator.get_id()) @@ -958,17 +1183,29 @@ def introduce_aggregators_for_consec_chunks(self, fileIdGen, in_aggregator_ids, # TODO: we simplify and assume that every mapper produces a single output for now map_in_aggregator_ids = [[id] for id in in_aggregator_ids] # TODO: turn node into cmd_invocation_with_io_vars since this is the only thing required in this function - self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, map_in_aggregator_ids, out_aggregator_id, fileIdGen) + self.create_generic_aggregator_tree( + original_cmd_invocation_with_io_vars, + parallelizer, + map_in_aggregator_ids, + out_aggregator_id, + fileIdGen, + ) else: raise Exception("aggregator kind not yet implemented") - else: # we got auxiliary information - assert(parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary()) + else: # we got auxiliary information + assert parallelizer.core_aggregator_spec.is_aggregator_spec_custom_2_ary() map_in_aggregator_ids = in_aggregator_ids - self.create_generic_aggregator_tree(original_cmd_invocation_with_io_vars, parallelizer, - map_in_aggregator_ids, out_aggregator_id, fileIdGen) - - - def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, streaming_output): + self.create_generic_aggregator_tree( + original_cmd_invocation_with_io_vars, + parallelizer, + map_in_aggregator_ids, + out_aggregator_id, + fileIdGen, + ) + + def introduce_aggregator_for_round_robin( + self, out_mapper_ids, parallelizer, streaming_output + ): aggregator_spec = parallelizer.get_aggregator_spec() if aggregator_spec.is_aggregator_spec_concatenate(): in_aggregator_ids = out_mapper_ids @@ -985,13 +1222,10 @@ def introduce_aggregator_for_round_robin(self, out_mapper_ids, parallelizer, str # TODO: this is where the other cases for aggregators need to be added pass - - - ## Replicates an edge using tee and returns the new node_id. def tee_edge(self, edge_id, times, fileIdGen): ## Assert that the edge is unplugged - assert(self.edges[edge_id][2] is None) + assert self.edges[edge_id][2] is None output_fids = [fileIdGen.next_ephemeral_file_id() for _ in range(times)] output_ids = [fid.get_ident() for fid in output_fids] @@ -1005,9 +1239,9 @@ def tee_edge(self, edge_id, times, fileIdGen): self.add_from_edge(new_node_id, edge_fid) self.add_node(new_node) self.set_edge_to(edge_id, new_node_id) - + return new_node_id - + def generate_graphviz(self): ## TODO: It is unclear if importing in here (instead of in general) ## improves startup cost of the pash_runtime when not using graphviz. @@ -1022,7 +1256,7 @@ def generate_graphviz(self): dot = node.add_dot_node(dot, node_id) ## (I/O) File nodes should be boxes - dot.attr('node', shape='box') + dot.attr("node", shape="box") ## Then generate all edges and input+output files for fid, from_node, to_node in self.edges.values(): @@ -1032,7 +1266,7 @@ def generate_graphviz(self): ## TODO: We should investigate why this happens if fid.has_file_resource(): label = fid.serialize() - node_id = f'file-{str(fid.get_ident())}' + node_id = f"file-{str(fid.get_ident())}" dot.node(node_id, label) if from_node is None: @@ -1049,27 +1283,51 @@ def generate_graphviz(self): def edge_node_consistency(self): ## Check if edges and nodes are consistent for edge_id, (_, from_node_id, to_node_id) in self.edges.items(): - if (not from_node_id is None): + if not from_node_id is None: from_node = self.get_node(from_node_id) - if(not (edge_id in from_node.get_output_list())): - log("Consistency Error: Edge id:", edge_id, "is not in the node outputs:", from_node) + if not (edge_id in from_node.get_output_list()): + log( + "Consistency Error: Edge id:", + edge_id, + "is not in the node outputs:", + from_node, + ) return False - if (not to_node_id is None): + if not to_node_id is None: to_node = self.get_node(to_node_id) - if(not (edge_id in to_node.get_input_list())): - log("Consistency Error: Edge id:", edge_id, "is not in the node inputs:", to_node) + if not (edge_id in to_node.get_input_list()): + log( + "Consistency Error: Edge id:", + edge_id, + "is not in the node inputs:", + to_node, + ) return False for node_id, node in self.nodes.items(): for edge_id in node.get_input_list(): _, _, to_node_id = self.edges[edge_id] - if(not (to_node_id == node_id)): - log("Consistency Error: The to_node_id of the input_edge:", edge_id, "of the node:", node, "is equal to:", to_node_id) + if not (to_node_id == node_id): + log( + "Consistency Error: The to_node_id of the input_edge:", + edge_id, + "of the node:", + node, + "is equal to:", + to_node_id, + ) return False for edge_id in node.get_output_list(): _, from_node_id, _ = self.edges[edge_id] - if(not (from_node_id == node_id)): - log("Consistency Error: The from_node_id of the output_edge:", edge_id, "of the node:", node, "is equal to:", from_node_id) + if not (from_node_id == node_id): + log( + "Consistency Error: The from_node_id of the output_edge:", + edge_id, + "of the node:", + node, + "is equal to:", + from_node_id, + ) return False return True @@ -1078,42 +1336,65 @@ def edge_node_consistency(self): ## has at least one node, and stdin, stdout set to some non-null ## file identifiers. def valid(self): - return (len(self.nodes) > 0 and - self.edge_node_consistency() and - (not self.is_in_background() - or (self.get_stdin() is None))) - ## The following is not true. Background IRs should not have stdin, but they can have stdout. - # and self.get_stdout() is None))) - ## The following is not true. A DFG might not have an stdin - # or (not self.is_in_background() - # and not self.get_stdin() is None - # and not self.get_stdout() is None))) + return ( + len(self.nodes) > 0 + and self.edge_node_consistency() + and (not self.is_in_background() or (self.get_stdin() is None)) + ) + ## The following is not true. Background IRs should not have stdin, but they can have stdout. + # and self.get_stdout() is None))) + ## The following is not true. A DFG might not have an stdin + # or (not self.is_in_background() + # and not self.get_stdin() is None + # and not self.get_stdout() is None))) ## This is a function that creates a reduce tree for a given node - def create_generic_aggregator_tree(self, cmd_invocation_with_io_vars, parallelizer, input_ids_for_aggregators, out_aggregator_id, fileIdGen): + def create_generic_aggregator_tree( + self, + cmd_invocation_with_io_vars, + parallelizer, + input_ids_for_aggregators, + out_aggregator_id, + fileIdGen, + ): def function_to_get_binary_aggregator(in_ids, out_ids): if len(out_ids) == 1: - aggregator_cmd_inv = parallelizer.get_actual_aggregator(cmd_invocation_with_io_vars, in_ids, out_ids[0]) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + aggregator_cmd_inv = parallelizer.get_actual_aggregator( + cmd_invocation_with_io_vars, in_ids, out_ids[0] + ) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + aggregator_cmd_inv + ) return aggregator else: # list has been flattened ... num_input_ids = len(in_ids) - assert(num_input_ids % 2 == 0) + assert num_input_ids % 2 == 0 fst_normal_input = in_ids[0] - fst_aux_inputs_from = in_ids[1:int(num_input_ids/2)] - snd_normal_input = in_ids[int(num_input_ids/2)] - snd_aux_inputs_from = in_ids[int(num_input_ids/2)+1:] + fst_aux_inputs_from = in_ids[1 : int(num_input_ids / 2)] + snd_normal_input = in_ids[int(num_input_ids / 2)] + snd_aux_inputs_from = in_ids[int(num_input_ids / 2) + 1 :] output_to = out_ids[0] aux_outputs_to = out_ids[1:] aggregator_cmd_inv = parallelizer.get_actual_2_ary_aggregator_with_aux( - fst_normal_input, fst_aux_inputs_from, snd_normal_input, snd_aux_inputs_from, - output_to, aux_outputs_to) - aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars(aggregator_cmd_inv) + fst_normal_input, + fst_aux_inputs_from, + snd_normal_input, + snd_aux_inputs_from, + output_to, + aux_outputs_to, + ) + aggregator = DFGNode.make_simple_dfg_node_from_cmd_inv_with_io_vars( + aggregator_cmd_inv + ) return aggregator + ## The Aggregator node takes a sequence of input ids and an output id - all_aggregators, new_edges, final_output_id = self.create_reduce_tree(lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), - input_ids_for_aggregators, fileIdGen) + all_aggregators, new_edges, final_output_id = self.create_reduce_tree( + lambda in_ids, out_ids: function_to_get_binary_aggregator(in_ids, out_ids), + input_ids_for_aggregators, + fileIdGen, + ) ## Add the edges in the graph self.add_edges(new_edges) ## Add the merge commands in the graph @@ -1135,8 +1416,10 @@ def create_reduce_tree(self, init_func, input_ids, fileIdGen): tree = [] new_edges = [] curr_ids = input_ids - while(len(curr_ids) > 1): - new_level, curr_ids, new_fids = self.create_reduce_tree_level(init_func, curr_ids, fileIdGen) + while len(curr_ids) > 1: + new_level, curr_ids, new_fids = self.create_reduce_tree_level( + init_func, curr_ids, fileIdGen + ) tree += new_level new_edges += new_fids @@ -1145,15 +1428,21 @@ def create_reduce_tree(self, init_func, input_ids, fileIdGen): ## Drain the final auxiliary outputs final_auxiliary_outputs = curr_ids[0][1:] - drain_fids = [fileIdGen.next_file_id() - for final_auxiliary_output in final_auxiliary_outputs] + drain_fids = [ + fileIdGen.next_file_id() + for final_auxiliary_output in final_auxiliary_outputs + ] for drain_fid in drain_fids: - drain_fid.set_resource(FileResource(Arg.string_to_arg('/dev/null'))) + drain_fid.set_resource(FileResource(Arg.string_to_arg("/dev/null"))) new_edges.append(drain_fid) drain_ids = [fid.get_ident() for fid in drain_fids] - drain_cat_commands = [make_cat_node([final_auxiliary_output], drain_id) - for final_auxiliary_output, drain_id in zip(final_auxiliary_outputs, drain_ids)] + drain_cat_commands = [ + make_cat_node([final_auxiliary_output], drain_id) + for final_auxiliary_output, drain_id in zip( + final_auxiliary_outputs, drain_ids + ) + ] return (tree + drain_cat_commands), new_edges, final_output_id @staticmethod @@ -1161,7 +1450,7 @@ def create_reduce_tree(self, init_func, input_ids, fileIdGen): ## output file ids must be lists of lists, as the input file ids and ## the output file ids might contain auxiliary files. def create_reduce_tree_level(init_func, input_ids, fileIdGen): - if(len(input_ids) % 2 == 0): + if len(input_ids) % 2 == 0: output_ids = [] even_input_ids = input_ids else: @@ -1175,7 +1464,9 @@ def create_reduce_tree_level(init_func, input_ids, fileIdGen): new_fids += new_out_fids new_out_ids = [fid.get_ident() for fid in new_out_fids] output_ids.append(new_out_ids) - new_node = IR.create_reduce_node(init_func, even_input_ids[i:i+2], new_out_ids) + new_node = IR.create_reduce_node( + init_func, even_input_ids[i : i + 2], new_out_ids + ) level.append(new_node) return (level, output_ids, new_fids) @@ -1183,6 +1474,5 @@ def create_reduce_tree_level(init_func, input_ids, fileIdGen): ## This function creates one node of the reduce tree def create_reduce_node(init_func, input_ids, output_ids): return init_func(flatten_list(input_ids), output_ids) - # TODO: this is where we need to use our aggregator spec/node - + # TODO: this is where we need to use our aggregator spec/node diff --git a/compiler/ir_to_ast.py b/compiler/ir_to_ast.py index 033eb34d6..1e6277853 100644 --- a/compiler/ir_to_ast.py +++ b/compiler/ir_to_ast.py @@ -6,8 +6,9 @@ from parse import from_ast_objects_to_shell import config -RM_PASH_FIFOS_NAME="rm_pash_fifos" -MKFIFO_PASH_FIFOS_NAME="mkfifo_pash_fifos" +RM_PASH_FIFOS_NAME = "rm_pash_fifos" +MKFIFO_PASH_FIFOS_NAME = "mkfifo_pash_fifos" + def to_shell(ir, args): backend_start_time = datetime.now() @@ -27,9 +28,9 @@ def to_shell(ir, args): def ir2ast(ir, args): clean_up_graph = False drain_streams = False - if(args.termination == "clean_up_graph"): + if args.termination == "clean_up_graph": clean_up_graph = True - elif(args.termination == "drain_stream"): + elif args.termination == "drain_stream": drain_streams = True ## NOTE: We first need to make the main body because it might create additional ephemeral fids. @@ -52,8 +53,7 @@ def ir2ast(ir, args): # log("All fids:", all_fids) ## Find all the ephemeral fids and turn them to ASTs - ephemeral_fids = [fid for fid in all_fids - if fid.is_ephemeral()] + ephemeral_fids = [fid for fid in all_fids if fid.is_ephemeral()] # log("Ephemeral fids:", ephemeral_fids) @@ -67,6 +67,7 @@ def ir2ast(ir, args): return final_asts + def make_rms_f_prologue_epilogue(ephemeral_fids): asts = [] ## Create an `rm -f` for each ephemeral fid @@ -76,6 +77,7 @@ def make_rms_f_prologue_epilogue(ephemeral_fids): asts.append(command) return asts + def make_ir_prologue(ephemeral_fids) -> "list[AstNode]": asts = [] ## Create an `rm -f` for each ephemeral fid @@ -89,7 +91,7 @@ def make_ir_prologue(ephemeral_fids) -> "list[AstNode]": args = [eph_fid.to_ast()] command = make_mkfifo_ast(args) mkfifo_asts.append(command) - + defun_mkfifos = make_defun(MKFIFO_PASH_FIFOS_NAME, make_semi_sequence(mkfifo_asts)) asts.append(defun_mkfifos) @@ -102,14 +104,20 @@ def make_ir_prologue(ephemeral_fids) -> "list[AstNode]": class_asts = [to_ast_node(ast) for ast in asts] return class_asts + def make_ir_epilogue(ephemeral_fids, clean_up_graph, log_file) -> "list[AstNode]": asts = [] - if (clean_up_graph): + if clean_up_graph: ## TODO: Wait for all output nodes not just one - pids = [[standard_var_ast('!')]] - clean_up_path_script = os.path.join(config.PASH_TOP, config.config['runtime']['clean_up_graph_binary']) - com_args = [string_to_argument('source'), string_to_argument(clean_up_path_script)] + pids - if (log_file == ""): + pids = [[standard_var_ast("!")]] + clean_up_path_script = os.path.join( + config.PASH_TOP, config.config["runtime"]["clean_up_graph_binary"] + ) + com_args = [ + string_to_argument("source"), + string_to_argument(clean_up_path_script), + ] + pids + if log_file == "": com = make_command(com_args) else: redirection = redir_append_stderr_to_string_file(log_file) @@ -117,7 +125,7 @@ def make_ir_epilogue(ephemeral_fids, clean_up_graph, log_file) -> "list[AstNode] asts.append(com) else: ## Otherwise we just wait for all processes to die. - wait_com = make_command([string_to_argument('wait')]) + wait_com = make_command([string_to_argument("wait")]) exit_status = make_command([string_to_argument("internal_exec_status=$?")]) asts.extend([wait_com, exit_status]) @@ -125,25 +133,28 @@ def make_ir_epilogue(ephemeral_fids, clean_up_graph, log_file) -> "list[AstNode] call_rm_pash_funs = make_command([string_to_argument(RM_PASH_FIFOS_NAME)]) asts.append(call_rm_pash_funs) - ## Make the following command: + ## Make the following command: # (exit $internal_exec_status) exit_ec_ast = make_exit_ec_ast() asts.append(exit_ec_ast) - + class_asts = [to_ast_node(ast) for ast in asts] return class_asts + def make_exit_ec_ast(): - command = make_command([string_to_argument("exit"), - [make_quoted_variable("internal_exec_status")]]) + command = make_command( + [string_to_argument("exit"), [make_quoted_variable("internal_exec_status")]] + ) ast = make_subshell(command) return ast - + def make_rm_f_ast(arguments): all_args = [string_to_argument("rm"), string_to_argument("-f")] + arguments return make_command(all_args) + def make_mkfifo_ast(arguments): all_args = [string_to_argument("mkfifo")] + arguments return make_command(all_args) diff --git a/compiler/parse.py b/compiler/parse.py index 74514b987..4b680212f 100644 --- a/compiler/parse.py +++ b/compiler/parse.py @@ -12,6 +12,7 @@ import libdash.parser + ## Parses straight a shell script to an AST ## through python without calling it as an executable INITIALIZE_LIBDASH = True @@ -22,33 +23,44 @@ def parse_shell_to_asts(input_script_path): INITIALIZE_LIBDASH = False ## Transform the untyped ast objects to typed ones typed_ast_objects = [] - for untyped_ast, original_text, linno_before, linno_after, in new_ast_objects: - typed_ast = to_ast_node(untyped_ast) - typed_ast_objects.append((typed_ast, original_text, linno_before, linno_after)) + for ( + untyped_ast, + original_text, + linno_before, + linno_after, + ) in new_ast_objects: + typed_ast = to_ast_node(untyped_ast) + typed_ast_objects.append( + (typed_ast, original_text, linno_before, linno_after) + ) return typed_ast_objects except libdash.parser.ParsingException as e: log("Parsing error!", e) sys.exit(1) + def parse_shell_to_asts_interactive(input_script_path: str): return libdash.parser.parse(input_script_path) + def from_ast_objects_to_shell(asts): shell_list = [] for ast in asts: # log("Ast:", ast) - if(isinstance(ast, UnparsedScript)): + if isinstance(ast, UnparsedScript): shell_list.append(ast.text) else: shell_list.append(ast.pretty()) return "\n".join(shell_list) + "\n" + def from_ast_objects_to_shell_file(asts, new_shell_filename): script = from_ast_objects_to_shell(asts) - with open(new_shell_filename, 'w') as new_shell_file: + with open(new_shell_filename, "w") as new_shell_file: new_shell_file.write(script) + ## Simply wraps the string_of_arg def pash_string_of_arg(arg, quoted=False): return string_of_arg(arg, quoted) diff --git a/compiler/pash.py b/compiler/pash.py index c8fee1391..627da39af 100755 --- a/compiler/pash.py +++ b/compiler/pash.py @@ -17,93 +17,122 @@ LOGGING_PREFIX = "PaSh: " + @logging_prefix(LOGGING_PREFIX) def main(): ## Parse arguments args, shell_name = parse_args() ## If it is interactive we need a different execution mode ## - ## The user can also ask for an interactive mode irregardless of whether pash was invoked in interactive mode. - if(len(args.input) == 0 or args.interactive): + ## The user can also ask for an interactive mode irregardless of whether pash was invoked in interactive mode. + if len(args.input) == 0 or args.interactive: log("ERROR: --interactive option is not supported!", level=0) - assert(False) + assert False else: input_script_path = args.input[0] input_script_arguments = args.input[1:] ## Preprocess and execute the parsed ASTs - return_code = preprocess_and_execute_asts(input_script_path, args, input_script_arguments, shell_name) - - log("-" * 40) #log end marker + return_code = preprocess_and_execute_asts( + input_script_path, args, input_script_arguments, shell_name + ) + + log("-" * 40) # log end marker ## Return the exit code of the executed script sys.exit(return_code) -def preprocess_and_execute_asts(input_script_path, args, input_script_arguments, shell_name): + +def preprocess_and_execute_asts( + input_script_path, args, input_script_arguments, shell_name +): preprocessed_shell_script = preprocess(input_script_path, args) - if(args.output_preprocessed): + if args.output_preprocessed: log("Preprocessed script:") log(preprocessed_shell_script) - + ## Write the new shell script to a file to execute fname = ptempfile() log("Preprocessed script stored in:", fname) - with open(fname, 'w') as new_shell_file: + with open(fname, "w") as new_shell_file: new_shell_file.write(preprocessed_shell_script) - ## 4. Execute the preprocessed version of the input script - if(not args.preprocess_only): - return_code = execute_script(fname, args.command, input_script_arguments, shell_name) + if not args.preprocess_only: + return_code = execute_script( + fname, args.command, input_script_arguments, shell_name + ) else: return_code = 0 return return_code - def parse_args(): prog_name = sys.argv[0] - if 'PASH_FROM_SH' in os.environ: - prog_name = os.environ['PASH_FROM_SH'] + if "PASH_FROM_SH" in os.environ: + prog_name = os.environ["PASH_FROM_SH"] ## We need to set `+` as a prefix char too - parser = argparse.ArgumentParser(prog_name, prefix_chars='-+') - parser.add_argument("input", nargs='*', help="the script to be compiled and executed (followed by any command-line arguments") - parser.add_argument("--preprocess_only", - help="only preprocess the input script and not execute it", - action="store_true") - parser.add_argument("--output_preprocessed", - help=" output the preprocessed script", - action="store_true") - parser.add_argument("--interactive", - help="Executes the script using an interactive internal shell session (experimental)", - action="store_true") - parser.add_argument("-c", "--command", - help="Evaluate the following as a script, rather than a file", - default=None) + parser = argparse.ArgumentParser(prog_name, prefix_chars="-+") + parser.add_argument( + "input", + nargs="*", + help="the script to be compiled and executed (followed by any command-line arguments", + ) + parser.add_argument( + "--preprocess_only", + help="only preprocess the input script and not execute it", + action="store_true", + ) + parser.add_argument( + "--output_preprocessed", + help=" output the preprocessed script", + action="store_true", + ) + parser.add_argument( + "--interactive", + help="Executes the script using an interactive internal shell session (experimental)", + action="store_true", + ) + parser.add_argument( + "-c", + "--command", + help="Evaluate the following as a script, rather than a file", + default=None, + ) ## This is not the correct way to parse these, because more than one option can be given together, e.g., -ae - parser.add_argument("-a", - help="Enabling the `allexport` shell option", - action="store_true", - default=False) - parser.add_argument("+a", - help="Disabling the `allexport` shell option", - action="store_false", - default=False) + parser.add_argument( + "-a", + help="Enabling the `allexport` shell option", + action="store_true", + default=False, + ) + parser.add_argument( + "+a", + help="Disabling the `allexport` shell option", + action="store_false", + default=False, + ) ## These two are here for compatibility with respect to bash - parser.add_argument("-v", - help="(experimental) prints shell input lines as they are read", - action="store_true") - parser.add_argument("-x", - help="(experimental) prints commands and their arguments as they execute", - action="store_true") + parser.add_argument( + "-v", + help="(experimental) prints shell input lines as they are read", + action="store_true", + ) + parser.add_argument( + "-x", + help="(experimental) prints commands and their arguments as they execute", + action="store_true", + ) ## Deprecated argument... keeping here just to output the message ## TODO: Do that with a custom argparse Action (KK: I tried and failed) - parser.add_argument("--expand_using_bash_mirror", - help="DEPRECATED: instead of expanding using the internal expansion code, expand using a bash mirror process (slow)", - action="store_true") + parser.add_argument( + "--expand_using_bash_mirror", + help="DEPRECATED: instead of expanding using the internal expansion code, expand using a bash mirror process (slow)", + action="store_true", + ) ## Set the preprocessing mode to PaSh - parser.set_defaults(preprocess_mode='pash') + parser.set_defaults(preprocess_mode="pash") config.add_common_arguments(parser) args = parser.parse_args() @@ -132,34 +161,38 @@ def parse_args(): ## Print the deprecated argument if args.expand_using_bash_mirror: - log("WARNING: Option --expand_using_bash_mirror is deprecated and is *ignored*.", level=0) + log( + "WARNING: Option --expand_using_bash_mirror is deprecated and is *ignored*.", + level=0, + ) ## TODO: We might need to have a better default (like $0 of pa.sh) shell_name = "pash" if args.command is not None: fname = ptempfile() - with open(fname, 'w') as f: + with open(fname, "w") as f: f.write(args.command) ## If the shell is invoked with -c and arguments after it, then these arguments ## need to be assigned to $0, $1, $2, ... and not $1, $2, $3, ... - if(len(args.input) > 0): + if len(args.input) > 0: ## Assign $0 shell_name = args.input[0] args.input = args.input[1:] args.input = [fname] + args.input - elif (len(args.input) > 0): + elif len(args.input) > 0: shell_name = args.input[0] - return args, shell_name + def shell_env(shell_name: str): new_env = os.environ.copy() new_env["PASH_TMP_PREFIX"] = config.PASH_TMP_PREFIX new_env["pash_shell_name"] = shell_name return new_env + ## The following two functions need to correspond completely def bash_prefix_args(): subprocess_args = ["/usr/bin/env", "bash"] @@ -174,28 +207,36 @@ def bash_prefix_args(): subprocess_args.append("-x") return subprocess_args + def bash_exec_string(shell_name): flags = [] if config.pash_args.a: - flags.append('-a') + flags.append("-a") if config.pash_args.v: - flags.append('-v') + flags.append("-v") if config.pash_args.x: - flags.append('-x') + flags.append("-x") return "exec -a{} bash {} -s $@\n".format(shell_name, " ".join(flags)) + def execute_script(compiled_script_filename, command, arguments, shell_name): new_env = shell_env(shell_name) subprocess_args = bash_prefix_args() - subprocess_args += ["-c", 'source {}'.format(compiled_script_filename), shell_name] + arguments + subprocess_args += [ + "-c", + "source {}".format(compiled_script_filename), + shell_name, + ] + arguments # subprocess_args = ["/usr/bin/env", "bash", compiled_script_filename] + arguments - log("Executing:", "PASH_TMP_PREFIX={} pash_shell_name={} {}".format(config.PASH_TMP_PREFIX, - shell_name, - " ".join(subprocess_args))) + log( + "Executing:", + "PASH_TMP_PREFIX={} pash_shell_name={} {}".format( + config.PASH_TMP_PREFIX, shell_name, " ".join(subprocess_args) + ), + ) exec_obj = subprocess.run(subprocess_args, env=new_env, close_fds=False) return exec_obj.returncode + if __name__ == "__main__": main() - - diff --git a/compiler/pash_compilation_server.py b/compiler/pash_compilation_server.py index efc766724..47e352867 100644 --- a/compiler/pash_compilation_server.py +++ b/compiler/pash_compilation_server.py @@ -3,6 +3,7 @@ import traceback from threading import Thread from datetime import datetime, timedelta + # import queue from sh_expand import env_vars_util @@ -15,7 +16,7 @@ import server_util ## -## A Daemon (not with the strict Unix sense) +## A Daemon (not with the strict Unix sense) ## that responds to requests for compilation ## @@ -24,8 +25,10 @@ def handler(signum, frame): log("Signal:", signum, "caught") shutdown() + signal.signal(signal.SIGTERM, handler) + def parse_args(): parser = argparse.ArgumentParser(add_help=False) config.add_common_arguments(parser) @@ -33,13 +36,14 @@ def parse_args(): return args + # Initialize the daemon def init(): ## Set the logging prefix config.LOGGING_PREFIX = "Daemon: " - + args = parse_args() config.set_config_globals_from_pash_args(args) @@ -47,12 +51,11 @@ def init(): if not config.config: config.load_config(args.config_path) - pash_compiler.runtime_config = config.config['distr_planner'] + pash_compiler.runtime_config = config.config["distr_planner"] return args - ## ## This class holds information for each process id ## @@ -66,7 +69,7 @@ def __init__(self, input_ir, compiler_config, exec_time=None, start_exec_time=No def set_exec_time(self, exec_time): self.exec_time = exec_time - + def set_start_exec_time(self, start_exec_time): self.start_exec_time = start_exec_time @@ -74,19 +77,19 @@ def get_start_exec_time(self): return self.start_exec_time def __repr__(self): - return f'ProcIdInfo(InputIR:{self.input_ir}, CompConfig:{self.compiler_config}, ExecTime:{self.exec_time})' + return f"ProcIdInfo(InputIR:{self.input_ir}, CompConfig:{self.compiler_config}, ExecTime:{self.exec_time})" class Scheduler: - """ Takes care of running processes in parallel if there is no conflict. + """Takes care of running processes in parallel if there is no conflict. The scheduler relies on the fact that process will wait for a compilation response. This allows it to control wether to allow the next process to run or wait for all other process. Flow: - input cmd -> - | Compile -> + input cmd -> + | Compile -> 1- Try compiling the pipeline 2- Wait for any unsafe processes to finish - 3- Check compilation for success and any conficts + 3- Check compilation for success and any conficts - no side effects -> allow to run in parallel by sending a response - failed or conflict -> wait for all process to exit then run this process in unsafe mode @@ -102,7 +105,9 @@ class Scheduler: def __init__(self): self.input_resources = set() self.output_resources = set() - self.process_resources = {} # map process_id -> (input_resources, output_resources) + self.process_resources = ( + {} + ) # map process_id -> (input_resources, output_resources) self.next_id = 0 self.running_procs = 0 self.unsafe_running = False @@ -112,7 +117,7 @@ def __init__(self): self.reader_pipes_are_blocking = True self.request_processing_start_time = 0 ## TODO: Make that be a class or something - + ## A map that keeps mappings between proc_id and (input_ir, width, exec_time) self.process_id_input_ir_map = {} ## This is a map from input IRs, i.e., locations in the code, to a list of process_ids @@ -121,7 +126,9 @@ def __init__(self): def check_resources_safety(self, process_id): proc_input_resources, proc_output_resources = self.process_resources[process_id] all_proc_resources = proc_input_resources.union(proc_output_resources) - if self.output_resources.intersection(all_proc_resources) or self.input_resources.intersection(proc_output_resources): + if self.output_resources.intersection( + all_proc_resources + ) or self.input_resources.intersection(proc_output_resources): return False return True @@ -144,12 +151,12 @@ def determine_compiler_config(self, input_ir_file): ## Goal: Find the highest width that gives benefits ## ## Strategy, start trying lower widths, if the time seems to drop, keep trying lower. - ## + ## width_avgs = self.get_averages_per_width(input_ir_file) log("Width averages:", width_avgs) widths = width_avgs.keys() - - ## If we have at least 1, with a specific width, + + ## If we have at least 1, with a specific width, ## and the minimum width has the lowest average, then try one lower if len(widths) > 0: min_width = min(widths) @@ -168,7 +175,10 @@ def determine_compiler_config(self, input_ir_file): if best_width == min_width and min_width > 1: ## Divide the min_width by 2 and try again selected_width = min_width // 2 - log("Best width is the lowest width, trying with width:", selected_width) + log( + "Best width is the lowest width, trying with width:", + selected_width, + ) else: selected_width = best_width log("Best width is:", best_width, "We will keep executing with it.") @@ -199,19 +209,19 @@ def get_averages_per_width(self, input_ir_file): width_times[width].append(exec_time) except: width_times[width] = [exec_time] - + ## We have gathered all times for each width width_avgs = {} for width, exec_times in width_times.items(): width_avgs[width] = sum(exec_times) / len(exec_times) - + return width_avgs ## This adds the time measurement, or just removes the entry if there is no exec_time (for space reclamation) def handle_time_measurement(self, process_id, exec_time): ## TODO: Could put those behind the profile_driven check too to not fill memory - assert(self.process_id_input_ir_map[process_id].exec_time is None) - + assert self.process_id_input_ir_map[process_id].exec_time is None + ## If we don't have the exec time we do Nothing ## ## TODO: Consider removing past entries that have no execution time. @@ -223,8 +233,10 @@ def handle_time_measurement(self, process_id, exec_time): # log("All measurements:", self.process_id_input_ir_map) def add_proc_id_map(self, process_id, input_ir_file, compiler_config): - assert(not process_id in self.process_id_input_ir_map) - self.process_id_input_ir_map[process_id] = ProcIdInfo(input_ir_file, compiler_config) + assert not process_id in self.process_id_input_ir_map + self.process_id_input_ir_map[process_id] = ProcIdInfo( + input_ir_file, compiler_config + ) ## Add the mapping from ir to process_id try: @@ -246,7 +258,9 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): config.set_vars_file(var_file, vars_dict) variable_reading_end_time = datetime.now() - print_time_delta("Variable Loading", variable_reading_start_time, variable_reading_end_time) + print_time_delta( + "Variable Loading", variable_reading_start_time, variable_reading_end_time + ) daemon_compile_start_time = datetime.now() ## TODO: Make the compiler config based on profiling data @@ -255,39 +269,60 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): self.add_proc_id_map(process_id, input_ir_file, compiler_config) ast_or_ir = pash_compiler.compile_ir( - input_ir_file, compiled_script_file, config.pash_args, compiler_config) + input_ir_file, compiled_script_file, config.pash_args, compiler_config + ) daemon_compile_end_time = datetime.now() - print_time_delta("Daemon Compile", daemon_compile_start_time, daemon_compile_end_time) + print_time_delta( + "Daemon Compile", daemon_compile_start_time, daemon_compile_end_time + ) self.wait_unsafe() if ast_or_ir != None: compile_success = True - maybe_generate_graphviz(ast_or_ir, config.pash_args, name=f'dfg-{process_id}') - - - proc_input_resources = set(map(lambda out: str(out.resource) if str( - out.resource) != "None" else out, ast_or_ir.all_input_fids())) - proc_output_resources = set(map(lambda out: str(out.resource) if str( - out.resource) != "None" else out, ast_or_ir.all_output_fids())) - - self.process_resources[process_id] = (proc_input_resources, proc_output_resources) + maybe_generate_graphviz( + ast_or_ir, config.pash_args, name=f"dfg-{process_id}" + ) + + proc_input_resources = set( + map( + lambda out: str(out.resource) + if str(out.resource) != "None" + else out, + ast_or_ir.all_input_fids(), + ) + ) + proc_output_resources = set( + map( + lambda out: str(out.resource) + if str(out.resource) != "None" + else out, + ast_or_ir.all_output_fids(), + ) + ) + + self.process_resources[process_id] = ( + proc_input_resources, + proc_output_resources, + ) run_parallel = self.check_resources_safety(process_id) if run_parallel: self.input_resources = self.input_resources.union(proc_input_resources) - self.output_resources = self.output_resources.union(proc_output_resources) + self.output_resources = self.output_resources.union( + proc_output_resources + ) - if not run_parallel: self.wait_for_all() - + if compile_success: response = server_util.success_response( - f'{process_id} {compiled_script_file} {var_file} {input_ir_file}') + f"{process_id} {compiled_script_file} {var_file} {input_ir_file}" + ) else: - response = server_util.error_response(f'{process_id} failed to compile') + response = server_util.error_response(f"{process_id} failed to compile") self.unsafe_running = True ## Do not increase the running procs if assert_compiler_success is enabled @@ -299,7 +334,9 @@ def compile_and_add(self, compiled_script_file, var_file, input_ir_file): ## Get the time before we start executing (roughly) to determine how much time this command execution will take command_exec_start_time = datetime.now() - self.process_id_input_ir_map[process_id].set_start_exec_time(command_exec_start_time) + self.process_id_input_ir_map[process_id].set_start_exec_time( + command_exec_start_time + ) return response def remove_process(self, process_id): @@ -307,8 +344,18 @@ def remove_process(self, process_id): if process_id in self.process_resources: del self.process_resources[process_id] # TODO: Should be improved to not rebuild inputs and outputs from scratch maybe use counters - self.input_resources = set().union(*[input_resources for input_resources, _ in self.process_resources.values()]) - self.output_resources = set().union(*[output_resources for _, output_resources in self.process_resources.values()]) + self.input_resources = set().union( + *[ + input_resources + for input_resources, _ in self.process_resources.values() + ] + ) + self.output_resources = set().union( + *[ + output_resources + for _, output_resources in self.process_resources.values() + ] + ) self.running_procs -= 1 if self.running_procs == 0: @@ -319,25 +366,32 @@ def get_next_id(self): return self.next_id def wait_for_all(self): - log("Waiting for all processes to finish. There are", self.running_procs, "processes remaining.") + log( + "Waiting for all processes to finish. There are", + self.running_procs, + "processes remaining.", + ) while self.running_procs > 0: input_cmd = self.get_input() # must be exit command or something is wrong - if (input_cmd.startswith("Exit:")): + if input_cmd.startswith("Exit:"): self.handle_exit(input_cmd) else: - raise Exception( - f"Command should be exit but it was {input_cmd}") + raise Exception(f"Command should be exit but it was {input_cmd}") self.unsafe_running = False def handle_exit(self, input_cmd): - assert(input_cmd.startswith("Exit:")) + assert input_cmd.startswith("Exit:") process_id = int(input_cmd.split(":")[1]) - ## Get the execution time + ## Get the execution time command_finish_exec_time = datetime.now() - command_start_exec_time = self.process_id_input_ir_map[process_id].get_start_exec_time() - exec_time = (command_finish_exec_time - command_start_exec_time) / timedelta(milliseconds=1) + command_start_exec_time = self.process_id_input_ir_map[ + process_id + ].get_start_exec_time() + exec_time = (command_finish_exec_time - command_start_exec_time) / timedelta( + milliseconds=1 + ) log("Process:", process_id, "exited. Exec time was:", exec_time) self.handle_time_measurement(process_id, exec_time) self.remove_process(process_id) @@ -347,34 +401,42 @@ def handle_exit(self, input_cmd): def wait_unsafe(self): log("Unsafe running:", self.unsafe_running) if self.unsafe_running: - assert(self.running_procs == 1) + assert self.running_procs == 1 self.wait_for_all() self.unsafe_running = False def parse_and_run_cmd(self, input_cmd): - if(input_cmd.startswith("Compile")): - compiled_script_file, var_file, input_ir_file = self.__parse_compile_command( - input_cmd) - response = self.compile_and_add(compiled_script_file, var_file, input_ir_file) + if input_cmd.startswith("Compile"): + ( + compiled_script_file, + var_file, + input_ir_file, + ) = self.__parse_compile_command(input_cmd) + response = self.compile_and_add( + compiled_script_file, var_file, input_ir_file + ) request_processing_end_time = datetime.now() - print_time_delta("Request handling", self.request_processing_start_time, request_processing_end_time) + print_time_delta( + "Request handling", + self.request_processing_start_time, + request_processing_end_time, + ) ## Send output to the specific command self.respond(response) - elif (input_cmd.startswith("Exit:")): + elif input_cmd.startswith("Exit:"): self.handle_exit(input_cmd) - elif (input_cmd.startswith("Done")): + elif input_cmd.startswith("Done"): self.wait_for_all() ## We send output to the top level pash process ## to signify that we are done. self.respond("All finished") self.done = True - elif (input_cmd.startswith("Daemon Start") or input_cmd == ""): + elif input_cmd.startswith("Daemon Start") or input_cmd == "": ## This happens when pa.sh first connects to daemon to see if it is on self.close_last_connection() else: - log(server_util.error_response(f'Error: Unsupported command: {input_cmd}')) - raise Exception(f'Error: Unsupported command: {input_cmd}') - + log(server_util.error_response(f"Error: Unsupported command: {input_cmd}")) + raise Exception(f"Error: Unsupported command: {input_cmd}") ## This method calls the reader to get an input def get_input(self): @@ -396,16 +458,20 @@ def __parse_compile_command(self, input): input_ir_file = components[2].split(":")[1] return compiled_script_file, var_file, input_ir_file except: - raise Exception(f'Parsing failure for line: {input}') + raise Exception(f"Parsing failure for line: {input}") def run(self): ## By default communicate through sockets, except if the user wants to do it through pipes - if (config.pash_args.daemon_communicates_through_unix_pipes): + if config.pash_args.daemon_communicates_through_unix_pipes: in_filename = os.getenv("RUNTIME_IN_FIFO") out_filename = os.getenv("RUNTIME_OUT_FIFO") - self.connection_manager = server_util.UnixPipeReader(in_filename, out_filename, self.reader_pipes_are_blocking) + self.connection_manager = server_util.UnixPipeReader( + in_filename, out_filename, self.reader_pipes_are_blocking + ) else: - self.connection_manager = server_util.SocketManager(os.getenv('DAEMON_SOCKET')) + self.connection_manager = server_util.SocketManager( + os.getenv("DAEMON_SOCKET") + ) while not self.done: # Process a single request input_cmd = self.get_input() @@ -413,17 +479,17 @@ def run(self): ## Parse the command (potentially also sending a response) self.parse_and_run_cmd(input_cmd) - + self.connection_manager.close() shutdown() - def shutdown(): ## There may be races since this is called through the signal handling log("PaSh daemon is shutting down...") log("PaSh daemon shut down successfully...") + def main(): args = init() if args.distributed_exec: @@ -434,7 +500,7 @@ def main(): scheduler = Scheduler() scheduler.run() - + if __name__ == "__main__": main() diff --git a/compiler/pash_compiler.py b/compiler/pash_compiler.py index 5d07f5c14..6b4e6829a 100644 --- a/compiler/pash_compiler.py +++ b/compiler/pash_compiler.py @@ -4,7 +4,9 @@ import traceback from datetime import datetime -from pash_annotations.annotation_generation.datatypes.parallelizability.AggregatorKind import AggregatorKindEnum +from pash_annotations.annotation_generation.datatypes.parallelizability.AggregatorKind import ( + AggregatorKindEnum, +) from sh_expand import env_vars_util @@ -26,10 +28,13 @@ import definitions.ir.nodes.r_unwrap as r_unwrap import definitions.ir.nodes.dgsh_tee as dgsh_tee import definitions.ir.nodes.dfs_split_reader as dfs_split_reader + # Distirbuted Exec -import dspash.hdfs_utils as hdfs_utils +import dspash.hdfs_utils as hdfs_utils runtime_config = {} + + ## We want to catch all exceptions here so that they are logged correctly ## and not just printed to the stderr. def main(): @@ -40,6 +45,7 @@ def main(): log(traceback.format_exc()) sys.exit(1) + def main_body(): global runtime_config @@ -51,7 +57,7 @@ def main_body(): if not config.config: config.load_config(args.config_path) - runtime_config = config.config['distr_planner'] + runtime_config = config.config["distr_planner"] ## Read any shell variables files if present vars_dict = env_vars_util.read_vars_file(args.var_file) @@ -61,30 +67,39 @@ def main_body(): ## Call the main procedure compiler_config = CompilerConfig(args.width) - ast_or_ir = compile_optimize_output_script(args.input_ir, args.compiled_script_file, args, compiler_config) + ast_or_ir = compile_optimize_output_script( + args.input_ir, args.compiled_script_file, args, compiler_config + ) maybe_generate_graphviz(ast_or_ir, args) def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("compiled_script_file", - help="the file in which to output the compiled script") - parser.add_argument("input_ir", - help="the file containing the dataflow graph to be optimized and executed") - parser.add_argument("--var_file", - help="determines the path of a file containing all shell variables.", - default=None) + parser.add_argument( + "compiled_script_file", help="the file in which to output the compiled script" + ) + parser.add_argument( + "input_ir", + help="the file containing the dataflow graph to be optimized and executed", + ) + parser.add_argument( + "--var_file", + help="determines the path of a file containing all shell variables.", + default=None, + ) config.add_common_arguments(parser) args, unknown_args = parser.parse_known_args() return args + ## TODO: Add more fields from args in this class CompilerConfig: def __init__(self, width): self.width = width - + def __repr__(self): - return f'CompilerConfig(Width:{self.width})' + return f"CompilerConfig(Width:{self.width})" + def compile_ir(ir_filename, compiled_script_file, args, compiler_config): """ @@ -92,61 +107,72 @@ def compile_ir(ir_filename, compiled_script_file, args, compiler_config): """ ret = None try: - ret = compile_optimize_output_script(ir_filename, compiled_script_file, args, compiler_config) + ret = compile_optimize_output_script( + ir_filename, compiled_script_file, args, compiler_config + ) except Exception as e: log("WARNING: Exception caught:", e) # traceback.print_exc() return ret -def compile_optimize_output_script(ir_filename, compiled_script_file, args, compiler_config): + +def compile_optimize_output_script( + ir_filename, compiled_script_file, args, compiler_config +): global runtime_config - + ret = None ## Load the df_region from a file candidate_df_region = load_df_region(ir_filename) - + ## Compile it - optimized_ast_or_ir = compile_optimize_df_region(candidate_df_region, args, compiler_config) + optimized_ast_or_ir = compile_optimize_df_region( + candidate_df_region, args, compiler_config + ) ## Call the backend that executes the optimized dataflow graph ## TODO: Should never be the case for now. This is obsolete. - assert(not runtime_config['distr_backend']) + assert not runtime_config["distr_backend"] ## If the candidate DF region was indeed a DF region then we have an IR ## which should be translated to a parallel script. - if(isinstance(optimized_ast_or_ir, IR)): + if isinstance(optimized_ast_or_ir, IR): if args.distributed_exec: ir_filename = ptempfile() - script_to_execute = f"$PASH_TOP/compiler/dspash/remote_exec_graph.sh {ir_filename}\n" + script_to_execute = ( + f"$PASH_TOP/compiler/dspash/remote_exec_graph.sh {ir_filename}\n" + ) ## This might not be needed anymore (since the output script is output anyway) ## TODO: This is probably useless, remove maybe_log_optimized_script(script_to_execute, args) with open(ir_filename, "wb") as f: - obj = (optimized_ast_or_ir, config.config['shell_variables']) + obj = (optimized_ast_or_ir, config.config["shell_variables"]) pickle.dump(obj, f) else: script_to_execute = to_shell(optimized_ast_or_ir, args) - + log("Optimized script saved in:", compiled_script_file) with open(compiled_script_file, "w") as f: f.write(script_to_execute) - + ret = optimized_ast_or_ir else: raise Exception("Script failed to compile!") - + return ret + def load_df_region(ir_filename): - log("Retrieving candidate DF region: {} ... ".format(ir_filename), end='') + log("Retrieving candidate DF region: {} ... ".format(ir_filename), end="") with open(ir_filename, "rb") as ir_file: candidate_df_region = pickle.load(ir_file) log("Done!") return candidate_df_region + def compile_optimize_df_region(df_region, args, compiler_config): ## Compile the candidate DF regions compilation_start_time = datetime.now() @@ -155,7 +181,7 @@ def compile_optimize_df_region(df_region, args, compiler_config): print_time_delta("Compilation", compilation_start_time, compilation_end_time) ## Optimize all the IRs that can be optimized - if(args.no_optimize): + if args.no_optimize: optimized_asts_and_irs = asts_and_irs else: optimized_asts_and_irs = optimize_irs(asts_and_irs, args, compiler_config) @@ -168,28 +194,30 @@ def compile_optimize_df_region(df_region, args, compiler_config): ## ## TODO: This might bite us with the quick-abort. ## It might complicate things having a script whose half is compiled to a graph and its other half not. - assert(len(optimized_asts_and_irs) == 1) + assert len(optimized_asts_and_irs) == 1 optimized_ast_or_ir = optimized_asts_and_irs[0] - + return optimized_ast_or_ir + def maybe_log_optimized_script(script_to_execute, args): ## TODO: Merge this write with the one below. Maybe even move this logic in `pash_runtime.sh` ## Output the optimized shell script for inspection - if(args.output_optimized): - output_script_path = runtime_config['optimized_script_filename'] + if args.output_optimized: + output_script_path = runtime_config["optimized_script_filename"] with open(output_script_path, "w") as output_script_file: log("Optimized script:") log(script_to_execute) output_script_file.write(script_to_execute) + def compile_candidate_df_region(candidate_df_region, config): ## This is for the files in the IR fileIdGen = FileIdGen() - + ## If the candidate DF region is not from the top level then ## it won't be a list and thus we need to make it into a list to compile it. - if(not isinstance(candidate_df_region, list)): + if not isinstance(candidate_df_region, list): candidate_df_region = [candidate_df_region] ## Compile the asts @@ -200,6 +228,7 @@ def compile_candidate_df_region(candidate_df_region, config): return compiled_asts + ## TODO: Switch args to compiler_config def optimize_irs(asts_and_irs, args, compiler_config): global runtime_config @@ -208,25 +237,28 @@ def optimize_irs(asts_and_irs, args, compiler_config): optimized_asts_and_irs = [] for ast_or_ir in asts_and_irs: - if(isinstance(ast_or_ir, IR)): + if isinstance(ast_or_ir, IR): ## Assert that the graph that was returned from compilation is valid - assert(ast_or_ir.valid()) + assert ast_or_ir.valid() # log(ir_node) # with cProfile.Profile() as pr: - distributed_graph = choose_and_apply_parallelizing_transformations(ast_or_ir, compiler_config.width, - runtime_config['batch_size'], - args.r_split_batch_size) + distributed_graph = choose_and_apply_parallelizing_transformations( + ast_or_ir, + compiler_config.width, + runtime_config["batch_size"], + args.r_split_batch_size, + ) # pr.print_stats() # Eagers are added in remote notes when using distributed exec - if(not args.no_eager and not args.distributed_exec): + if not args.no_eager and not args.distributed_exec: eager_distributed_graph = add_eager_nodes(distributed_graph) else: eager_distributed_graph = distributed_graph ## Assert that the graph stayed valid after all transformations - assert(eager_distributed_graph.valid()) + assert eager_distributed_graph.valid() ## Print statistics of output nodes print_graph_statistics(eager_distributed_graph) @@ -249,30 +281,37 @@ def print_graph_statistics(graph): log("Eager nodes:", len(eager_nodes)) -def choose_and_apply_parallelizing_transformations(graph, fan_out, batch_size, r_split_batch_size): +def choose_and_apply_parallelizing_transformations( + graph, fan_out, batch_size, r_split_batch_size +): parallelizer_map = choose_parallelizing_transformations(graph) - apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, - r_split_batch_size) + apply_parallelizing_transformations( + graph, parallelizer_map, fan_out, batch_size, r_split_batch_size + ) return graph -def choose_parallelizing_transformations(graph): # shall return map +def choose_parallelizing_transformations(graph): # shall return map source_node_ids = graph.source_nodes() parallelizer_map = {} workset = source_node_ids visited = set() # We apply a modified BFS such that we ensure that we know which parallelizer was chosen for all previous nodes # and assume that the decision for any subsequent node will exploit any potential synergy effects - while (len(workset) > 0): + while len(workset) > 0: curr_id = workset.pop(0) - assert(isinstance(curr_id, int)) - all_previous_nodes_visited = all(prev in visited for prev in graph.get_previous_nodes(curr_id)) + assert isinstance(curr_id, int) + all_previous_nodes_visited = all( + prev in visited for prev in graph.get_previous_nodes(curr_id) + ) if not all_previous_nodes_visited: workset.append(curr_id) elif not curr_id in visited: next_node_ids = graph.get_next_nodes(curr_id) workset += next_node_ids - parallelizer_map[curr_id] = choose_parallelizing_transformation(curr_id, graph) + parallelizer_map[curr_id] = choose_parallelizing_transformation( + curr_id, graph + ) visited.add(curr_id) return parallelizer_map @@ -281,29 +320,41 @@ def choose_parallelizing_transformations(graph): # shall return map ## 1. The round robin ## 2. The round robin after having performed unwrap (not sure why this is the second priority) ## 3. The consecutive chunks -## -## TODO: In the future, we could develop more complex strategies -def choose_parallelizing_transformation(curr_id, graph): # shall return map entry +## +## TODO: In the future, we could develop more complex strategies +def choose_parallelizing_transformation(curr_id, graph): # shall return map entry curr = graph.get_node(curr_id) - list_all_parallelizers_in_priority = [curr.get_option_implemented_round_robin_parallelizer(), - curr.get_option_implemented_round_robin_with_unwrap_parallelizer(), - curr.get_option_implemented_consecutive_chunks_parallelizer()] - return next((item for item in list_all_parallelizers_in_priority if item is not None), None) - - -def apply_parallelizing_transformations(graph, parallelizer_map, fan_out, batch_size, r_split_batch_size): + list_all_parallelizers_in_priority = [ + curr.get_option_implemented_round_robin_parallelizer(), + curr.get_option_implemented_round_robin_with_unwrap_parallelizer(), + curr.get_option_implemented_consecutive_chunks_parallelizer(), + ] + return next( + (item for item in list_all_parallelizers_in_priority if item is not None), None + ) + + +def apply_parallelizing_transformations( + graph, parallelizer_map, fan_out, batch_size, r_split_batch_size +): fileIdGen = graph.get_file_id_gen() - node_id_non_none_parallelizer_list = [(node_id, parallelizer) for (node_id, parallelizer) in parallelizer_map.items() - if parallelizer is not None] - for (node_id, parallelizer) in node_id_non_none_parallelizer_list: - graph.apply_parallelization_to_node(node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size) + node_id_non_none_parallelizer_list = [ + (node_id, parallelizer) + for (node_id, parallelizer) in parallelizer_map.items() + if parallelizer is not None + ] + for node_id, parallelizer in node_id_non_none_parallelizer_list: + graph.apply_parallelization_to_node( + node_id, parallelizer, fileIdGen, fan_out, r_split_batch_size + ) + def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): """ Replaces hdfs cat with a cat per block, each cat uses has an HDFSResource input fid Returns: A normal Cat that merges the blocks (will be removed when parallizing next_node) """ - assert(isinstance(hdfs_cat, HDFSCat)) + assert isinstance(hdfs_cat, HDFSCat) ## At the moment this only works for nodes that have one standard input. if len(next_node.get_standard_inputs()) != 1: @@ -316,9 +367,11 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): # Create a cat command per file block file_config = hdfs_utils.get_file_config(hdfs_filepath) - dummy_config_path = ptempfile() # Dummy config file, should be updated by workers + dummy_config_path = ptempfile() # Dummy config file, should be updated by workers for split_num, block in enumerate(file_config.blocks): - resource = DFSSplitResource(file_config.dumps(), dummy_config_path, split_num, block.hosts) + resource = DFSSplitResource( + file_config.dumps(), dummy_config_path, split_num, block.hosts + ) block_fid = fileIdGen.next_file_id() block_fid.set_resource(resource) graph.add_edge(block_fid) @@ -328,7 +381,12 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): output_ids.append(output_fid.get_ident()) graph.add_edge(output_fid) - split_reader_node = dfs_split_reader.make_dfs_split_reader_node([block_fid.get_ident()], output_fid.get_ident(), split_num, config.HDFS_PREFIX) + split_reader_node = dfs_split_reader.make_dfs_split_reader_node( + [block_fid.get_ident()], + output_fid.get_ident(), + split_num, + config.HDFS_PREFIX, + ) graph.add_node(split_reader_node) # Remove the HDFS Cat command as it's not used anymore @@ -342,7 +400,6 @@ def split_hdfs_cat_input(hdfs_cat, next_node, graph, fileIdGen): return new_merger - ## This functions adds an eager on a given edge. def add_eager(eager_input_id, graph, fileIdGen): new_fid = fileIdGen.next_ephemeral_file_id() @@ -356,7 +413,7 @@ def add_eager(eager_input_id, graph, fileIdGen): ## Modify the next node inputs to be the new inputs next_node_id = graph.edges[eager_input_id][2] - if(not next_node_id is None): + if not next_node_id is None: next_node = graph.get_node(next_node_id) next_node.replace_edge(eager_input_id, new_id) graph.set_edge_to(new_id, next_node_id) @@ -373,12 +430,16 @@ def add_eager_nodes(graph): fileIdGen = graph.get_file_id_gen() ## Get the next nodes - workset = [node for source_node_id in source_node_ids for node in graph.get_next_nodes(source_node_id)] + workset = [ + node + for source_node_id in source_node_ids + for node in graph.get_next_nodes(source_node_id) + ] visited = set() - while (len(workset) > 0): + while len(workset) > 0: curr_id = workset.pop(0) curr = graph.get_node(curr_id) - if (not curr_id in visited): + if not curr_id in visited: visited.add(curr_id) next_node_ids = graph.get_next_nodes(curr_id) workset += next_node_ids @@ -387,7 +448,7 @@ def add_eager_nodes(graph): ## Add eager nodes if the node has more than one input curr_input_ids = graph.get_node_input_ids(curr_id) - if (len(curr_input_ids) > 1): + if len(curr_input_ids) > 1: ## TODO: If we know that a command reads its inputs in a list, ## then we might not need to put an eager on its first input. ## Note: This cannot be done for `sort -m` so we need to know in the @@ -395,23 +456,23 @@ def add_eager_nodes(graph): for curr_input_id in curr_input_ids: _fid, from_node, to_node = graph.edges[curr_input_id] - assert(to_node == curr_id) + assert to_node == curr_id ## If the edge is an input edge, then we don't want to put eager. - if(not from_node is None): + if not from_node is None: add_eager(curr_input_id, graph, fileIdGen) - if(isinstance(curr, Split)): + if isinstance(curr, Split): eager_input_ids = curr.get_output_list()[:-1] for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen) - ## Add an eager after r_unwrap - if(isinstance(curr, r_unwrap.RUnwrap)): + ## Add an eager after r_unwrap + if isinstance(curr, r_unwrap.RUnwrap): eager_input_id = curr.get_output_list()[0] add_eager(eager_input_id, graph, fileIdGen) ## Add an eager after r_split - if(isinstance(curr, r_split.RSplit)): + if isinstance(curr, r_split.RSplit): eager_input_ids = curr.get_output_list() for edge_id in eager_input_ids: add_eager(edge_id, graph, fileIdGen) diff --git a/compiler/pash_graphviz.py b/compiler/pash_graphviz.py index 425a00df4..70ad53909 100644 --- a/compiler/pash_graphviz.py +++ b/compiler/pash_graphviz.py @@ -1,34 +1,37 @@ - import os from ir import * from util import * ## Ensure that PASH_TMP_PREFIX is set by pa.sh -assert(not os.getenv('PASH_TIMESTAMP') is None) -PASH_TIMESTAMP = os.getenv('PASH_TIMESTAMP') -DIR_NAME = f'pash_graphviz_{PASH_TIMESTAMP}' +assert not os.getenv("PASH_TIMESTAMP") is None +PASH_TIMESTAMP = os.getenv("PASH_TIMESTAMP") +DIR_NAME = f"pash_graphviz_{PASH_TIMESTAMP}" + def maybe_init_graphviz_dir(args): if not args.graphviz == "no": init_graphviz_dir(args) + def init_graphviz_dir(args): graphviz_dir_path = os.path.join(args.graphviz_dir, DIR_NAME) try: os.mkdir(graphviz_dir_path) except: - print(f'Error: Graphviz dir:{graphviz_dir_path} could not be created!') + print(f"Error: Graphviz dir:{graphviz_dir_path} could not be created!") exit(1) - + log("Created graphviz dir:", graphviz_dir_path) -def maybe_generate_graphviz(ir: IR, args, name='dfg'): + +def maybe_generate_graphviz(ir: IR, args, name="dfg"): if not args.graphviz == "no": generate_graphviz(ir, args, name=name) -def generate_graphviz(ir: IR, args, name='dfg'): + +def generate_graphviz(ir: IR, args, name="dfg"): ## TODO: It is unclear if importing in here (instead of in general) ## improves startup cost of the pash_runtime when not using graphviz. import graphviz diff --git a/compiler/preprocessor/preprocessor.py b/compiler/preprocessor/preprocessor.py index e13e21ea6..11139e17b 100644 --- a/compiler/preprocessor/preprocessor.py +++ b/compiler/preprocessor/preprocessor.py @@ -12,34 +12,49 @@ LOGGING_PREFIX = "PaSh Preprocessor: " + @logging_prefix(LOGGING_PREFIX) def preprocess(input_script_path, args): ## 1. Execute the POSIX shell parser that returns the AST in JSON preprocessing_parsing_start_time = datetime.now() ast_objects = parse_shell_to_asts(input_script_path) preprocessing_parsing_end_time = datetime.now() - print_time_delta("Preprocessing -- Parsing", preprocessing_parsing_start_time, preprocessing_parsing_end_time) + print_time_delta( + "Preprocessing -- Parsing", + preprocessing_parsing_start_time, + preprocessing_parsing_end_time, + ) ## 2. Preprocess ASTs by replacing possible candidates for compilation ## with calls to the PaSh runtime. preprocessing_pash_start_time = datetime.now() preprocessed_asts = preprocess_asts(ast_objects, args) preprocessing_pash_end_time = datetime.now() - print_time_delta("Preprocessing -- PaSh", preprocessing_pash_start_time, preprocessing_pash_end_time) + print_time_delta( + "Preprocessing -- PaSh", + preprocessing_pash_start_time, + preprocessing_pash_end_time, + ) ## 3. Translate the new AST back to shell syntax preprocessing_unparsing_start_time = datetime.now() preprocessed_shell_script = from_ast_objects_to_shell(preprocessed_asts) preprocessing_unparsing_end_time = datetime.now() - print_time_delta("Preprocessing -- Unparsing", preprocessing_unparsing_start_time, preprocessing_unparsing_end_time) + print_time_delta( + "Preprocessing -- Unparsing", + preprocessing_unparsing_start_time, + preprocessing_unparsing_end_time, + ) return preprocessed_shell_script def preprocess_asts(ast_objects, args): trans_mode = transformation_options.TransformationType(args.preprocess_mode) if trans_mode is transformation_options.TransformationType.SPECULATIVE: - trans_options = transformation_options.SpeculativeTransformationState(po_file=args.partial_order_file) + trans_options = transformation_options.SpeculativeTransformationState( + po_file=args.partial_order_file + ) util_spec.initialize(trans_options) elif trans_mode is transformation_options.TransformationType.AIRFLOW: trans_options = transformation_options.AirflowTransformationState() @@ -59,11 +74,14 @@ def preprocess_asts(ast_objects, args): ## Then inform the scheduler that it can read it unix_socket_file = os.getenv("PASH_SPEC_SCHEDULER_SOCKET") - msg = util_spec.scheduler_server_init_po_msg(trans_options.get_partial_order_file()) + msg = util_spec.scheduler_server_init_po_msg( + trans_options.get_partial_order_file() + ) server_util.unix_socket_send_and_forget(unix_socket_file, msg) return preprocessed_asts + ## ## This is the command line interface for the preprocessor ## @@ -71,21 +89,28 @@ def main(): parser = argparse.ArgumentParser() config.add_general_config_arguments(parser) - subparsers = parser.add_subparsers(help='sub-command help') + subparsers = parser.add_subparsers(help="sub-command help") # create the parser for the "a" command - parser_pash = subparsers.add_parser('pash', help='Preprocess the script so that it can be run with PaSh') + parser_pash = subparsers.add_parser( + "pash", help="Preprocess the script so that it can be run with PaSh" + ) config.add_common_arguments(parser_pash) parser_pash.add_argument("input", help="the script to be preprocessed") - parser_pash.set_defaults(preprocess_mode='pash') + parser_pash.set_defaults(preprocess_mode="pash") # create the parser for the "b" command - parser_spec = subparsers.add_parser('spec', help='Preprocess the script so that it can be run with speculation') + parser_spec = subparsers.add_parser( + "spec", help="Preprocess the script so that it can be run with speculation" + ) parser_spec.add_argument("input", help="the script to be preprocessed") ## TODO: When we better integrate, this should be automatically set. - parser_spec.add_argument("partial_order_file", help="the file to store the partial order (currently just a sequence)") - parser_spec.set_defaults(preprocess_mode='spec') + parser_spec.add_argument( + "partial_order_file", + help="the file to store the partial order (currently just a sequence)", + ) + parser_spec.set_defaults(preprocess_mode="spec") args = parser.parse_args() config.set_config_globals_from_pash_args(args) @@ -99,5 +124,6 @@ def main(): preprocessed_shell_script = preprocess(args.input, args) print(preprocessed_shell_script) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/compiler/server_util.py b/compiler/server_util.py index 0bee98d3e..c50db3a50 100644 --- a/compiler/server_util.py +++ b/compiler/server_util.py @@ -4,15 +4,17 @@ import config from util import log + def success_response(string): - return f'OK: {string}\n' + return f"OK: {string}\n" def error_response(string): - return f'ERROR: {string}\n' + return f"ERROR: {string}\n" + class UnixPipeReader: - def __init__(self, in_filename, out_filename, blocking = True): + def __init__(self, in_filename, out_filename, blocking=True): self.in_filename = in_filename self.out_filename = out_filename self.buffer = "" @@ -35,7 +37,6 @@ def get_next_cmd(self): cmd = self.get_next_cmd_aux() return cmd - def get_next_cmd_aux(self): """ This method return depends on the reading mode. In blocking mode this method will @@ -46,13 +47,15 @@ def get_next_cmd_aux(self): input_buffer = "" if self.buffer: # Don't wait on fin if cmd buffer isn't empty - log("Reader buffer isn't empty. Using it instead of reading new data for the next command") + log( + "Reader buffer isn't empty. Using it instead of reading new data for the next command" + ) input_buffer = self.buffer else: log("Reader buffer is empty. Reading new data from input fifo") if self.blocking: with open(self.in_filename) as fin: - # This seems to be necessary for reading the full data. + # This seems to be necessary for reading the full data. # It seems like slower/smaller machines might not read the full data in one read while True: data = fin.read() @@ -64,7 +67,7 @@ def get_next_cmd_aux(self): log("Input buffer:", input_buffer) if "\n" in input_buffer: - cmd, rest = input_buffer.split("\n", 1) # split on the first \n only + cmd, rest = input_buffer.split("\n", 1) # split on the first \n only self.buffer = rest else: cmd = input_buffer @@ -83,7 +86,6 @@ def respond(self, message): fout.flush() fout.close() - ## This method doesn't do anything for unix pipe reader since we always read and write ## to and from the same fifos def close_last_connection(self): @@ -99,18 +101,16 @@ def unix_socket_send_and_forget(socket_file: str, msg: str): try: sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) sock.connect(socket_file) - msg_with_newline = msg + '\n' - byte_msg = msg_with_newline.encode('utf-8') + msg_with_newline = msg + "\n" + byte_msg = msg_with_newline.encode("utf-8") sock.sendall(byte_msg) data = sock.recv(config.SOCKET_BUF_SIZE) - str_data = data.decode('utf-8') + str_data = data.decode("utf-8") ## There should be no response on these messages - assert(len(str_data) == 0) + assert len(str_data) == 0 finally: log("Sent message:", msg, "to server.", level=1) sock.close() - - ## TODO: Instead of this, think of using a standard SocketServer @@ -137,28 +137,27 @@ def __init__(self, socket_addr: str): log("SocketManager: Created socket") self.sock.bind(server_address) - log("SocketManager: Successfully bound to socket") + log("SocketManager: Successfully bound to socket") ## TODO: Check if we need to configure the backlog - self.sock.listen() - log("SocketManager: Listenting on socket") + self.sock.listen() + log("SocketManager: Listenting on socket") ## Connection stack self.connections = [] - def get_next_cmd(self): connection, client_address = self.sock.accept() data = connection.recv(self.buf_size) ## TODO: This could be avoided for efficiency - str_data = data.decode('utf-8') + str_data = data.decode("utf-8") log("Received data:", str_data) ## TODO: Lift this requirement if needed ## ## We need to ensure that we read a command at once or the command was empty (only relevant in the first invocation) - assert(str_data.endswith("\n") or str_data == "") - + assert str_data.endswith("\n") or str_data == "" + self.connections.append(connection) return str_data @@ -166,7 +165,7 @@ def get_next_cmd(self): ## In the case of the UnixPipes, we don't have any state management here ## since all reads/writes go to/from the same fifos def respond(self, message): - bytes_message = message.encode('utf-8') + bytes_message = message.encode("utf-8") self.connections[-1].sendall(bytes_message) self.close_last_connection() diff --git a/compiler/shell_ast/ast_util.py b/compiler/shell_ast/ast_util.py index 3abc5ddbb..c1f09ab2d 100644 --- a/compiler/shell_ast/ast_util.py +++ b/compiler/shell_ast/ast_util.py @@ -6,8 +6,10 @@ ## This class is used by the preprocessor in ast_to_ir class PreprocessedAST: - def __init__(self, ast, replace_whole, non_maximal, something_replaced=True, last_ast=False): - assert(isinstance(ast, AstNode)) + def __init__( + self, ast, replace_whole, non_maximal, something_replaced=True, last_ast=False + ): + assert isinstance(ast, AstNode) self.ast = ast self.replace_whole = replace_whole self.non_maximal = non_maximal @@ -26,6 +28,7 @@ def will_anything_be_replaced(self): def is_last_ast(self): return self.last_ast + ## This class represents text that was not modified at all by preprocessing, and therefore does not ## need to be unparsed. class UnparsedScript: @@ -37,99 +40,123 @@ def __init__(self, text): ## Pattern matching for the AST ## + def check_if_ast_is_supported(construct, arguments, **kwargs): return + def format_args(args): formatted_args = [format_arg_chars(arg_chars) for arg_chars in args] return formatted_args + def format_arg_chars(arg_chars): chars = [format_arg_char(arg_char) for arg_char in arg_chars] return "".join(chars) + def format_arg_char(arg_char: ArgChar) -> str: return arg_char.format() + def string_to_carg_char_list(string: str) -> "list[CArgChar]": ret = [CArgChar(ord(char)) for char in string] return ret + def string_to_arguments(string): return [string_to_argument(word) for word in string.split(" ")] + def string_to_argument(string): ret = [char_to_arg_char(char) for char in string] return ret + def concat_arguments(arg1, arg2): ## Arguments are simply `arg_char list` and therefore can just be concatenated return arg1 + arg2 + ## FIXME: This is certainly not complete. It is used to generate the ## AST for the call to the distributed planner. It only handles simple ## characters def char_to_arg_char(char): - return ['C' , ord(char)] + return ["C", ord(char)] + def escaped_char(char): - return ['E' , ord(char)] + return ["E", ord(char)] + def standard_var_ast(string): return make_kv("V", ["Normal", False, string, []]) + def make_arith(arg): - return make_kv("A", arg) + return make_kv("A", arg) + def make_quoted_variable(string): return make_kv("Q", [standard_var_ast(string)]) + def quote_arg(arg): return make_kv("Q", arg) + def redir_append_stderr_to_string_file(string): - return make_kv("File",["Append",2,string_to_argument(string)]) + return make_kv("File", ["Append", 2, string_to_argument(string)]) + def redir_stdout_to_file(arg): - return make_kv("File",["To", 1, arg]) + return make_kv("File", ["To", 1, arg]) + def redir_file_to_stdin(arg): - return make_kv("File",["From", 0, arg]) + return make_kv("File", ["From", 0, arg]) + def make_background(body, redirections=[]): lineno = 0 node = make_kv("Background", [lineno, body, redirections]) return node + def make_backquote(node): node = make_kv("B", node) return node + def make_subshell(body, redirections=[]): lineno = 0 node = make_kv("Subshell", [lineno, body, redirections]) return node + def make_command(arguments, redirections=[], assignments=[]): lineno = 0 node = make_kv("Command", [lineno, assignments, arguments, redirections]) return node + def make_nop(): return make_command([string_to_argument(":")]) + def make_assignment(var, value): lineno = 0 - assignment=(var, value) - assignments=[assignment] + assignment = (var, value) + assignments = [assignment] node = make_kv("Command", [lineno, assignments, [], []]) return node + def make_semi_sequence(asts): - if(len(asts) == 0): + if len(asts) == 0: return make_nop() - if(len(asts) == 1): + if len(asts) == 1: return asts[0] else: acc = asts[-1] @@ -139,35 +166,41 @@ def make_semi_sequence(asts): acc = make_kv("Semi", [ast, acc]) return acc + def make_defun(name, body): lineno = 0 node = make_kv("Defun", [lineno, name, body]) return node + ## ## Make some nodes ## + def make_export_var_constant_string(var_name: str, value: str): node = make_export_var(var_name, string_to_argument(value)) return node + def make_export_var(var_name: str, arg_char_list): ## An argument is an arg_char_list - arg1 = string_to_argument(f'{var_name}=') - arguments = [string_to_argument("export"), - concat_arguments(arg1, arg_char_list)] + arg1 = string_to_argument(f"{var_name}=") + arguments = [string_to_argument("export"), concat_arguments(arg1, arg_char_list)] ## Pass all relevant argument to the planner node = make_command(arguments) return node + def export_pash_loop_iters_for_current_context(all_loop_ids: "list[int]"): if len(all_loop_ids) > 0: iter_var_names = [loop_iter_var(loop_id) for loop_id in all_loop_ids] - iter_vars = [standard_var_ast(iter_var_name) for iter_var_name in iter_var_names] + iter_vars = [ + standard_var_ast(iter_var_name) for iter_var_name in iter_var_names + ] concatted_vars = [iter_vars[0]] for iter_var in iter_vars[1:]: - concatted_vars.append(char_to_arg_char('-')) + concatted_vars.append(char_to_arg_char("-")) concatted_vars.append(iter_var) quoted_vars = [quote_arg(concatted_vars)] else: @@ -181,46 +214,46 @@ def export_pash_loop_iters_for_current_context(all_loop_ids: "list[int]"): def make_unset_var(var_name: str): ## An argument is an arg_char_list - arguments = [string_to_argument("unset"), - string_to_argument(var_name)] + arguments = [string_to_argument("unset"), string_to_argument(var_name)] ## Pass all relevant argument to the planner node = make_command(arguments) return node + def make_increment_var(var_name: str): - arg = string_to_argument(f'{var_name}+1') + arg = string_to_argument(f"{var_name}+1") arith_expr = make_arith(arg) - assignments = [[var_name, - [arith_expr]]] + assignments = [[var_name, [arith_expr]]] node = make_command([], assignments=assignments) return node + def make_echo_ast(argument, var_file_path): nodes = [] ## Source variables if present - if(not var_file_path is None): + if not var_file_path is None: arguments = [string_to_argument("source"), string_to_argument(var_file_path)] line_number = 0 - node = make_kv('Command', [line_number, [], arguments, []]) + node = make_kv("Command", [line_number, [], arguments, []]) nodes.append(node) ## Reset the exit status - variable_arg = make_kv('V', ['Normal', "false", 'pash_previous_exit_status', []]) + variable_arg = make_kv("V", ["Normal", "false", "pash_previous_exit_status", []]) arguments = [string_to_argument("exit"), [variable_arg]] - exit_node = make_kv('Command', [0, [], arguments, []]) - node = make_kv('Subshell', [0, exit_node, []]) + exit_node = make_kv("Command", [0, [], arguments, []]) + node = make_kv("Subshell", [0, exit_node, []]) nodes.append(node) ## Reset the input arguments - variable_arg = make_kv('V', ['Normal', "false", 'pash_input_args', []]) + variable_arg = make_kv("V", ["Normal", "false", "pash_input_args", []]) arguments = [string_to_argument("set"), string_to_argument("--"), [variable_arg]] - set_node = make_kv('Command', [0, [], arguments, []]) + set_node = make_kv("Command", [0, [], arguments, []]) nodes.append(set_node) arguments = [string_to_argument("echo"), string_to_argument("-n"), argument] line_number = 0 - node = make_kv('Command', [line_number, [], arguments, []]) + node = make_kv("Command", [line_number, [], arguments, []]) nodes.append(node) return nodes diff --git a/compiler/speculative/util_spec.py b/compiler/speculative/util_spec.py index 7783832fe..c117e4c6f 100644 --- a/compiler/speculative/util_spec.py +++ b/compiler/speculative/util_spec.py @@ -1,4 +1,3 @@ - import os import config @@ -8,6 +7,7 @@ ## This file contains utility functions useful for the speculative execution component ## + def initialize(trans_options) -> None: ## Make the directory that contains the files in the partial order dir_path = partial_order_directory() @@ -15,24 +15,31 @@ def initialize(trans_options) -> None: # ## Initialize the po file # initialize_po_file(trans_options, dir_path) + def partial_order_directory() -> str: - return f'{config.PASH_TMP_PREFIX}/speculative/partial_order/' + return f"{config.PASH_TMP_PREFIX}/speculative/partial_order/" + def partial_order_file_path(): - return f'{config.PASH_TMP_PREFIX}/speculative/partial_order_file' + return f"{config.PASH_TMP_PREFIX}/speculative/partial_order_file" + def initialize_po_file(trans_options, dir_path) -> None: ## Initializae the partial order file - with open(trans_options.get_partial_order_file(), 'w') as f: - f.write(f'# Partial order files path:\n') - f.write(f'{dir_path}\n') + with open(trans_options.get_partial_order_file(), "w") as f: + f.write(f"# Partial order files path:\n") + f.write(f"{dir_path}\n") + def scheduler_server_init_po_msg(partial_order_file: str) -> str: - return f'Init:{partial_order_file}' + return f"Init:{partial_order_file}" + ## TODO: To support partial orders, we need to pass some more context here, ## i.e., the connections of this node. Now it assumes we have a sequence. -def save_df_region(text_to_output: str, trans_options, df_region_id: int, predecessor_ids: int) -> None: +def save_df_region( + text_to_output: str, trans_options, df_region_id: int, predecessor_ids: int +) -> None: ## To support loops we also need to associate nodes with their surrounding loops current_loop_context = trans_options.get_current_loop_context() log("Df region:", df_region_id, "loop context:", current_loop_context) @@ -41,7 +48,7 @@ def save_df_region(text_to_output: str, trans_options, df_region_id: int, predec trans_options.add_node_loop_context(df_region_id, current_loop_context) # Save df_region as text in its own file - df_region_path = f'{partial_order_directory()}/{df_region_id}' + df_region_path = f"{partial_order_directory()}/{df_region_id}" with open(df_region_path, "w") as f: f.write(text_to_output) @@ -50,21 +57,24 @@ def save_df_region(text_to_output: str, trans_options, df_region_id: int, predec trans_options.add_edge(predecessor, df_region_id) - ## TODO: Figure out a way to put all serialization/deserialization of messages ## and parsing/unparsing in a specific module. + ## TODO: Move serialization to a partial_order_file.py def serialize_edge(from_id: int, to_id: int) -> str: - return f'{from_id} -> {to_id}\n' + return f"{from_id} -> {to_id}\n" + def serialize_number_of_nodes(number_of_ids: int) -> str: - return f'{number_of_ids}\n' + return f"{number_of_ids}\n" + def serialize_loop_context(node_id: int, loop_contexts) -> str: ## Galaxy brain serialization loop_contexts_str = ",".join([str(loop_ctx) for loop_ctx in loop_contexts]) - return f'{node_id}-loop_ctx-{loop_contexts_str}\n' + return f"{node_id}-loop_ctx-{loop_contexts_str}\n" + ## TODO: Eventually we might want to retrieve the number_of_ids from trans_options def save_number_of_nodes(trans_options): @@ -73,6 +83,7 @@ def save_number_of_nodes(trans_options): with open(partial_order_file_path, "a") as po_file: po_file.write(serialize_number_of_nodes(number_of_ids)) + def save_loop_contexts(trans_options): loop_context_dict = trans_options.get_all_loop_contexts() log("Loop context dict:", loop_context_dict) @@ -82,6 +93,7 @@ def save_loop_contexts(trans_options): loop_ctx = loop_context_dict[node_id] po_file.write(serialize_loop_context(node_id, loop_ctx)) + def serialize_partial_order(trans_options): ## Initialize the po file dir_path = partial_order_directory() diff --git a/compiler/util.py b/compiler/util.py index 2c131e0f7..4406a6dcb 100644 --- a/compiler/util.py +++ b/compiler/util.py @@ -2,30 +2,34 @@ import functools import logging from typing import Optional, TypeVar, Union, List, Any + TType = TypeVar("TType") import os import sys import config import tempfile + def flatten_list(lst): return [item for sublist in lst for item in sublist] + def unzip(lst): - res = [[ i for i, j in lst ], - [ j for i, j in lst ]] + res = [[i for i, j in lst], [j for i, j in lst]] return res + def pad(lst, index): - if(index >= len(lst)): + if index >= len(lst): lst += [None] * (index + 1 - len(lst)) return lst + def print_time_delta(prefix, start_time, end_time): ## Always output time in the log. time_difference = (end_time - start_time) / timedelta(milliseconds=1) ## If output_time flag is set, log the time - if (config.OUTPUT_TIME): + if config.OUTPUT_TIME: log("{} time:".format(prefix), time_difference, " ms", level=0) else: log("{} time:".format(prefix), time_difference, " ms") @@ -41,17 +45,21 @@ def wrapper(*args, **kwargs): result = func(*args, **kwargs) config.LOGGING_PREFIX = old_prefix return result + return wrapper + return decorator + ## This is a wrapper for prints -def log(*args, end='\n', level=1): +def log(*args, end="\n", level=1): ## If the debug logging level is at least ## as high as this log message. ## TODO: Allow all levels if level >= 1: concatted_args = " ".join([str(a) for a in list(args)]) - logging.info(f'{config.LOGGING_PREFIX} {concatted_args}') + logging.info(f"{config.LOGGING_PREFIX} {concatted_args}") + def ptempfile(): fd, name = tempfile.mkstemp(dir=config.PASH_TMP_PREFIX) @@ -59,21 +67,27 @@ def ptempfile(): os.close(fd) return name -def return_empty_list_if_none_else_itself(arg: Optional[TType]) -> Union[TType, List[Any]]: #list always empty + +def return_empty_list_if_none_else_itself( + arg: Optional[TType], +) -> Union[TType, List[Any]]: # list always empty if arg is None: return [] else: return arg + def return_default_if_none_else_itself(arg: Optional[TType], default: TType) -> TType: if arg is None: return default else: return arg + ## This function gets a key and a value from the ast json format def get_kv(dic): return (dic[0], dic[1]) + def make_kv(key, val): return [key, val] diff --git a/scripts/test_eval/logparser.py b/scripts/test_eval/logparser.py index ea3a82872..a48e78cf1 100644 --- a/scripts/test_eval/logparser.py +++ b/scripts/test_eval/logparser.py @@ -6,19 +6,20 @@ DEFAULT_LOG_FOLDER = "tmp_log/" + class LogParser: """ A class used to parse the pa.sh log files - All parse_* methods return a dataframe of only the files parsed in this call. + All parse_* methods return a dataframe of only the files parsed in this call. Use get_df for all parsed files across multible calls to parse_*. Methods: parse_file: parses a log file parse_folder: parses log files in a folder parse_log: parses a given log string - get_df: returns a comprehensive dataframe of every - log parsed (using any of the functions above) + get_df: returns a comprehensive dataframe of every + log parsed (using any of the functions above) during the function lifetime. Dataframe columns: @@ -48,24 +49,44 @@ class LogParser: def __init__(self, df=None): self.df = df if df else pd.DataFrame() - - def parse_log(self, log: str)->pd.DataFrame: + + def parse_log(self, log: str) -> pd.DataFrame: """ Parses a pa.sh log with path file_path Return: A single entry pandas dataframe, or None if failed """ - - border = "-"*40 + + border = "-" * 40 argslog, pashlog, timelog = log.split(border) - args_of_interest = set(["input", "width", "output_time", "no_eager", "r_split", "r_split_batch_size", "IN", "dgsh_tee"]) + args_of_interest = set( + [ + "input", + "width", + "output_time", + "no_eager", + "r_split", + "r_split_batch_size", + "IN", + "dgsh_tee", + ] + ) parsed_args = self.__parse_args__(argslog, args_of_interest) - tags_of_interest = set(["Execution time", "Backend time", "Compilation time", "Preprocessing time", "Eager nodes", "Compiler exited with code"]) + tags_of_interest = set( + [ + "Execution time", + "Backend time", + "Compilation time", + "Preprocessing time", + "Eager nodes", + "Compiler exited with code", + ] + ) parsed_log = self.__parse_pash_log__(pashlog, tags_of_interest) - #can be empty + # can be empty parsed_time = self.__parse_time_log__(timelog) if not parsed_args["input"]: @@ -77,23 +98,23 @@ def parse_log(self, log: str)->pd.DataFrame: split_type = "r-split" if parsed_args["r_split"] else "auto-split" data = { - #From Args - "test_name" : test_name, + # From Args + "test_name": test_name, "IN": os.path.basename(parsed_args["IN"]), - "split_type" : split_type, - "no_eager" : parsed_args["no_eager"], + "split_type": split_type, + "no_eager": parsed_args["no_eager"], "width": int(parsed_args["width"]), "r_split_batch_size": int(parsed_args["r_split_batch_size"]), "dgsh_tee": parsed_args["dgsh_tee"], - #From pash log + # From pash log "exec_time": parsed_log["Execution time"], "backend_time": parsed_log["Backend time"], "compilation_time": parsed_log["Compilation time"], "preprocess_time": parsed_log["Preprocessing time"], "eager_nodes": int(parsed_log["Eager nodes"]), - "compiler_exit" : parsed_log["Compiler exited with code"], - #From time - "gnu_real": parsed_time["gnu_real"], + "compiler_exit": parsed_log["Compiler exited with code"], + # From time + "gnu_real": parsed_time["gnu_real"], "gnu_usr": parsed_time["user"], "gnu_sys": parsed_time["sys"], "cpu%": parsed_time["cpu%"], @@ -103,12 +124,13 @@ def parse_log(self, log: str)->pd.DataFrame: "minor_pagefaults": int(parsed_time["minor_pagefaults"]), } - #update local and global df + # update local and global df df = df.append(data, ignore_index=True) self.df = self.df.append(data, ignore_index=True) return df - def parse_file(self, log_file: str)->pd.DataFrame: + + def parse_file(self, log_file: str) -> pd.DataFrame: """ Parses a pa.sh log with path file_path Return: @@ -120,11 +142,10 @@ def parse_file(self, log_file: str)->pd.DataFrame: df = self.parse_log(log) return df except: - print("failed to parse", log_file) - return pd.DataFrame() - + print("failed to parse", log_file) + return pd.DataFrame() - def parse_folder(self, path: str)->pd.DataFrame: + def parse_folder(self, path: str) -> pd.DataFrame: """ Parses all valid files ending with .log in the path directory. Params: @@ -132,12 +153,14 @@ def parse_folder(self, path: str)->pd.DataFrame: Return: pandas dataframe with all parsed logs """ - log_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith(".log")] + log_files = [ + os.path.join(path, f) for f in os.listdir(path) if f.endswith(".log") + ] ret_df = pd.DataFrame() for log_file in log_files: - df = self.parse_file(log_file) - ret_df = ret_df.append(df, ignore_index=True) - + df = self.parse_file(log_file) + ret_df = ret_df.append(df, ignore_index=True) + return ret_df def get_df(self): @@ -149,7 +172,7 @@ def get_df(self): def __parse_args__(self, args: str, args_of_interest): lines = args.split("\n") - args_dict = {i:False for i in args_of_interest} + args_dict = {i: False for i in args_of_interest} for line in lines: try: arg, val = line.split(" ") @@ -161,11 +184,11 @@ def __parse_args__(self, args: str, args_of_interest): except: continue return args_dict - - def __parse_pash_log__(self, args: str, tags_of_interest) : + + def __parse_pash_log__(self, args: str, tags_of_interest): lines = args.split("\n") - log_dict = {i:0 for i in tags_of_interest} - + log_dict = {i: 0 for i in tags_of_interest} + for line in lines: try: tag, val = line.split(": ") @@ -180,50 +203,69 @@ def __parse_pash_log__(self, args: str, tags_of_interest) : def __parse_time_log__(self, timelog: str): data_start = timelog.find("Command being timed: ") - time_data = timelog[data_start: ] + time_data = timelog[data_start:] - lines = [line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1]] + lines = [ + line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1] + ] if len(lines) < 23: - lines = [False]*23 + lines = [False] * 23 data = { - "command" : lines[0], - "user" : lines[1], - "sys" : lines[2], - "cpu%" : lines[3], - "gnu_real" : lines[4], - "max_resident" : lines[9], + "command": lines[0], + "user": lines[1], + "sys": lines[2], + "cpu%": lines[3], + "gnu_real": lines[4], + "max_resident": lines[9], "average_resident": lines[10], - "major_pagefaults" : lines[11], - "minor_pagefaults" : lines[12], - "exit_status" : lines[22] + "major_pagefaults": lines[11], + "minor_pagefaults": lines[12], + "exit_status": lines[22], } return data -#can be used in case we only can parse the time (default commands) + +# can be used in case we only can parse the time (default commands) def process_gnu_time(time_data): data_start = time_data.find("Command being timed: ") - time_data = time_data[data_start: ] - lines = [line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1]] + time_data = time_data[data_start:] + lines = [ + line.split(": ")[1] for line in time_data.replace("\t", "").split("\n")[:-1] + ] data = { - "command" : lines[0], - "user" : lines[1], - "sys" : lines[2], - "cpu%" : lines[3], - "gnu_real" : lines[4], - "max_resident" : lines[9], + "command": lines[0], + "user": lines[1], + "sys": lines[2], + "cpu%": lines[3], + "gnu_real": lines[4], + "max_resident": lines[9], "average_resident": lines[10], - "major_pagefault" : lines[11], - "minor_pagefault" : lines[12], - "exit_status" : lines[22] + "major_pagefault": lines[11], + "minor_pagefault": lines[12], + "exit_status": lines[22], } return data -if __name__ == '__main__': - #sample execution + +if __name__ == "__main__": + # sample execution log_parser = LogParser() - #can pass folder name in first argument + # can pass folder name in first argument if len(argv) > 1: df = log_parser.parse_folder(argv[1]) else: df = log_parser.parse_folder(DEFAULT_LOG_FOLDER) - print(log_parser.get_df()[["test_name", "IN", "r_split_batch_size", "no_eager", "split_type", "exec_time", "cpu%", "width"]].to_string(index = False)) \ No newline at end of file + print( + log_parser.get_df()[ + [ + "test_name", + "IN", + "r_split_batch_size", + "no_eager", + "split_type", + "exec_time", + "cpu%", + "width", + ] + ].to_string(index=False) + ) diff --git a/scripts/test_eval/tester.py b/scripts/test_eval/tester.py index fca5e8825..901d127ac 100644 --- a/scripts/test_eval/tester.py +++ b/scripts/test_eval/tester.py @@ -5,43 +5,76 @@ import pandas as pd import uuid -GIT_TOP_CMD = [ 'git', 'rev-parse', '--show-toplevel', '--show-superproject-working-tree'] -if 'PASH_TOP' in os.environ: - PASH_TOP = os.environ['PASH_TOP'] +GIT_TOP_CMD = [ + "git", + "rev-parse", + "--show-toplevel", + "--show-superproject-working-tree", +] +if "PASH_TOP" in os.environ: + PASH_TOP = os.environ["PASH_TOP"] else: - PASH_TOP = run(GIT_TOP_CMD, stdout=PIPE, stderr=PIPE, universal_newlines=True).stdout.rstrip() + PASH_TOP = run( + GIT_TOP_CMD, stdout=PIPE, stderr=PIPE, universal_newlines=True + ).stdout.rstrip() + class Tests(LogParser): - def __init__(self, in_file = None, batch_sz = 100000): + def __init__(self, in_file=None, batch_sz=100000): self.in_file = in_file self.batch_sz = str(batch_sz) self.log_parser = LogParser() def time(self, command, env, stdout=PIPE): - time_command = ["/usr/bin/time" , "-v", "bash"] + time_command = ["/usr/bin/time", "-v", "bash"] time_command.extend(command) - result = run(time_command, stdout=PIPE, universal_newlines=True, stdin=None, stderr=PIPE, env=env) + result = run( + time_command, + stdout=PIPE, + universal_newlines=True, + stdin=None, + stderr=PIPE, + env=env, + ) return result def get_df(self): return self.log_parser.get_df() - def run_test(self, test_path, width = 2, r_split=False, batch_size=None, in_file=None, no_eager=False, dgsh_tee=False, log_folder=DEFAULT_LOG_FOLDER): - if in_file==None: + def run_test( + self, + test_path, + width=2, + r_split=False, + batch_size=None, + in_file=None, + no_eager=False, + dgsh_tee=False, + log_folder=DEFAULT_LOG_FOLDER, + ): + if in_file == None: in_file = self.in_file - + new_env = os.environ.copy() if in_file == None: in_file = self.in_file new_env["IN"] = in_file new_env["PASH_TOP"] = PASH_TOP - - command = [f"{PASH_TOP}/pa.sh", test_path, "--output_time", f"-w {width}", "-d 1"] + + command = [ + f"{PASH_TOP}/pa.sh", + test_path, + "--output_time", + f"-w {width}", + "-d 1", + ] if r_split: command.append("--r_split") - batch_size = str(batch_size) if batch_size else self.batch_sz #str(int(os.path.getsize(in_file)/90)) + batch_size = ( + str(batch_size) if batch_size else self.batch_sz + ) # str(int(os.path.getsize(in_file)/90)) command.append("--r_split_batch_size") command.append(batch_size) if no_eager: @@ -50,43 +83,77 @@ def run_test(self, test_path, width = 2, r_split=False, batch_size=None, in_file command.append("--dgsh_tee") result = self.time(command, new_env) - - #add IN file to log + + # add IN file to log result.stderr = f"IN {in_file}\n" + result.stderr - #write stderr to log_file if provided + # write stderr to log_file if provided log_file = self.__get_log_file__(test_path, log_folder) - with open(log_file, 'w') as f: + with open(log_file, "w") as f: f.write(result.stderr) - if result.returncode != 0: print(f"failed running: {test_path}") if log_file: print(f"log in {log_file}") - + df = self.log_parser.parse_log(result.stderr) return result, df - #Run provided tests in folder x with the env files - def run_folder_tests(self, tests, folder, width = 2, r_split=False, batch_size=None, in_file=None, no_eager=False, dgsh_tee=False, log_folder=None): + # Run provided tests in folder x with the env files + def run_folder_tests( + self, + tests, + folder, + width=2, + r_split=False, + batch_size=None, + in_file=None, + no_eager=False, + dgsh_tee=False, + log_folder=None, + ): pass - #run a list of tests, each test should be the full path of .sh file - #if log_folder provided it generates unique name for each log - def run_test_list(self, tests, width = 2, r_split=False, batch_size=None, in_file=None, no_eager=False, dgsh_tee=False, log_folder=DEFAULT_LOG_FOLDER): + # run a list of tests, each test should be the full path of .sh file + # if log_folder provided it generates unique name for each log + def run_test_list( + self, + tests, + width=2, + r_split=False, + batch_size=None, + in_file=None, + no_eager=False, + dgsh_tee=False, + log_folder=DEFAULT_LOG_FOLDER, + ): df = pd.DataFrame() for test in tests: - result, dfnew = self.run_test(test, width, r_split, batch_size, in_file, no_eager, dgsh_tee, log_folder) + result, dfnew = self.run_test( + test, + width, + r_split, + batch_size, + in_file, + no_eager, + dgsh_tee, + log_folder, + ) df = df.append(dfnew, ignore_index=True) return df - + def __get_log_file__(self, test_path, log_folder): if not os.path.exists(log_folder): os.makedirs(log_folder, exist_ok=True) - temp_filename = os.path.basename(test_path).replace(".sh", "") + "_" + str(uuid.uuid4()) + ".log" + temp_filename = ( + os.path.basename(test_path).replace(".sh", "") + + "_" + + str(uuid.uuid4()) + + ".log" + ) log_file = os.path.join(log_folder, temp_filename) - return log_file \ No newline at end of file + return log_file diff --git a/scripts/ws-client.py b/scripts/ws-client.py index b0f44a933..56a4aabcf 100644 --- a/scripts/ws-client.py +++ b/scripts/ws-client.py @@ -5,58 +5,77 @@ from websocket import create_connection -RESULT_POLLING_FREQUENCY=60 +RESULT_POLLING_FREQUENCY = 60 + def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("-b", "--target_branch", - help="the target branch to fork and run the tests on") - parser.add_argument("-c", "--target_commit", - help="the target commit to checkout to run the tests on") - parser.add_argument("-m", "--mode", - help="the execution mode. `run` runs and waits until the results are there, `wait` just waits, and `check` just returns the current task", - choices=['run', 'wait', 'check'], - default='run') + parser.add_argument( + "-b", "--target_branch", help="the target branch to fork and run the tests on" + ) + parser.add_argument( + "-c", + "--target_commit", + help="the target commit to checkout to run the tests on", + ) + parser.add_argument( + "-m", + "--mode", + help="the execution mode. `run` runs and waits until the results are there, `wait` just waits, and `check` just returns the current task", + choices=["run", "wait", "check"], + default="run", + ) args = parser.parse_args() return args + def issue_test_run(websocket, target_commit, target_branch): - run_tests_req_data = {"cmd": {"job": "issue", - "benchmark": "CORRECTNESS", - "commit": target_commit, - "branch": target_branch, - }} - msg = json.dumps(run_tests_req_data) + run_tests_req_data = { + "cmd": { + "job": "issue", + "benchmark": "CORRECTNESS", + "commit": target_commit, + "branch": target_branch, + } + } + msg = json.dumps(run_tests_req_data) websocket.send(msg) - print("POSIX Tests request made for branch:", target_branch, "and commit:", target_commit, file=sys.stderr) + print( + "POSIX Tests request made for branch:", + target_branch, + "and commit:", + target_commit, + file=sys.stderr, + ) + def fetch_runs(websocket): - data = {"cmd": {"job": "/fetch_runs", - "count": 50}} - msg = json.dumps(data) + data = {"cmd": {"job": "/fetch_runs", "count": 50}} + msg = json.dumps(data) # print("Sending:", msg, file=sys.stderr) websocket.send(msg) # print("Sent!", file=sys.stderr) res = websocket.recv() - runs_data = json.loads(res) + runs_data = json.loads(res) return runs_data + def current_task(websocket): data = {"cmd": {"job": "/current_task"}} - msg = json.dumps(data) + msg = json.dumps(data) # print("Sending:", msg, file=sys.stderr) websocket.send(msg) # print("Sent!", file=sys.stderr) res = websocket.recv() - res_data = json.loads(res) + res_data = json.loads(res) return res_data + def wait_for_result(websocket, target_commit): found = False sleep_duration = RESULT_POLLING_FREQUENCY while not found: - ## Fetch all runs runs_data = fetch_runs(websocket) result_rows = runs_data["data"]["rows"] @@ -96,7 +115,7 @@ def wait_for_result(websocket, target_commit): if args.mode == "run": ## Issue the POSIX tests requests issue_test_run(ws, target_commit, target_branch) - + ## ## Wait until we have the POSIX test results ## @@ -106,4 +125,4 @@ def wait_for_result(websocket, target_commit): print(result_row) -ws.close() \ No newline at end of file +ws.close()