diff --git a/scripts/expanderAggregate.py b/scripts/expanderAggregate.py index 835a3e8c..0f9b29fa 100644 --- a/scripts/expanderAggregate.py +++ b/scripts/expanderAggregate.py @@ -1,50 +1,48 @@ from pathlib import Path +from typing import Union, Optional + import openpyxl from openpyxl.styles import Font import sys import argparse +ID_SPACE = (15000, 16000) + +FREE_IDS = list(range(ID_SPACE[0] + 1, ID_SPACE[1])) +PREFIX = 'BCIO' +ID_WIDTH = 6 + + +def register_id(id: Union[str, int]) -> Optional[int]: + if isinstance(id, str): + [prefix, id_str] = id.split(":") + if prefix.lower() != prefix.lower(): + return None + + id = int(id_str.strip()) + + if id in FREE_IDS: + FREE_IDS.remove(id) + + return id +def new_id() -> int: + return FREE_IDS.pop(0) -def generate_unique_id(base_id, id_list, num): - num_list = [] - split_base = base_id.split(":") - #print("Got split base",split_base[0]) - #print("id_list is ",id_list) - - # Get all BCIO IDs - for id in id_list: - split_id = id.split(":") - if split_id[0] == "BCIO": - split_id_num = int(split_id[1]) - if split_id_num > 15000 and split_id_num < 16000: - num_list.append(split_id_num) - num_list.sort() - - new_num = max(num_list) + 1 # The default is the latest one - - if split_base[0] == 'BCIO': - split_base_num = int(split_base[1]) - if split_base_num > 15000 and split_base_num < 16000: - if split_base_num+num not in num_list: # First fill gaps - new_num = split_base_num + num - - id_list.append("BCIO:"+str(new_num).zfill(6)) - - return id_list, "BCIO:"+str(new_num).zfill(6) - - -def add_extra_values(header, row, aggregate, id_list):#, num): + +def new_id_str() -> str: + return f"{PREFIX}:{str(new_id()).zfill(ID_WIDTH)}" + + +def add_extra_values(header, row, aggregate): aggregate = aggregate.lower() aggregate_list = aggregate.split(";") - #add "aggregate" to beginning of aggregate_list - aggregate_list.insert(0, "aggregate") + # add "aggregate" to beginning of aggregate_list + aggregate_list.insert(0, "aggregate") extra_rows = [] - i = 0#num for agg in aggregate_list: agg = agg.strip() - i = i+1 extra_values = {} name = "" for key, cell in zip(header, row): @@ -59,11 +57,11 @@ def add_extra_values(header, row, aggregate, id_list):#, num): elif key == "Parent": if agg == "aggregate": extra_values[key] = 'data item' - else: + else: extra_values[key] = f"{name} population statistic" - elif key == "ID": - id_list, new_id = generate_unique_id(cell.value, id_list, i) - extra_values[key] = new_id + elif key == "ID": + new_id = new_id_str() + extra_values[key] = new_id elif key == "Definition": extra_values[key] = "The " + agg + " of " + name + " in a population." elif key in ["Curation status"]: @@ -73,69 +71,55 @@ def add_extra_values(header, row, aggregate, id_list):#, num): elif key == "REL 'aggregate of'": extra_values[key] = name else: - extra_values[key] = "" + extra_values[key] = "" if any(extra_values.values()): extra_rows.append(extra_values) - return id_list, extra_rows#, num + + return extra_rows ## PROGRAM EXECUTION --- required argument: input file name if __name__ == '__main__': - parser=argparse.ArgumentParser() - parser.add_argument('--inputExcel', '-i',help='Name of the input Excel spreadsheet file') - - args=parser.parse_args() + parser = argparse.ArgumentParser() + parser.add_argument('--inputExcel', '-i', help='Name of the input Excel spreadsheet file') + + args = parser.parse_args() inputFileName = args.inputExcel - - if inputFileName is None : + + if inputFileName is None: parser.print_help() sys.exit('Not enough arguments. Expected at least -i "Excel file name" ') pathpath = str(Path(inputFileName).parents[0]) basename = str(Path(inputFileName).stem) suffix = str(Path(inputFileName).suffix) - + wb: openpyxl.Workbook = openpyxl.load_workbook(inputFileName) sheet = wb.active data = sheet.rows rows = [] aggregate_list = ["Mean", "Minimum", "Maximum", "Median"] header = [i.value for i in next(data)] - - #build ID list: - id_list = [] + + # build ID list: for row in sheet[2:sheet.max_row]: values = {} extra_rows = [] for key, cell in zip(header, row): values[key] = cell.value if key == "ID" and cell.value != None: - id_list.append(cell.value) - - # next_id = max(int(x.split(":")[-1]) for x in id_list if x.split(":")[0] == 'BCIO') + 1 - # # Generate missing IDs - # for row in sheet[2:sheet.max_row]: - # for key, cell in zip(header, row): - # if key == "ID" and cell.value is None: - # cell.value = f"BCIO:{next_id:06}" - # id_list.append(cell.value) - # next_id += 1 - wb.save(inputFileName) - + register_id(cell.value) # copy original: for row in sheet[2:sheet.max_row]: values = {} - extra_rows = [] - for key, cell in zip(header, row): + for key, cell in zip(header, row): values[key] = cell.value if any(values.values()): rows.append(values) - #create and add extra rows to end of sheet: - #num = -1 for row in sheet[2:sheet.max_row]: values = {} extra_rows = [] @@ -143,37 +127,30 @@ def add_extra_values(header, row, aggregate, id_list):#, num): for key, cell in zip(header, row): values[key] = cell.value if key == "Aggregate" and cell.value != None and cell.value != "": - #num = num+1 - id_list, extra_rows = add_extra_values(header, row, cell.value, id_list)#, num) #, num - for extra_row in extra_rows: #add to end of sheet - rows.append(extra_row) - - for r in range(len(rows)): - row = [v for v in rows[r].values()] - if "Aggregate" in header: - cell = row[header.index("Aggregate")] + extra_rows = add_extra_values(header, row, cell.value) - #new sheet to save: + for extra_row in extra_rows: # add to end of sheet + rows.append(extra_row) + # new sheet to save: save_wb = openpyxl.Workbook() save_sheet = save_wb.active for c in range(len(header)): - save_sheet.cell(row=1, column=c+1).value=header[c] - save_sheet.cell(row=1, column=c+1).font = Font(size=12,bold=True) + save_sheet.cell(row=1, column=c + 1).value = header[c] + save_sheet.cell(row=1, column=c + 1).font = Font(size=12, bold=True) + for r in range(len(rows)): row = [v for v in rows[r].values()] - #print("got row", row) - for c in range(len(header)): - #todo: why index out of bounds error here? Empty cells - try: - #print("got c:", row[c]) - save_sheet.cell(row=r+2, column=c+1).value=row[c] - except: - pass - # print("row[c] not there") - #save: + # print("got row", row) + for c in range(len(header)): + # todo: why index out of bounds error here? Empty cells + try: + # print("got c:", row[c]) + save_sheet.cell(row=r + 2, column=c + 1).value = row[c] + except: + pass + # print("row[c] not there") + # save: save_wb.save(pathpath + "/" + basename + "_Expanded.xlsx") print("success") - -