From 1af9f1a6d0a4db3a7204a8eeb9908b1da7f64cbc Mon Sep 17 00:00:00 2001 From: Darrin Schultz Date: Mon, 10 May 2021 13:36:38 -0700 Subject: [PATCH] updating odp for li - significance plots were not plotting --- scripts/odp | 142 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 44 deletions(-) diff --git a/scripts/odp b/scripts/odp index 398d6b5..5e29d52 100644 --- a/scripts/odp +++ b/scripts/odp @@ -15,7 +15,7 @@ import sys configfile: "config.yaml" # check for legal config entries. Useful for finding misspelled entries -legal = ["proteins", "prot_to_loc", "genome", "minscafsize", "manual_breaks", "chrom_to_color"] +legal = ["proteins", "prot_to_loc", "genome", "minscafsize", "manual_breaks", "chrom_to_color", "plotorder"] illegal = set() for this_axis in ["xaxisspecies", "yaxisspecies"]: if this_axis in config: @@ -95,6 +95,7 @@ rule all: # working on plotting for color expand("synteny_analysis/prot_to_color/{colorby}_prottocolor.tsv", colorby = [x for x in config["xaxisspecies"] if "chrom_to_color" in config["xaxisspecies"][x]]), + # These need to be rewritten such that x and y do not match expand_avoid_matching_x_and_y_third( "synteny_analysis/plots/synteny_colored_by/{}_and_{}_coloredby_{}_synteny_autobreaks.pdf", config["xaxisspecies"], config["yaxisspecies"], @@ -123,20 +124,20 @@ rule all: "synteny_analysis/dvalue_table_breaks/{}_and_{}_xbreaks_manual.tsv", config["xaxisspecies"], config["yaxisspecies"]), - ##make the correlation plots - whole chromosomes - #expand_avoid_matching_x_and_y( - # "synteny_analysis/plots/significance/wholechr/{}_and_{}_fisher_wholechr.pdf", - # config["xaxisspecies"], config["yaxisspecies"]), - #expand_avoid_matching_x_and_y( - # "synteny_analysis/plots/significance/wholechr_colors/{}_and_{}_fisher_chromcolor.pdf", - # config["xaxisspecies"], config["yaxisspecies"]), - ## this currently doesn't work - need to trace back where the errors are from - #expand_avoid_matching_x_and_y( - #"synteny_analysis/plots/significance/breaks/{}_and_{}_fisher_manualbreaks.pdf", - # config["xaxisspecies"], config["yaxisspecies"]), - #expand_avoid_matching_x_and_y( - #"synteny_analysis/plots/significance/breaks/{}_and_{}_fisher_autobreaks.pdf", - # config["xaxisspecies"], config["yaxisspecies"]) + #make the correlation plots - whole chromosomes + expand_avoid_matching_x_and_y( + "synteny_analysis/plots/significance/wholechr/{}_and_{}_fisher_wholechr.pdf", + config["xaxisspecies"], config["yaxisspecies"]), + expand_avoid_matching_x_and_y( + "synteny_analysis/plots/significance/wholechr_colors/{}_and_{}_fisher_chromcolor.pdf", + config["xaxisspecies"], config["yaxisspecies"]), + # this currently doesn't work - need to trace back where the errors are from + expand_avoid_matching_x_and_y( + "synteny_analysis/plots/significance/breaks/{}_and_{}_fisher_manualbreaks.pdf", + config["xaxisspecies"], config["yaxisspecies"]), + expand_avoid_matching_x_and_y( + "synteny_analysis/plots/significance/breaks/{}_and_{}_fisher_autobreaks.pdf", + config["xaxisspecies"], config["yaxisspecies"]) def filter_fasta_chrom(chrom_file, input_fasta, output_fasta): @@ -345,7 +346,7 @@ rule get_genome_coords_x: print($name, length($seq), sum) }} \ }}' {input.genome} | \ sort -k2 -nr | \ - awk '{{sum = sum + $2; print($1, $2, sum) }}' > {output.coords} + awk '{{sum = sum + $2; print($1, $2, sum, sum - $2) }}' > {output.coords} """ rule get_genome_coords_y: @@ -363,22 +364,50 @@ rule get_genome_coords_y: print($name, length($seq), sum) }} \ }}' {input.genome} | \ sort -k2 -nr | \ - awk '{{sum = sum + $2; print($1, $2, sum) }}' > {output.coords} + awk '{{sum = sum + $2; print($1, $2, sum, sum - $2) }}' > {output.coords} """ +def genome_coords_to_plotstart_dict(path_to_genocoords_file): + """ + Takes a genome coords file where: + - col1: scaf name + - col2: scaflen + - col3: cumsum of the total plot size + - col4: the plot starting position for that scaffold + + sca1 3822568 3822568 0 + sca2 2667796 6490364 3822568 + sca3 2526311 9016675 2667796 + sca4 2410750 11427425 2526311 + sca5 2150379 13577804 2410750 + sca6 1771964 15349768 2150379 + + And returns a dict where col1 (scaf name) is key + and col4 (plotting offset) is the value + """ + offset_dict = {} + with open(path_to_genocoords_file, "r") as f: + for line in f: + line = line.strip() + if line: + splitd = line.split() + offset_dict[splitd[0]] = int(splitd[3]) + return offset_dict + def genome_coords_to_offset_dict(path_to_genocoords_file): """ Takes a genome coords file where: - col1: scaf name - col2: scaflen - col3: cumsum of the total plot size + - col4: the plot starting position for that scaffold - sca1 3822568 3822568 - sca2 2667796 6490364 - sca3 2526311 9016675 - sca4 2410750 11427425 - sca5 2150379 13577804 - sca6 1771964 15349768 + sca1 3822568 3822568 0 + sca2 2667796 6490364 3822568 + sca3 2526311 9016675 2667796 + sca4 2410750 11427425 2526311 + sca5 2150379 13577804 2410750 + sca6 1771964 15349768 2150379 And returns a dict where col1 (scaf name) is key and col3 (plotting offset) is the value @@ -501,7 +530,7 @@ def parse_coords(coords_file, sample, xory, max_coord = 0 lines_at = [] df = pd.read_csv(coords_file, header = None, sep = " ") - df.columns = ["scaf", "scaflen", "cumsum"] + df.columns = ["scaf", "scaflen", "cumsum", "coordstart"] # now figure out if we need to sort or not drop_nas = True # make sure that plotorder and sort_by_x_coord_blast aren't there together @@ -631,23 +660,24 @@ def calc_D_for_y_and_x(df, x_offset, y_offset, x_scaf_to_len, y_scaf_to_len): barright = -1 barmiddle = -1 barwidth = -1 - if i == 0: - thisend = row["{}stop".format(thisdir)] - nextstart = xdf.loc[i+1, "{}start".format(thisdir)] - barleft = this_offset[thisx] - barright = thisend + ((nextstart-thisend)/2) - elif i == (len(xdf) - 1): - prevend = xdf.loc[i-1, "{}stop".format(thisdir)] - thisstart = row["{}start".format(thisdir)] - barleft = prevend + ((thisstart-prevend)/2) - barright = this_scaf_to_len[thisx] - else: - prevend = xdf.loc[i-1, "{}stop".format(thisdir)] - thisstart = row["{}start".format(thisdir)] - thisend = row["{}stop".format(thisdir)] - nextstart = xdf.loc[i+1, "{}start".format(thisdir)] - barleft = prevend + ((thisstart-prevend)/2) - barright = thisend + ((nextstart-thisend)/2) + if len(xdf) > 1: + if i == 0: + thisend = row["{}stop".format(thisdir)] + nextstart = xdf.loc[i+1, "{}start".format(thisdir)] + barleft = this_offset[thisx] + barright = thisend + ((nextstart-thisend)/2) + elif i == (len(xdf) - 1): + prevend = xdf.loc[i-1, "{}stop".format(thisdir)] + thisstart = row["{}start".format(thisdir)] + barleft = prevend + ((thisstart-prevend)/2) + barright = this_scaf_to_len[thisx] + else: + prevend = xdf.loc[i-1, "{}stop".format(thisdir)] + thisstart = row["{}start".format(thisdir)] + thisend = row["{}stop".format(thisdir)] + nextstart = xdf.loc[i+1, "{}start".format(thisdir)] + barleft = prevend + ((thisstart-prevend)/2) + barright = thisend + ((nextstart-thisend)/2) xdf.loc[i, "D{}_barleft".format(thisdir)] = barleft xdf.loc[i, "D{}_barright".format(thisdir)] = barright xdf.loc[i, "D{}_barmiddle".format(thisdir)] = barleft + ((barright - barleft)/2) @@ -658,7 +688,8 @@ def calc_D_for_y_and_x(df, x_offset, y_offset, x_scaf_to_len, y_scaf_to_len): df.reset_index(drop=True, inplace = True) return df -def determine_breaks(df, scaf_to_breaks_set, scaf_to_offset_dict, sort_direction, auto_breaks): +def determine_breaks(df, scaf_to_breaks_set, scaf_to_offset_dict, + sort_direction, auto_breaks): """ determines the major breaks in Dx or Dy to use as partitions. @@ -696,8 +727,17 @@ def determine_breaks(df, scaf_to_breaks_set, scaf_to_offset_dict, sort_direction offset_position = thisposition + scaf_to_offset_dict[thisscaf] subdf = df.loc[df[sort_order[sort_direction]["end"]] <= offset_position, ] subdf = subdf.sort_values(by=[sort_order[sort_direction]["end"]]) + tempdf = df.loc[df[sort_order[sort_direction]["chrom"]] == thisscaf, ["xgene", "ygene", "xstart"]] + #print(thisscaf, thisposition, offset_position) + #print(tempdf) + #print(subdf.loc[subdf.index[-1]]) + #print() + #sys.exit() manual_breaks_indices.add(subdf.index[-1]) manual_breaks_indices = list(manual_breaks_indices) + #if not auto_breaks: + # print("manual_breaks") + # print(manual_breaks_indices) all_ranges = set() if auto_breaks: @@ -880,6 +920,8 @@ def gen_plotting_df(ycoords_file, xcoords_file, ystruct["prot_plot_start"] = {} ystruct["prot_plot_middle"] = {} ystruct["prot_plot_stop"] = {} + #print("xstruct") + #print(xstruct) # get rid of proteins that we don't need along the x axis # also set the plotting position based on the offset for prot in list(xstruct["prot_to_middle"].keys()): @@ -910,6 +952,8 @@ def gen_plotting_df(ycoords_file, xcoords_file, "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"] df = df[["xgene", "ygene", "bitscore", "evalue"]] + df["xgene"] = df["xgene"].astype(str) + df["ygene"] = df["ygene"].astype(str) #print(x_prot_to_loc) df["xstart"] = df["xgene"].map(xstruct["prot_plot_start"]) df["xmiddle"] = df["xgene"].map(xstruct["prot_plot_middle"]) @@ -1012,11 +1056,13 @@ rule generate_breaks_file: if scaf not in y_breaks: y_breaks[scaf] = set() y_breaks[scaf].add(position) - x_offset = genome_coords_to_offset_dict(input.xcoords) - y_offset = genome_coords_to_offset_dict(input.ycoords) + x_offset = genome_coords_to_plotstart_dict(input.xcoords) + y_offset = genome_coords_to_plotstart_dict(input.ycoords) print("printing the input before finding the breaks") df = pd.read_csv(input.table, delimiter="\t", index_col=0) + df["xgene"] = df["xgene"].astype(str) + df["ygene"] = df["ygene"].astype(str) print(df) # parse the manual breaks in the config file @@ -1072,7 +1118,11 @@ def synteny_plot(plotting_df, xcoords_file, ycoords_file, # dotted lines to show breaks in synteny # example entries are like this: xbreaks_df = pd.read_csv(xbreaks_file, delimiter="\t", index_col=0) + xbreaks_df["xgene"] = xbreaks_df["xgene"].astype(str) + xbreaks_df["ygene"] = xbreaks_df["ygene"].astype(str) ybreaks_df = pd.read_csv(ybreaks_file, delimiter="\t", index_col=0) + ybreaks_df["xgene"] = ybreaks_df["xgene"].astype(str) + ybreaks_df["ygene"] = ybreaks_df["ygene"].astype(str) # first make a lookup table of how to calculate the @@ -1090,6 +1140,8 @@ def synteny_plot(plotting_df, xcoords_file, ycoords_file, # first make a lookup table df = pd.read_csv(plotting_df, delimiter="\t", index_col=0) + df["xgene"] = df["xgene"].astype(str) + df["ygene"] = df["ygene"].astype(str) xgene = df["xgene"] x = df["xmiddle"] @@ -1387,6 +1439,8 @@ def fishers_exact(plotting_df, # first make a lookup table df = pd.read_csv(plotting_df, delimiter="\t", index_col=0) + df["xgene"] = df["xgene"].astype(str) + df["ygene"] = df["ygene"].astype(str) fisher_dict = [] if style == "breaks":