Skip to content

Commit

Permalink
Won't fix #93. Refining K562E special cases, output is still wrong
Browse files — browse the repository at this point in the history
  • Loading branch information
mdozmorov committed Jul 31, 2015
1 parent a4584e1 commit f653cd7
Showing 1 changed file with 16 additions and 12 deletions.
28 changes: 16 additions & 12 deletions grsnp/dbcreator_encode.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -444,15 +444,19 @@ def _get_celltype(f_name, gf_group):
if f_name.startswith(padding[gf_group]):
f_name = f_name[len(padding[gf_group]):]

# these are special cases
if "K562b" in f_name:
cell_type = "K562"
f_name = f_name.replace("K562b","K562")
categories = re.findall('[A-Z][^A-Z]*', f_name.split('.')[0])
elif "K562E" in f_name and "K562Ezh2" not in f_name:
cell_type = "K562"
f_name = f_name.replace("K562E","K562")
categories = re.findall('[A-Z][^A-Z]*', f_name.split('.')[0])
# # these are special cases
# if "K562b" in f_name:
# cell_type = "K562"
# f_name = f_name.replace("K562b","K562")
# categories = re.findall('[A-Z][^A-Z]*', f_name.split('.')[0])
# elif "K562E" in f_name and "K562Ezh2" not in f_name and "HaibK562" not in f_name and "SydhK562" not in f_name:
# cell_type = "K562"
# f_name = f_name.replace("K562Efos","K562Fos") # Capitalize factor letters
# f_name = f_name.replace("K562Egata","K562Gata")
# f_name = f_name.replace("K562Ehdac8","K562Hdac8")
# f_name = f_name.replace("K562Ejun","K562Jun")
# pdb.set_trace()
# categories = re.findall('[A-Z][^A-Z]*', f_name.split('.')[0])
categories = re.findall('[A-Z][^A-Z]*', f_name.split('.')[0])
return categories[1]

Expand Down Expand Up @@ -784,9 +788,9 @@ def update_progress(progress):
download_dir = os.path.join(args["data_dir"],"downloads",args['organism'])
gfs = args["featuregroups"].split(",")
gf_descriptions = _read_description_file(data_dir,args["organism"])
# for grp in ["wgEncodeSydhHistone"]:
for grp in gf_grp_sett.keys():
create_feature_set(data_dir,args['organism'],grp,None,6) # Remove ',2' limit to create full database
for grp in ["wgEncodeAwgTfbsUniform"]:
# for grp in gf_grp_sett.keys():
create_feature_set(data_dir,args['organism'],grp,None) # Remove ',2' limit to create full database
else:
print "ERROR: Requires UCSC organism code. Use --help for more information"
sys.exit()
Expand Down

0 comments on commit f653cd7

Please sign in to comment.