clusterQuality.py
# -*- coding: utf-8 -*-
# Read a tagged cluster file and compute the Rand index (and related metrics)
# by comparing the predicted clusters to the "ground truth" gene classes
# encoded in the read names.
import sys
from sklearn import metrics
from collections import defaultdict as dd
# Disable warnings for this script
import warnings
warnings.filterwarnings("ignore")
clusterFile = sys.argv[1]  # tab-separated cluster file (one cluster per line)
fasta_file = sys.argv[2]   # FASTA file of the clustered reads
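# Usage sketch (the input file names below are illustrative, not from the original):
#   python clusterQuality.py clusters.tsv reads.fasta
# clusters.tsv: one cluster per line, read ids or names separated by tabs
# reads.fasta:  the FASTA file the clusters were built from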
# Clustering general info
nbCluster = 0
nbElem = 0
single = 0
classes = []
# SciKit label vectors
labels_pred = []
labels_true = []
# storing clusters
clusters = {}
# PARSING FASTA FILE
# Keeps the real number of reads and lets us map
# numeric read ids back to their original names if required.
read_names = []
with open(fasta_file) as f:
    for line in f:
        if line[0] == ">":
            # keep only the sequence identifier (drop ">" and any description)
            line = line.rstrip("\n").split()[0][1:]
            read_names.append(line)
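# Example (hypothetical FASTA header, for illustration only):
#   ">read42_geneA some description" is stored as "read42_geneA".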
# PARSING CLUSTER FILE
with open(clusterFile) as f:
    for cluster in f:
        data = cluster.rstrip("\n").split("\t")
        cl_size = 0
        for el in data:
            # If the read names are numbers, use them as ids
            # to retrieve the original names from the FASTA file.
            try:
                read_name = read_names[int(el)]
            except ValueError:
                # If the read name is not a number,
                # just use it as is.
                read_name = el
            # Associate the read name with its cluster number
            clusters[read_name] = nbCluster
            # cluster size
            cl_size += 1
        # Update the singleton count
        if cl_size == 1:
            single += 1
        # and update the cluster id / total sizes
        nbCluster += 1
        nbElem += cl_size
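# Example cluster lines (hypothetical, tab-separated):
#   "0\t3\t7"                   -> three reads referenced by their index in the FASTA
#   "read1_geneA\tread5_geneA"  -> two reads referenced directly by name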
# LABELING
print("Generating labels...")
for read in read_names:
    # In case not all singletons are included in the cluster file,
    # we need to count them now.
    # Try to fetch the read's cluster id.
    try:
        cl_id = clusters[read]
    except KeyError:
        # Key not found: this read is in no cluster, so it is a singleton.
        single += 1
        # Use the current number of clusters as its cluster id
        cl_id = nbCluster
        # and increase the number of clusters.
        nbCluster += 1
    # "truth": the class tag is the suffix of the read name
    tag = read.split("_")[-1]
    # Store distinct classes
    if tag not in classes:
        classes.append(tag)
    # "truth" label
    labels_true.append(classes.index(tag))
    # prediction label
    labels_pred.append(cl_id)
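# Example (hypothetical read name): "read42_geneA".split("_")[-1] gives "geneA",
# so every read whose name ends in "_geneA" is assigned the same true class label.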
print("Number of clusters:", nbCluster, sep="\t")
print("Number of classes:", len(set(labels_true)), sep="\t")
print("Singletons: ", single, sep="\t")
print("Total number of elements", nbElem, sep="\t")
##############################################################################
# Metrics
#
# print(metrics.classification_report(labels_true, labels_pred))#, labels=classes))
MI = metrics.mutual_info_score(labels_true, labels_pred)
print("Mutual information :", MI, sep="\t")
# # Adjusted Rand Index
ari = metrics.adjusted_rand_score(labels_true, labels_pred)
print("Adjusted Rand Index :", ari, sep="\t")
# # Homogeneity, completeness and V-measure
h, c, vm = metrics.homogeneity_completeness_v_measure(labels_true, labels_pred)
print("Homogeneity :", h, sep="\t")
print("Completeness :", c, sep="\t")
print("V-measure :", vm, sep="\t")
# acc = metrics.accuracy_score(labels_true, labels_pred)
# print("Accuracy :", acc, sep="\t")
# Precision, recall, fscore
# pr, rc, f_score, support = metrics.precision_recall_fscore_support(labels_true, labels_pred)
# print("Precison :", pr, sep="\t")
# print("Recall :", rc, sep="\t")
# print("F1-Score :", f_score, sep="\t")