diff --git a/coral/sql.py b/coral/sql.py
index 280d6db..a8c58d8 100644
--- a/coral/sql.py
+++ b/coral/sql.py
@@ -17,6 +17,10 @@
 DEBUG = False
+HDBSCAN = "hdbscan"
+K_PROTOTYPES = "k-prototypes"
+K_MODES = "k-modes"
+K_MEANS = "k-means"
 
 if DEBUG:
     # for debugging. Shut down api-1-container
@@ -883,46 +887,7 @@ def recommendSplit():
         if int(request.values["numberOfClusters"]) > 0:
             optimal_k = int(request.values["numberOfClusters"])
         else:
-            # determine useful number of clusters
-            # get the optimal number of clusters
-            n_clusters_range = range(2, 60)  # arbitrarily chosen (talk to the user about this)
-
-            # Calculate within-cluster sum of squares (inertia) for different k values
-            inertia_values = []
-            for k in n_clusters_range:
-                kmeans = KMeans(n_clusters=k, n_init='auto')
-                kmeans.fit(tissues_attribute_df)
-                inertia_values.append(kmeans.inertia_)
-
-            # Calculate the rate of change of inertia
-            # Inertia measures how well a dataset was clustered by K-Means. It is calculated by measuring the distance between each data point and its centroid, squaring this distance, and summing these squares across one cluster.
-            # https://www.codecademy.com/learn/machine-learning/modules/dspath-clustering/cheatsheet
-            rate_of_change = np.diff(inertia_values)  # rate of change from 1 to 2, 2 to 3, etc
-
-            # Find the "elbow point" where the rate of change starts to slow down
-            # Calculate the "elbow point" where the rate of change slows down
-            # if no elbow_point is found, the last index of inertia_values is chosen
-            elbow_point = len(inertia_values) - 1
-
-            _log.debug("rate_of_change %s", rate_of_change)
-            _log.debug("inertia_values %s", inertia_values)
-
-            for i in range(len(rate_of_change) - 1):
-                diff1 = rate_of_change[i]
-                diff2 = rate_of_change[i + 1]
-                change_ratio = diff2 / diff1
-                _log.debug("change_ratio %s", change_ratio)
-                if change_ratio < 0.1:  # this is an "arbitrary" threshold. The smaller the threshold, the more clusters are chosen
-                    elbow_point = i  # the rate_of_change show e.g. the change from 3 clusters to 4 cluster in index 2 of rate_of_change.
-                    # so the elbow point is the index of the rate_of_change where the change from e.g. 3 to 4 is not big anymore: index 2. 3 clusters is a good amount of clusters.
-                    break
-
-            optimal_k = n_clusters_range[elbow_point]  # e.g. at the elbow point: 2 the number of clusters is 3
-
-            # somehow, for this data, this approach does not work. The change_ratio does not start high and go down, but is e.g. 0.40, 0.49, 0.61, 0.63, 0.59, 0.79, 0.49, 1.37, etc
-
-            _log.debug("Optimal number of clusters: %s", optimal_k)
-
+            optimal_k = k_ellbow(tissues_attribute_df, range(2, 10), K_MEANS)
         # do the clustering with the optimal or userdefined number of clusters
         clusterer_attribute_kmeans = KMeans(n_clusters=optimal_k, n_init='auto')
         clusterer_attribute_kmeans.fit(tissues_attribute_df)
@@ -991,6 +956,58 @@ def recommendSplit():
         abort(400, error)
 
 
+# Helper function to determine a useful number of clusters
+def k_ellbow(tissues_attribute_df, n_clusters_range=range(2, 60), cluster_method=K_MEANS, position_of_cat_attr=None):
+
+    # Calculate within-cluster sum of squares (inertia) for different k values
+    inertia_values = []
+    for k in n_clusters_range:
+        if cluster_method == K_MEANS:
+            kmeans = KMeans(n_clusters=k, n_init='auto')
+            kmeans.fit(tissues_attribute_df)
+            inertia_values.append(kmeans.inertia_)
+            # Inertia measures how well a dataset was clustered by K-Means.
+            # It is calculated by measuring the distance between each data point and its centroid, squaring this distance, and summing these squares across one cluster.
+        elif cluster_method == K_PROTOTYPES:
+            clusterer = KPrototypes(n_clusters=k, init='Cao', n_init=1, verbose=2)
+            clusterer.fit(tissues_attribute_df, categorical=position_of_cat_attr)
+            inertia_values.append(clusterer.cost_)
+            # is cost_ the right value to use here?
+            # For the K-prototypes function, cost is defined as the sum distance of all points to their respective cluster centroids.
+            # ==> same as inertia
+
+    # Calculate the rate of change of inertia
+    # Inertia measures how well a dataset was clustered by K-Means. It is calculated by measuring the distance between each data point and its centroid, squaring this distance, and summing these squares across one cluster.
+    # https://www.codecademy.com/learn/machine-learning/modules/dspath-clustering/cheatsheet
+    rate_of_change = np.diff(inertia_values)  # rate of change from 1 to 2, 2 to 3, etc
+
+    # Find the "elbow point" where the rate of change starts to slow down
+    # Calculate the "elbow point" where the rate of change slows down
+    # if no elbow_point is found, the last index of inertia_values is chosen
+    elbow_point = len(inertia_values) - 1
+
+    _log.debug("rate_of_change %s", rate_of_change)
+    _log.debug("inertia_values %s", inertia_values)
+
+    for i in range(len(rate_of_change) - 1):
+        diff1 = rate_of_change[i]
+        diff2 = rate_of_change[i + 1]
+        change_ratio = diff2 / diff1
+        _log.debug("change_ratio %s", change_ratio)
+        if change_ratio < 0.1:  # this is an "arbitrary" threshold. The smaller the threshold, the more clusters are chosen
+            elbow_point = i  # rate_of_change[i] holds the change from k = i + 2 to k = i + 3 clusters (e.g. index 1: from 3 to 4 clusters)
+            # so the elbow point is the index of rate_of_change where the next change (e.g. from 3 to 4 clusters) is no longer big: index 1, i.e. 3 clusters is a good amount of clusters.
+            break
+
+    optimal_k = n_clusters_range[elbow_point]  # e.g. elbow_point 1 gives n_clusters_range[1] = 3 clusters
+
+    # somehow, for this data, this approach does not work. The change_ratio does not start high and then drop, but stays at e.g. 0.40, 0.49, 0.61, 0.63, 0.59, 0.79, 0.49, 1.37, etc
+
+    _log.debug("Optimal number of clusters: %s", optimal_k)
+    return optimal_k
+
+
+
 @app.route("/createAutomatically", methods=["GET", "POST"])
 @login_required
 def create_automatically():
@@ -1013,10 +1030,6 @@ def create_automatically():
     # if there is just one (numerical) attribute, use hdbscan
     # if there are two (numerical) attributes, use hdbscan
     # if there is one categorical and one numerical attribute, use k-prototypes
-    HDBSCAN = "hdbscan"
-    K_PROTOTYPES = "k-prototypes"
-    K_MODES = "k-modes"
-    K_MEANS = "k-means"
     cluster_method = None
 
     query = QueryElements()
@@ -1076,15 +1089,16 @@ def create_automatically():
             # kmeans end
             # add the cluster labels to the tissues
        elif cluster_method == K_PROTOTYPES:
-            # 1 categorical and 1 numerical attribute ==> k-prototypes
-            # get the numerical and the categorical attributes
-            # get the numerical attributes and categorical attributes from tissues_attriubte_df according to the attribute types
-            position_of_cat_attr = 0
-            if attribute0["type"] == "number":
-                position_of_cat_attr = 1
-            num_clusters = 2
-            clusterer = KPrototypes(n_clusters=num_clusters, init='Cao', n_init=1, verbose=2)
-            clusterer.fit(tissues_attribute_df, categorical=[position_of_cat_attr])
+            # find the positions of the categorical attributes
+            position_of_cat_attr = []
+            for i in range(len(attributes)):
+                if attributes[i]["type"] == "categorical":
+                    position_of_cat_attr.append(i)
+            if number_of_clusters == 0:
+                # determine the optimal number of clusters with the elbow method:
+                number_of_clusters = k_ellbow(tissues_attribute_df, range(2, 20), K_PROTOTYPES, position_of_cat_attr)
+            clusterer = KPrototypes(n_clusters=number_of_clusters, init='Cao', n_init=1, verbose=2)
+            clusterer.fit(tissues_attribute_df, categorical=position_of_cat_attr)
             # Get cluster labels
             labels = clusterer.labels_
        elif cluster_method == K_MODES:
@@ -1136,6 +1150,7 @@ def create_automatically():
         abort(400, error)
 
 
+# saved this just before for experimenting with attributes array
 # def create_automatically():
 #     # error msg is wrong, it is based on the cohortData route
diff --git a/src/Taskview/tasks/Filter.ts b/src/Taskview/tasks/Filter.ts
index 7c1bc88..c48cb58 100644
--- a/src/Taskview/tasks/Filter.ts
+++ b/src/Taskview/tasks/Filter.ts
@@ -162,9 +162,12 @@ export class Filter extends ATask {
     this.controls.insertAdjacentHTML(
       'afterbegin',
       `
-      <div>
-        <button type="button" class="createAutomaticallyBtn">Create automatically</button>
-      </div>
+      <div>
+        <button type="button" class="createAutomaticallyBtn">Create automatically</button>
+        <label>Number of clusters:</label>
+        <input type="number" class="clusters"/>
+        <button type="button" class="createAutomaticallyWithNumberOfClustersBtn">Create automatically with number of clusters</button>
+      </div>
      `,
    );
 
@@ -172,7 +175,14 @@
    select(this.controls)
      .select('button.createAutomaticallyBtn')
      .on('click', () => {
        console.log("createAutomaticallyBtn clicked");
-       this.createAutomatically();
+       this.createAutomatically(false);
+     });
+
+   select(this.controls)
+     .select('button.createAutomaticallyWithNumberOfClustersBtn')
+     .on('click', () => {
+       console.log("createAutomaticallyWithNumberOfClustersBtn clicked");
+       this.createAutomatically(true);
      });
  }
@@ -183,12 +193,12 @@
    // todo: implement
    let numberOfClusters = 0;
 
-   // if (useNumberOfClusters) {
-   //   // select the bins field
-   //   // binsCount = (this.controls.querySelector('#split input.bins') as HTMLInputElement).valueAsNumber;
-   //   numberOfClusters = (this.controls.querySelector(`#split #recommendSplitControls input.clusters`) as HTMLInputElement).valueAsNumber;
-   //   console.log("numberOfClusters", numberOfClusters);
-   // }
+   if (useNumberOfClusters) {
+     // read the user-defined number of clusters from the input field
+     // let controls = this.controls;
+     numberOfClusters = (this.controls.querySelector(`.controls input.clusters`) as HTMLInputElement).valueAsNumber;
+     console.log("numberOfClusters", numberOfClusters);
+   }
 
    let newCohortIds = [];
    let attributesMapped = this.attributes.map((attr) => {return {dataKey: attr.dataKey, type: attr.type}});
@@ -205,22 +215,22 @@
      console.log("createAutomatically scatterplot data", newCohortIds);
    }
    // TODO: create the cohorts and show them
-   //
-   // let cohortDescs: INewCohortDesc[];
-   // cohortDescs = [];
-   // // for every selected cohort
-   // for (const cohort of this.cohorts) {
-   //   // for every newCohort create a filter (for now... the filter is actually not needed, will be changed in the future)
-   //   for (const newCohort of newCohortIds){
-   //     cohortDescs.push({
-   //       cohort: cohort,
-   //       newCohortId: newCohort,
-   //       attr:[this.attributes[0], this.attributes[1]]
-   //     });
-   //   }
-   // }
-   //
-   // this.container.dispatchEvent(new AutoSplitEvent(cohortDescs));
+
+   let cohortDescs: INewCohortDesc[];
+   cohortDescs = [];
+   // for every selected cohort
+   for (const cohort of this.cohorts) {
+     // for every newCohort create a filter (for now... the filter is actually not needed, will be changed in the future)
+     for (const newCohort of newCohortIds){
+       cohortDescs.push({
+         cohort: cohort,
+         newCohortId: newCohort,
+         attr:[this.attributes[0], this.attributes[1]]
+       });
+     }
+   }
+
+   this.container.dispatchEvent(new AutoSplitEvent(cohortDescs));
  }
diff --git a/src/Taskview/visualizations/GroupedBoxplot.ts b/src/Taskview/visualizations/GroupedBoxplot.ts
index d1ac15d..0444df8 100644
--- a/src/Taskview/visualizations/GroupedBoxplot.ts
+++ b/src/Taskview/visualizations/GroupedBoxplot.ts
@@ -242,18 +242,37 @@ export class GroupedBoxplot extends MultiAttributeVisualization {
    this.container.dispatchEvent(new SplitEvent(filterDescs));
  }
 
- async createAutomatically() {
+ async createAutomatically(useNumberOfClusters: boolean = false) {
    console.log("createAutomatically GroupedBoxplot");
+   let numberOfClusters = 0;
+   if (useNumberOfClusters) {
+     // select the bins field
+     // binsCount = (this.controls.querySelector('#split input.bins') as HTMLInputElement).valueAsNumber;
+     numberOfClusters = (this.controls.querySelector(`#split #recommendSplitControls input.clusters`) as HTMLInputElement).valueAsNumber;
+     console.log("numberOfClusters", numberOfClusters);
+   }
+   let newCohortIds = [];
+   let attributesMapped = this.attributes.map((attr) => {return {dataKey: attr.dataKey, type: attr.type}});
+   // convert the mapped attributes to a JSON string
+   let attributesParam: string = JSON.stringify(attributesMapped);
    for (const cht of this.cohorts) {
+     // const params: ICohortMultiAttrDBDataParams = {
+     //   cohortId: cht.dbId,
+     //   attribute0: this.attributes[0].dataKey,
+     //   attribute0type: this.attributes[0].type,
+     //   attribute1: this.attributes[1].dataKey,
+     //   attribute1type: this.attributes[1].type,
+     //   numberOfClusters: numberOfClusters,
+     // };
+
      const params: ICohortMultiAttrDBDataParams = {
        cohortId: cht.dbId,
-       attribute0: this.attributes[0].dataKey,
-       attribute0type: this.attributes[0].type,
-       attribute1: this.attributes[1].dataKey,
-       attribute1type: this.attributes[1].type
+       attributes: attributesParam,
+       numberOfClusters: numberOfClusters,
      };
+     newCohortIds = await createDBCohortAutomatically(params);
      console.log("createAutomatically scatterplot data", newCohortIds);
    }
diff --git a/src/Taskview/visualizations/MultiAttributeVisualization.ts b/src/Taskview/visualizations/MultiAttributeVisualization.ts
index eaacf5b..ccd14f8 100644
--- a/src/Taskview/visualizations/MultiAttributeVisualization.ts
+++ b/src/Taskview/visualizations/MultiAttributeVisualization.ts
@@ -155,11 +155,11 @@ export abstract class MultiAttributeVisualization extends AVegaVisualization {
      `beforeend`,
      `
- + - +
`,
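
Note on the elbow heuristic used by k_ellbow(): it can be sketched standalone roughly as below. This is an illustrative sketch only — the function name, the synthetic data, and the 0.1 threshold are placeholders, and unlike k_ellbow it advances to the candidate k reached by the last substantial drop (index i + 1) rather than the index of the drop itself.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs


def pick_k_by_elbow(X, k_candidates=range(2, 10), threshold=0.1):
    # Inertia: sum of squared distances of the samples to their closest centroid.
    inertias = [KMeans(n_clusters=k, n_init='auto').fit(X).inertia_ for k in k_candidates]

    # First differences of the inertia curve: how much one additional cluster
    # still reduces the within-cluster sum of squares.
    drops = np.diff(inertias)

    # Fallback if the curve never flattens: keep the largest candidate.
    elbow_index = len(inertias) - 1
    for i in range(len(drops) - 1):
        # Once the next drop shrinks below `threshold` times the previous one,
        # adding more clusters stops paying off.
        if drops[i + 1] / drops[i] < threshold:
            elbow_index = i + 1
            break
    return list(k_candidates)[elbow_index]


X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.8, random_state=0)
print(pick_k_by_elbow(X))  # typically prints 4 for these well-separated blobs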
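The K_PROTOTYPES branch applies the same idea to mixed numeric/categorical data, with clusterer.cost_ playing the role of the k-means inertia. A hedged usage sketch — the toy DataFrame, column names, and candidate range are made up, and the kmodes package (which provides KPrototypes) is assumed to be installed:

import pandas as pd
from kmodes.kprototypes import KPrototypes

# Toy data: one numeric and one categorical attribute.
df = pd.DataFrame({
    "age": [34, 36, 70, 68, 52, 49],
    "tumor_type": ["a", "a", "b", "b", "c", "c"],
})
position_of_cat_attr = [1]  # column indices of the categorical attributes

costs = []
for k in range(2, 4):
    clusterer = KPrototypes(n_clusters=k, init='Cao', n_init=1)
    clusterer.fit(df, categorical=position_of_cat_attr)
    # cost_ is the summed distance of all points to their cluster prototypes
    # (numeric plus categorical dissimilarity), analogous to KMeans.inertia_.
    costs.append(clusterer.cost_)

print(costs)  # the np.diff / change-ratio test from the sketch above can be applied to this curve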