diff --git a/Replicating_Phenomizer_Results/Replicating_Phenomizer_Results.ipynb b/Replicating_Phenomizer_Results/Replicating_Phenomizer_Results.ipynb
new file mode 100644
index 0000000..8ff6363
--- /dev/null
+++ b/Replicating_Phenomizer_Results/Replicating_Phenomizer_Results.ipynb
@@ -0,0 +1,3289 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# The goal of this notebook is to replicate Phenomizer results in the paper \"Wikidata as a FAIR knowledge graph for the life sciences\"\n",
+ "Notes:
\n",
+ "1) Set up a Python virtual environment and make sure the required software is installed. You can do this in a terminal by running `pip install -r requirements.txt`
\n",
+ "* Wikidata integrator: https://github.com/SuLab/WikidataIntegrator
\n",
+ "* Requirements: https://github.com/SuLab/Wikidata-phenomizer/blob/master/Replicating_Phenomizer_Results/requirements.txt
\n",
+ "\n",
+ "2) Download \"Wikidata_phenomizer_input_modifier.py\" from github link:
\n",
+ "* Python script: https://github.com/SuLab/Wikidata-phenomizer/blob/master/Replicating_Phenomizer_Results/Wikidata_phenomizer_input_modifier.py
\n",
+ "\n",
+ "\n",
+ "3) Install BOQA: https://github.com/sulab/boqa
\n",
+ "\n",
+ "4) Download hpo.obo files (you can do that in this notebook)
\n",
+ "5) Download phenotype_annotation.tab files (you can do this in this notebook)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### The following steps are done to manipulate the data for BOQA analysis:\n",
+ "* Download `hpo.obo` and `phenotype_annotation.tab` files.\n",
+ "* Generate `phenotype_annotation_wd.tab` files (essentially `phenotype_annotation.tab` files with wikidata items appended at the end).\n",
+ "* Rename each file `DBname` to those found in build 1266. (Makes analysis more consistent).\n",
+ "* Run BOQA\n",
+ "* Extract text from BOQA\n",
+ "* Graph!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Import libraries required."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Downloading the annotation_files and obo_files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the directory that will hold the annotation files, then work inside it.\n",
+ "# os.makedirs(..., exist_ok=True) is used instead of `!mkdir`, which reports an error\n",
+ "# when the directory already exists (for example on a second run of the notebook).\n",
+ "os.makedirs(\"_annotation_files\", exist_ok=True)\n",
+ "os.chdir(\"_annotation_files\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Download `phenotype_annotation.tab` builds from the Human Phenotype Ontology Jenkins server."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Fetch every listed `phenotype_annotation.tab` build from Jenkins.\n",
+ "phenotype_annotation_ls = ['1266','1265','1264','1263','1262',\n",
+ "                           '1259','1254','1252','1249','1248',\n",
+ "                           '1246','1241','1239','1237']\n",
+ "\n",
+ "for build in phenotype_annotation_ls:\n",
+ "    # The build number is the only part of the artifact URL that varies.\n",
+ "    fileName = \"http://compbio.charite.de/jenkins/job/hpo.annotations/\" + build +\"/artifact/misc/phenotype_annotation.tab\"\n",
+ "    reName = 'pa_b' + build + \".tab\"\n",
+ "    # wget always saves as phenotype_annotation.tab, so rename right after each\n",
+ "    # download to keep one file per build.\n",
+ "    !wget $fileName -q\n",
+ "    !mv phenotype_annotation.tab $reName"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Change out (up) of this directory, create a new directory, and change into (down) that directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Return to the parent directory, then create and enter the folder for the ontology files.\n",
+ "os.chdir(\"..\")\n",
+ "# exist_ok=True keeps this cell re-runnable; `!mkdir` reports an error when the folder exists.\n",
+ "os.makedirs(\"_obo_files\", exist_ok=True)\n",
+ "os.chdir(\"_obo_files\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Download old `hpo.obo` builds from Bioportal into the newly created folder."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# From BioPortal, download one `hp.obo` submission per annotation build.\n",
+ "obo_ls = ['577','576','575','574','573',\n",
+ "          '572','571','570','569','568',\n",
+ "          '567','566','564', '563']\n",
+ "\n",
+ "# NOTE(review): this API key is committed in the notebook. Prefer supplying your own\n",
+ "# through the BIOPORTAL_API_KEY environment variable; the literal is kept only as a\n",
+ "# fallback so the notebook keeps running as before.\n",
+ "apiKey = os.environ.get(\"BIOPORTAL_API_KEY\", \"8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\")\n",
+ "\n",
+ "# obo_ls and phenotype_annotation_ls are parallel lists: the i-th submission is saved\n",
+ "# under the i-th annotation build number so both file sets share one naming scheme.\n",
+ "for submission, build in zip(obo_ls, phenotype_annotation_ls):\n",
+ "    fileName = \"http://data.bioontology.org/ontologies/HP/submissions/\" + submission + \"/download?apikey=\" + apiKey\n",
+ "    reName = 'hp_b' + build + \".obo\"\n",
+ "    # Quote both arguments: '?' is a shell glob character, so the unquoted URL and\n",
+ "    # file name could be rewritten by the shell before wget/mv see them.\n",
+ "    !wget -q \"$fileName\"\n",
+ "    # wget names the file after the last URL segment, query string included.\n",
+ "    !mv \"download?apikey=$apiKey\" \"$reName\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Fix phenotype_annotation.tab builds 1248, 1249, 1252 and 1254 because they have extra columns..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Builds 1248, 1249, 1252 and 1254 have 15 columns instead of 14 and are repaired below.\n",
+ "# Step up one level and into the sibling _annotation_files directory in a single move.\n",
+ "os.chdir(os.path.join(\"..\", \"_annotation_files\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (3,7,11,14) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " interactivity=interactivity, compiler=compiler, result=result)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load each 15-column build into a DataFrame, keyed by its build tag.\n",
+ "\n",
+ "toFix = ['b1248', 'b1249', 'b1252', 'b1254']\n",
+ "\n",
+ "# The files have no header row. low_memory=False makes pandas infer each column's\n",
+ "# dtype from the whole file, which avoids the mixed-type DtypeWarning raised when\n",
+ "# dtypes are guessed chunk by chunk.\n",
+ "toFix_dict = {\n",
+ "    item: pd.read_csv(\"pa_\" + item + \".tab\", header=None, sep=\"\\t\", low_memory=False)\n",
+ "    for item in toFix\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#*If you want to visualize the dataframe before column switching, convert this from markdown → code.*
\n",
+ "`for item in toFix:\n",
+ " display(toFix_dict[item].head())`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 12 13 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 12 13 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 12 13 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " 12 | \n",
+ " 13 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " 2013-05-29 | \n",
+ " HPO:skoehler | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 12 13 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Show the first rows of every loaded build.\n",
+ "for build_tag in toFix:\n",
+ "    preview = toFix_dict[build_tag].head()\n",
+ "    display(preview)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge column 13 and column 12 into one 'cat' column formatted as \"<col 13>[<col 12>]\",\n",
+ "# then keep columns 0-11, 'cat' and 14, in that order (columns 12 and 13 are dropped).\n",
+ "keptColumns = list(range(12)) + ['cat', 14]\n",
+ "for item in toFix:\n",
+ "    original = toFix_dict[item]\n",
+ "    # assign() returns a new frame, so the loaded frame itself is left untouched.\n",
+ "    merged = original.assign(cat=original[13] + \"[\" + original[12] + \"]\")\n",
+ "    toFix_dict[item] = merged[keptColumns]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#*If you want to visualize the dataframe after column switching.*
\n",
+ "\n",
+ "` for item in toFix:\n",
+ " display(toFix_dict[item].head()) `"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " cat | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 cat 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " cat | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 cat 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " cat | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 cat 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 11 | \n",
+ " cat | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0000252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001249 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001250 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001252 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DECIPHER | \n",
+ " 1 | \n",
+ " Wolf-Hirschhorn Syndrome | \n",
+ " NaN | \n",
+ " HP:0001518 | \n",
+ " DECIPHER:1 | \n",
+ " IEA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " O | \n",
+ " WOLF-HIRSCHHORN SYNDROME | \n",
+ " HPO:skoehler[2013-05-29] | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5 6 \\\n",
+ "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n",
+ "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n",
+ "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n",
+ "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n",
+ "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n",
+ "\n",
+ " 7 8 9 10 11 cat 14 \n",
+ "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n",
+ "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Show the first rows of every build as currently stored in toFix_dict.\n",
+ "for build_tag in toFix:\n",
+ "    frame = toFix_dict[build_tag]\n",
+ "    display(frame.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write each repaired build back over its original tab-separated file.\n",
+ "for item in toFix:\n",
+ "    df = toFix_dict[item]\n",
+ "    # index=False: otherwise pandas writes the row index as an extra leading column,\n",
+ "    # which would shift every field and bring the file back to 15 columns.\n",
+ "    # header=False: the source files have no header row, so none is written.\n",
+ "    df.to_csv(\"pa_\" + item + \".tab\", sep='\\t', header=False, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Congrats! You've downloaded both datasets and fixed the files that had too many columns. \n",
+ "Next we'll explore utilizing Phenomizer. Phenomizer (Su Lab) is a program that takes a `phenotype_annotation.tab` file, queries wikidata for all OMIM entries and appends them to the phenotype_annotation, and returns the file as a `phenotype_annotation_wd.tab` file."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Download `Wikidata_phenomizer_input_modifier.py` from GitHub into your `_annotation_files` folder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2019-10-09 17:52:08-- https://raw.githubusercontent.com/turoger/Wikidata-phenomizer/master/Replicating_Phenomizer_Results/Wikidata_phenomizer_input_modifier.py\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.196.133\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.196.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 3742 (3.7K) [text/plain]\n",
+ "Saving to: ‘Wikidata_phenomizer_input_modifier.py’\n",
+ "\n",
+ "Wikidata_phenomizer 100%[===================>] 3.65K --.-KB/s in 0s \n",
+ "\n",
+ "2019-10-09 17:52:09 (58.0 MB/s) - ‘Wikidata_phenomizer_input_modifier.py’ saved [3742/3742]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "!wget https://raw.githubusercontent.com/turoger/Wikidata-phenomizer/master/Replicating_Phenomizer_Results/Wikidata_phenomizer_input_modifier.py"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/_annotation_files\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pwd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Run phenomizer to generate wikidata + HPO `_wd.tab` files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 131600\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3200\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "NGLY1-deficiency 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 146171\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3173\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 160275\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 274\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 68\n",
+ "lung cancer 28\n",
+ "Parkinson disease 20\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 11\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 172366\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 285\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG 68\n",
+ "lung cancer 28\n",
+ "Parkinson disease 20\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 11\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "LUNG CANCER 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 150177\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 252\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 143508\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3175\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 158239\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 274\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG 68\n",
+ "lung cancer 28\n",
+ "Parkinson disease 20\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 11\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 165700\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 280\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 68\n",
+ "lung cancer 28\n",
+ "Parkinson disease 20\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 11\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "LUNG CANCER 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 145453\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3175\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 12\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 151695\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 259\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 12\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 159162\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 283\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG 68\n",
+ "lung cancer 28\n",
+ "Parkinson disease 20\n",
+ "type 2 diabetes mellitus 12\n",
+ "BEHCET SYNDROME 11\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 129155\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3253\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "#615273 CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG;;CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 63\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 128603\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3253\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "#615273 CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG;;CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 63\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "#154700 MARFAN SYNDROME; MFS;;MARFAN SYNDROME, TYPE I; MFS1 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n",
+ "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
+ "of pandas will change to not sort by default.\n",
+ "\n",
+ "To accept the future behavior, pass 'sort=False'.\n",
+ "\n",
+ "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
+ "\n",
+ " sort=sort)\n",
+ "number of hpo annotations: 147417\n",
+ "number of wikidata annotations: 418\n",
+ "number overlap annotations: 3177\n",
+ "\n",
+ "top unique disease-phenotypes in wd:\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n",
+ "lung cancer 28\n",
+ "Parkinson disease 24\n",
+ "BEHCET SYNDROME 12\n",
+ "type 2 diabetes mellitus 12\n",
+ "MAJOR AFFECTIVE DISORDER 2 7\n",
+ "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n",
+ "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n",
+ "MAJOR AFFECTIVE DISORDER 1 7\n",
+ "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n",
+ "Name: DB_Name, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Run the modifier on each original annotation file. Files ending in `_wd.tab`\n",
+    "# are this step's own output, so skip them to keep the cell safe to re-run.\n",
+    "for item in os.listdir():\n",
+    "    if item.endswith(\".tab\") and not item.endswith(\"_wd.tab\"):\n",
+    "        !python3 Wikidata_phenomizer_input_modifier.py $item"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pa_b1237_wd.tab pa_b1248_wd.tab pa_b1259_wd.tab pa_b1265_wd.tab\r\n",
+ "pa_b1239_wd.tab pa_b1249_wd.tab pa_b1262_wd.tab pa_b1266_wd.tab\r\n",
+ "pa_b1241_wd.tab pa_b1252_wd.tab pa_b1263_wd.tab\r\n",
+ "pa_b1246_wd.tab pa_b1254_wd.tab pa_b1264_wd.tab\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!ls *_wd.tab"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "#### Next, change all disease names in each file to those used in build 1266. If you don't do this, the naming is inconsistent between builds, which gives erroneous results in BOQA."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Column labels\n",
+ "tab_colnames = ['DB', 'DB_Object_ID', 'DB_Name', 'Qualifier', 'HPO_ID', 'DB_Reference',\n",
+ " 'Evidence_Code', 'Onset modifier', 'Frequency', 'Sex', 'Modifier',\n",
+ " 'Aspect', 'Date_Created', 'Assigned_By']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Load the annotation files from the working directory, keyed by file name:\n",
+    "#   unmod_* hold the original files, wdmod_* hold the `_wd.tab` files.\n",
+    "# disease_label keeps, per original file, a DB_Reference -> DB_Name lookup.\n",
+    "unmod_ls = []\n",
+    "unmod_dict = dict()\n",
+    "disease_label = dict()\n",
+    "\n",
+    "wdmod_ls = []\n",
+    "wdmod_dict = dict()\n",
+    "\n",
+    "for item in os.listdir():\n",
+    "    # Only files whose name starts with \"pa\" are loaded.\n",
+    "    if not item.startswith(\"pa\"):\n",
+    "        continue\n",
+    "\n",
+    "    if item.endswith(\"_wd.tab\"):\n",
+    "        wdmod_ls.append(item)\n",
+    "        value = pd.read_csv(item, delimiter=\"\\t\", names=tab_colnames, dtype=str)\n",
+    "        wdmod_dict[item] = value\n",
+    "    else:\n",
+    "        unmod_ls.append(item)\n",
+    "        value = pd.read_csv(item, delimiter=\"\\t\", names=tab_colnames, dtype=str)\n",
+    "        unmod_dict[item] = value\n",
+    "        disease_label[item] = dict(zip(value[\"DB_Reference\"], value[\"DB_Name\"]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Relabel every loaded file with the disease names used in build 1266.\n",
+    "# Each row's DB_Reference is looked up in the `pa_b1266.tab` label table; rows\n",
+    "# without a match keep their current DB_Name. The frames are updated in place\n",
+    "# (a helper column, DB_Name_hpo, is added) and stored in the b1266_* dictionaries.\n",
+    "\n",
+    "b1266_unmod_dict = dict()\n",
+    "b1266_wdmod_dict = dict()\n",
+    "\n",
+    "for file_names, loaded, relabelled in (\n",
+    "    (unmod_ls, unmod_dict, b1266_unmod_dict),\n",
+    "    (wdmod_ls, wdmod_dict, b1266_wdmod_dict),\n",
+    "):\n",
+    "    for item in file_names:\n",
+    "        df = loaded[item]\n",
+    "        df['DB_Name_hpo'] = df['DB_Reference'].map(disease_label[\"pa_b1266.tab\"].get)\n",
+    "        df['DB_Name'] = df['DB_Name_hpo'].combine_first(df['DB_Name'])\n",
+    "        relabelled[item] = df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Write the relabelled frames back over the files they were read from\n",
+    "# (tab-separated, no header, no index), keeping only the tab_colnames columns.\n",
+    "\n",
+    "for file_names, relabelled in (\n",
+    "    (unmod_ls, b1266_unmod_dict),\n",
+    "    (wdmod_ls, b1266_wdmod_dict),\n",
+    "):\n",
+    "    for item in file_names:\n",
+    "        df = relabelled[item]\n",
+    "        df[tab_colnames].to_csv(item, sep=\"\\t\", header=False, index=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Run BOQA on unmodified and modified files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/rogertu/Phenomizer/boqa\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Run BOQA for each set\n",
+ "hpo_ids=\"HP:0001263,HP:0001252,HP:0000522,HP:0012804,HP:0000559,HP:0011968,HP:0009830,HP:0001265,HP:0002167,HP:0000970,HP:0040129\"\n",
+ "os.chdir(\"/home/rogertu/Phenomizer/boqa/\") # Note, you need to change into the directory you installed boqa in\n",
+ "!pwd\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Map each original annotation file (full path) to the .obo file of the same build.\n",
+    "obo_Path = \"/home/rogertu/Phenomizer/_obo_files/\" # Note, you need to pick where your obo_files are\n",
+    "ann_Path = \"/home/rogertu/Phenomizer/_annotation_files/\" # Note, you need to pick where your annotation_tab files are\n",
+    "annobo_Dict = dict()\n",
+    "\n",
+    "for fname in os.listdir(ann_Path):\n",
+    "    # Skip anything that is not a \"pa\" file, and skip the `_wd.tab` files.\n",
+    "    if not fname.startswith(\"pa\") or fname.endswith(\"_wd.tab\"):\n",
+    "        continue\n",
+    "\n",
+    "    splitted = fname.split(\".\")\n",
+    "    name = splitted[0]\n",
+    "    # name[3:8] is the build tag, e.g. \"b1266\" in \"pa_b1266\".\n",
+    "    annobo_Key = ann_Path + fname\n",
+    "    annobo_Val = obo_Path + \"hp_\" + name[3:8] + \".obo\"\n",
+    "    annobo_Dict[annobo_Key] = annobo_Val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "#*Check that your dictionary is mapped properly. The build IDs in each `ann_Path:obo_Path` pair should match.*\n",
+ "\n",
+ "```annobo_Dict```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Map each `_wd.tab` annotation file (full path) to the .obo file of the same build.\n",
+    "annobo_Dict2 = dict()\n",
+    "\n",
+    "for fname in os.listdir(ann_Path):\n",
+    "    if not fname.endswith(\"_wd.tab\"):\n",
+    "        continue\n",
+    "\n",
+    "    splitted = fname.split(\".\")\n",
+    "    name = splitted[0]\n",
+    "    # name[3:8] is the build tag, e.g. \"b1266\" in \"pa_b1266_wd\".\n",
+    "    annobo_Key = ann_Path + fname\n",
+    "    annobo_Val = obo_Path + \"hp_\" + name[3:8] + \".obo\"\n",
+    "    annobo_Dict2[annobo_Key] = annobo_Val"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "#*Check that your dictionary is mapped properly. The build IDs in each `ann_Path:obo_Path` pair of `annobo_Dict2` should match.*\n",
+ "\n",
+ "```annobo_Dict2```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "#### Run BOQA on the unmodified and Wikidata-modified files, appending standard output and standard error to a text file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pa_b1237.tab\n",
+ "pa_b1239.tab\n",
+ "pa_b1241.tab\n",
+ "pa_b1246.tab\n",
+ "pa_b1248.tab\n",
+ "pa_b1249.tab\n",
+ "pa_b1252.tab\n",
+ "pa_b1254.tab\n",
+ "pa_b1259.tab\n",
+ "pa_b1262.tab\n",
+ "pa_b1263.tab\n",
+ "pa_b1264.tab\n",
+ "pa_b1265.tab\n",
+ "pa_b1266.tab\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Run BOQA on every unmodified annotation file; stdout and stderr are collected in unmod.txt.\n",
+    "directory = ann_Path\n",
+    "unmod_out = os.path.join(ann_Path, \"unmod.txt\")  # results file, written next to the annotation files\n",
+    "\n",
+    "# The command below appends (`&>>`), so empty the results file first; otherwise\n",
+    "# re-running this cell would add a second copy of every build's output.\n",
+    "with open(unmod_out, \"w\"):\n",
+    "    pass\n",
+    "\n",
+    "# sorted() keeps the files, and therefore their output in unmod.txt, in name order.\n",
+    "for filename in sorted(os.listdir(directory)):\n",
+    "\n",
+    "    if filename.startswith(\"pa\") and not filename.endswith(\"_wd.tab\"):\n",
+    "        print(filename)\n",
+    "        filePath = os.path.join(directory, filename)\n",
+    "        filePathStr = ('\"'+filePath+'\"')  # Wrap the path in double quotes for the shell\n",
+    "        hp_obo = annobo_Dict[filePath]  # .obo file mapped to this annotation file\n",
+    "\n",
+    "        runThis = !java -jar target/boqa-0.0.3-SNAPSHOT.jar -hpo {hpo_ids} -obo {hp_obo} -af {filePathStr} -n 10 &>> {unmod_out}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pa_b1237_wd.tab\n",
+ "pa_b1239_wd.tab\n",
+ "pa_b1241_wd.tab\n",
+ "pa_b1246_wd.tab\n",
+ "pa_b1248_wd.tab\n",
+ "pa_b1249_wd.tab\n",
+ "pa_b1252_wd.tab\n",
+ "pa_b1254_wd.tab\n",
+ "pa_b1259_wd.tab\n",
+ "pa_b1262_wd.tab\n",
+ "pa_b1263_wd.tab\n",
+ "pa_b1264_wd.tab\n",
+ "pa_b1265_wd.tab\n",
+ "pa_b1266_wd.tab\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Run BOQA on every Wikidata-modified (`_wd.tab`) annotation file; stdout and stderr are collected in wdmod.txt.\n",
+    "directory = ann_Path\n",
+    "wdmod_out = os.path.join(ann_Path, \"wdmod.txt\")  # results file, written next to the annotation files\n",
+    "\n",
+    "# The command below appends (`&>>`), so empty the results file first; otherwise\n",
+    "# re-running this cell would add a second copy of every build's output.\n",
+    "with open(wdmod_out, \"w\"):\n",
+    "    pass\n",
+    "\n",
+    "# sorted() keeps the files, and therefore their output in wdmod.txt, in name order.\n",
+    "for filename in sorted(os.listdir(directory)):\n",
+    "\n",
+    "    if filename.startswith(\"pa\") and filename.endswith(\"_wd.tab\"):\n",
+    "        print(filename)\n",
+    "        filePath = os.path.join(directory, filename)\n",
+    "        filePathStr = ('\"'+filePath+'\"')  # Wrap the path in double quotes for the shell\n",
+    "        hp_obo = annobo_Dict2[filePath]  # .obo file mapped to this annotation file\n",
+    "\n",
+    "        runThis = !java -jar target/boqa-0.0.3-SNAPSHOT.jar -hpo {hpo_ids} -obo {hp_obo} -af {filePathStr} -n 25 &>> {wdmod_out}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Text processing of the BOQA outputs for the raw and the Wikidata-modified phenotype_annotation.tab files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datetime\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.chdir(\"/home/rogertu/Phenomizer/_annotation_files/\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Get indices of where `itemName|score` is located in the unmodified BOQA outputs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[4067,\n",
+ " 8145,\n",
+ " 12229,\n",
+ " 16313,\n",
+ " 20391,\n",
+ " 24469,\n",
+ " 28547,\n",
+ " 30925,\n",
+ " 33303,\n",
+ " 35733,\n",
+ " 38355,\n",
+ " 40793,\n",
+ " 43223,\n",
+ " 45557]"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Record the line number of every `itemName|score` header line in unmod.txt.\n",
+    "itemName_indices = []\n",
+    "\n",
+    "# The `with` block closes the file as soon as the scan is finished.\n",
+    "with open(\"unmod.txt\", \"r\") as unmod:\n",
+    "    for index, line in enumerate(unmod):\n",
+    "        if line == \"itemName|score\\n\":\n",
+    "            itemName_indices.append(index)\n",
+    "\n",
+    "itemName_indices"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "14"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# length? \n",
+ "len(itemName_indices)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### For every index in `itemName_indices`, search the 10-line window starting at that header line for CDDG. If it is found, store the build name and its score in a dictionary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "b1241\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.13222605766200135\n",
+ "\n",
+ "b1246\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.13163654580120396\n",
+ "\n",
+ "b1248\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.13468054869009333\n",
+ "\n",
+ "b1249\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.14825745555847192\n",
+ "\n",
+ "b1252\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.14473728630088692\n",
+ "\n",
+ "b1254\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.1115729099754877\n",
+ "\n",
+ "b1259\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.10695366023475644\n",
+ "\n",
+ "b1262\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.35724875360237396\n",
+ "\n",
+ "b1263\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.35746508599626475\n",
+ "\n",
+ "b1264\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.3482354901598477\n",
+ "\n",
+ "b1265\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.3458454767346629\n",
+ "\n",
+ "b1266\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.32071617741247055\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Collect the CDDG score reported for each build in the unmodified BOQA output.\n",
+ "with open(\"unmod.txt\", \"r\") as unmod:  # close the handle once the lines are read\n",
+ "    unmod_ls = unmod.readlines()\n",
+ "\n",
+ "build_ls = ['b1237','b1239','b1241','b1246','b1248',\n",
+ "            'b1249','b1252','b1254','b1259','b1262',\n",
+ "            'b1263','b1264','b1265','b1266']\n",
+ "res_un = dict()\n",
+ "\n",
+ "# enumerate() gives each line's true position. list.index(line) returns the FIRST\n",
+ "# identical line, so a score repeated verbatim in a later build would be credited\n",
+ "# to the earlier build; it also rescans the whole list for every line.\n",
+ "for f_ind, line in enumerate(unmod_ls):\n",
+ "    split = line.split(sep=\"|\")  # result rows look like `name|score`\n",
+ "\n",
+ "    if split[0] != \"CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)\":\n",
+ "        continue\n",
+ "\n",
+ "    # The n-th header found in the file is paired with the n-th name in build_ls.\n",
+ "    for header_pos, index in enumerate(itemName_indices):\n",
+ "        withn = range(index, index + 10)  # header line plus the 9 lines after it\n",
+ "\n",
+ "        if f_ind in withn:\n",
+ "            buildName = build_ls[header_pos]\n",
+ "            val = float(split[1].strip('\\n'))  # score text -> float\n",
+ "            res_un.update({buildName: val})\n",
+ "            print(buildName)\n",
+ "            print(line)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Repeat the same steps for the Wikidata-modified BOQA results.\n",
+ "* Create an index for `itemName|score`.\n",
+ "* Look for CDDG, store build and value to a dictionary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Record the 0-based line number of every `itemName|score` header line in wdmod.txt.\n",
+ "with open(\"wdmod.txt\", \"r\") as wdmod:  # context manager closes the handle when done\n",
+ "    itemName_indices2 = [\n",
+ "        index for index, line in enumerate(wdmod)\n",
+ "        if line == \"itemName|score\\n\"\n",
+ "    ]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "14"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(itemName_indices2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "b1237\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.17160357887114772\n",
+ "\n",
+ "b1239\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.17110371199145843\n",
+ "\n",
+ "b1241\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.3356842848618296\n",
+ "\n",
+ "b1246\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.34719783567372703\n",
+ "\n",
+ "b1248\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.37075900318274746\n",
+ "\n",
+ "b1249\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.39493342004768245\n",
+ "\n",
+ "b1252\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.3949371882443358\n",
+ "\n",
+ "b1254\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.3949349034867903\n",
+ "\n",
+ "b1259\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.39493723518214785\n",
+ "\n",
+ "b1262\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.3828069035818584\n",
+ "\n",
+ "b1263\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.38281125834105606\n",
+ "\n",
+ "b1264\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.40537283847675226\n",
+ "\n",
+ "b1265\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.4053704040700968\n",
+ "\n",
+ "b1266\n",
+ "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)|0.40534408678088857\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Collect the CDDG score reported for each build in the Wikidata-modified BOQA output.\n",
+ "with open(\"wdmod.txt\", \"r\") as wdmod:  # close the handle once the lines are read\n",
+ "    wdmod_ls = wdmod.readlines()\n",
+ "\n",
+ "res_wd = dict()\n",
+ "\n",
+ "# enumerate() gives each line's true position. list.index(line) returns the FIRST\n",
+ "# identical line, so a score repeated verbatim in a later build would be credited\n",
+ "# to the earlier build; it also rescans the whole list for every line.\n",
+ "for f_ind, line in enumerate(wdmod_ls):\n",
+ "    split = line.split(sep=\"|\")  # result rows look like `name|score`\n",
+ "\n",
+ "    if split[0] != \"CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)\":\n",
+ "        continue\n",
+ "\n",
+ "    # The n-th header found in the file is paired with the n-th name in build_ls.\n",
+ "    for header_pos, index in enumerate(itemName_indices2):\n",
+ "        withn = range(index, index + 10)  # header line plus the 9 lines after it\n",
+ "\n",
+ "        if f_ind in withn:\n",
+ "            buildName = build_ls[header_pos]\n",
+ "            val = float(split[1].strip('\\n'))  # score text -> float\n",
+ "            res_wd.update({buildName: val})\n",
+ "            print(buildName)\n",
+ "            print(line)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Print both dictionaries to see if the values match"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'b1241': 0.13222605766200135,\n",
+ " 'b1246': 0.13163654580120396,\n",
+ " 'b1248': 0.13468054869009333,\n",
+ " 'b1249': 0.14825745555847192,\n",
+ " 'b1252': 0.14473728630088692,\n",
+ " 'b1254': 0.1115729099754877,\n",
+ " 'b1259': 0.10695366023475644,\n",
+ " 'b1262': 0.35724875360237396,\n",
+ " 'b1263': 0.35746508599626475,\n",
+ " 'b1264': 0.3482354901598477,\n",
+ " 'b1265': 0.3458454767346629,\n",
+ " 'b1266': 0.32071617741247055}"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res_un"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'b1237': 0.17160357887114772,\n",
+ " 'b1239': 0.17110371199145843,\n",
+ " 'b1241': 0.3356842848618296,\n",
+ " 'b1246': 0.34719783567372703,\n",
+ " 'b1248': 0.37075900318274746,\n",
+ " 'b1249': 0.39493342004768245,\n",
+ " 'b1252': 0.3949371882443358,\n",
+ " 'b1254': 0.3949349034867903,\n",
+ " 'b1259': 0.39493723518214785,\n",
+ " 'b1262': 0.3828069035818584,\n",
+ " 'b1263': 0.38281125834105606,\n",
+ " 'b1264': 0.40537283847675226,\n",
+ " 'b1265': 0.4053704040700968,\n",
+ " 'b1266': 0.40534408678088857}"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res_wd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Create date object for builds 1237 to 1266."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Date for each build, listed in the same order as the build names (b1237 ... b1266).\n",
+ "date_ls = pd.to_datetime([\n",
+ "    '2017/03/09', '2017/04/14', '2017/06/30', '2017/10/06', '2017/12/12',\n",
+ "    '2018/01/26', '2018/03/09', '2018/06/13', '2018/07/27', '2018/10/09',\n",
+ "    '2018/12/21', '2019/02/12', '2019/04/15', '2019/06/03',\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatetimeIndex(['2017-03-09', '2017-04-14', '2017-06-30', '2017-10-06',\n",
+ " '2017-12-12', '2018-01-26', '2018-03-09', '2018-06-13',\n",
+ " '2018-07-27', '2018-10-09', '2018-12-21', '2019-02-12',\n",
+ " '2019-04-15', '2019-06-03'],\n",
+ " dtype='datetime64[ns]', freq=None)"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "date_ls"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Convert each dictionary into a list ordered by build, so the lists can be used as columns of a pandas DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scores from the unmodified run, ordered like `build_ls`. A build with no entry\n",
+ "# in `res_un` gets 0 so the list stays aligned with the builds.\n",
+ "res_un_ls = [res_un.get(build, 0) for build in build_ls]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Scores from the Wikidata-modified run, ordered like `build_ls`. A build with no\n",
+ "# entry in `res_wd` gets 0 so the list stays aligned with the builds.\n",
+ "res_wd_ls = [res_wd.get(build, 0) for build in build_ls]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Construct your dataframe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One row per build; the two score lists become the value columns.\n",
+ "data = {\n",
+ "    'build': build_ls,\n",
+ "    'date': date_ls,\n",
+ "    'HPO': res_un_ls,\n",
+ "    'HPO + Wikidata': res_wd_ls,\n",
+ "}\n",
+ "res = pd.DataFrame(data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " build | \n",
+ " date | \n",
+ " HPO | \n",
+ " HPO + Wikidata | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " b1237 | \n",
+ " 2017-03-09 | \n",
+ " 0.000000 | \n",
+ " 0.171604 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " b1239 | \n",
+ " 2017-04-14 | \n",
+ " 0.000000 | \n",
+ " 0.171104 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " b1241 | \n",
+ " 2017-06-30 | \n",
+ " 0.132226 | \n",
+ " 0.335684 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " b1246 | \n",
+ " 2017-10-06 | \n",
+ " 0.131637 | \n",
+ " 0.347198 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " b1248 | \n",
+ " 2017-12-12 | \n",
+ " 0.134681 | \n",
+ " 0.370759 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " b1249 | \n",
+ " 2018-01-26 | \n",
+ " 0.148257 | \n",
+ " 0.394933 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " b1252 | \n",
+ " 2018-03-09 | \n",
+ " 0.144737 | \n",
+ " 0.394937 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " b1254 | \n",
+ " 2018-06-13 | \n",
+ " 0.111573 | \n",
+ " 0.394935 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " b1259 | \n",
+ " 2018-07-27 | \n",
+ " 0.106954 | \n",
+ " 0.394937 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " b1262 | \n",
+ " 2018-10-09 | \n",
+ " 0.357249 | \n",
+ " 0.382807 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " b1263 | \n",
+ " 2018-12-21 | \n",
+ " 0.357465 | \n",
+ " 0.382811 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " b1264 | \n",
+ " 2019-02-12 | \n",
+ " 0.348235 | \n",
+ " 0.405373 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " b1265 | \n",
+ " 2019-04-15 | \n",
+ " 0.345845 | \n",
+ " 0.405370 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " b1266 | \n",
+ " 2019-06-03 | \n",
+ " 0.320716 | \n",
+ " 0.405344 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " build date HPO HPO + Wikidata\n",
+ "0 b1237 2017-03-09 0.000000 0.171604\n",
+ "1 b1239 2017-04-14 0.000000 0.171104\n",
+ "2 b1241 2017-06-30 0.132226 0.335684\n",
+ "3 b1246 2017-10-06 0.131637 0.347198\n",
+ "4 b1248 2017-12-12 0.134681 0.370759\n",
+ "5 b1249 2018-01-26 0.148257 0.394933\n",
+ "6 b1252 2018-03-09 0.144737 0.394937\n",
+ "7 b1254 2018-06-13 0.111573 0.394935\n",
+ "8 b1259 2018-07-27 0.106954 0.394937\n",
+ "9 b1262 2018-10-09 0.357249 0.382807\n",
+ "10 b1263 2018-12-21 0.357465 0.382811\n",
+ "11 b1264 2019-02-12 0.348235 0.405373\n",
+ "12 b1265 2019-04-15 0.345845 0.405370\n",
+ "13 b1266 2019-06-03 0.320716 0.405344"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reshape to long format: one row per (build, date, score column) with its score.\n",
+ "res_melt = res.melt(id_vars=[\"build\", \"date\"], value_vars=[\"HPO\", \"HPO + Wikidata\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " build | \n",
+ " date | \n",
+ " variable | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " b1237 | \n",
+ " 2017-03-09 | \n",
+ " HPO | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " b1239 | \n",
+ " 2017-04-14 | \n",
+ " HPO | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " b1241 | \n",
+ " 2017-06-30 | \n",
+ " HPO | \n",
+ " 0.132226 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " b1246 | \n",
+ " 2017-10-06 | \n",
+ " HPO | \n",
+ " 0.131637 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " b1248 | \n",
+ " 2017-12-12 | \n",
+ " HPO | \n",
+ " 0.134681 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " b1249 | \n",
+ " 2018-01-26 | \n",
+ " HPO | \n",
+ " 0.148257 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " b1252 | \n",
+ " 2018-03-09 | \n",
+ " HPO | \n",
+ " 0.144737 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " b1254 | \n",
+ " 2018-06-13 | \n",
+ " HPO | \n",
+ " 0.111573 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " b1259 | \n",
+ " 2018-07-27 | \n",
+ " HPO | \n",
+ " 0.106954 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " b1262 | \n",
+ " 2018-10-09 | \n",
+ " HPO | \n",
+ " 0.357249 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " b1263 | \n",
+ " 2018-12-21 | \n",
+ " HPO | \n",
+ " 0.357465 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " b1264 | \n",
+ " 2019-02-12 | \n",
+ " HPO | \n",
+ " 0.348235 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " b1265 | \n",
+ " 2019-04-15 | \n",
+ " HPO | \n",
+ " 0.345845 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " b1266 | \n",
+ " 2019-06-03 | \n",
+ " HPO | \n",
+ " 0.320716 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " b1237 | \n",
+ " 2017-03-09 | \n",
+ " HPO + Wikidata | \n",
+ " 0.171604 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " b1239 | \n",
+ " 2017-04-14 | \n",
+ " HPO + Wikidata | \n",
+ " 0.171104 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " b1241 | \n",
+ " 2017-06-30 | \n",
+ " HPO + Wikidata | \n",
+ " 0.335684 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " b1246 | \n",
+ " 2017-10-06 | \n",
+ " HPO + Wikidata | \n",
+ " 0.347198 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " b1248 | \n",
+ " 2017-12-12 | \n",
+ " HPO + Wikidata | \n",
+ " 0.370759 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " b1249 | \n",
+ " 2018-01-26 | \n",
+ " HPO + Wikidata | \n",
+ " 0.394933 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " b1252 | \n",
+ " 2018-03-09 | \n",
+ " HPO + Wikidata | \n",
+ " 0.394937 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " b1254 | \n",
+ " 2018-06-13 | \n",
+ " HPO + Wikidata | \n",
+ " 0.394935 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " b1259 | \n",
+ " 2018-07-27 | \n",
+ " HPO + Wikidata | \n",
+ " 0.394937 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " b1262 | \n",
+ " 2018-10-09 | \n",
+ " HPO + Wikidata | \n",
+ " 0.382807 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " b1263 | \n",
+ " 2018-12-21 | \n",
+ " HPO + Wikidata | \n",
+ " 0.382811 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " b1264 | \n",
+ " 2019-02-12 | \n",
+ " HPO + Wikidata | \n",
+ " 0.405373 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " b1265 | \n",
+ " 2019-04-15 | \n",
+ " HPO + Wikidata | \n",
+ " 0.405370 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " b1266 | \n",
+ " 2019-06-03 | \n",
+ " HPO + Wikidata | \n",
+ " 0.405344 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " build date variable value\n",
+ "0 b1237 2017-03-09 HPO 0.000000\n",
+ "1 b1239 2017-04-14 HPO 0.000000\n",
+ "2 b1241 2017-06-30 HPO 0.132226\n",
+ "3 b1246 2017-10-06 HPO 0.131637\n",
+ "4 b1248 2017-12-12 HPO 0.134681\n",
+ "5 b1249 2018-01-26 HPO 0.148257\n",
+ "6 b1252 2018-03-09 HPO 0.144737\n",
+ "7 b1254 2018-06-13 HPO 0.111573\n",
+ "8 b1259 2018-07-27 HPO 0.106954\n",
+ "9 b1262 2018-10-09 HPO 0.357249\n",
+ "10 b1263 2018-12-21 HPO 0.357465\n",
+ "11 b1264 2019-02-12 HPO 0.348235\n",
+ "12 b1265 2019-04-15 HPO 0.345845\n",
+ "13 b1266 2019-06-03 HPO 0.320716\n",
+ "14 b1237 2017-03-09 HPO + Wikidata 0.171604\n",
+ "15 b1239 2017-04-14 HPO + Wikidata 0.171104\n",
+ "16 b1241 2017-06-30 HPO + Wikidata 0.335684\n",
+ "17 b1246 2017-10-06 HPO + Wikidata 0.347198\n",
+ "18 b1248 2017-12-12 HPO + Wikidata 0.370759\n",
+ "19 b1249 2018-01-26 HPO + Wikidata 0.394933\n",
+ "20 b1252 2018-03-09 HPO + Wikidata 0.394937\n",
+ "21 b1254 2018-06-13 HPO + Wikidata 0.394935\n",
+ "22 b1259 2018-07-27 HPO + Wikidata 0.394937\n",
+ "23 b1262 2018-10-09 HPO + Wikidata 0.382807\n",
+ "24 b1263 2018-12-21 HPO + Wikidata 0.382811\n",
+ "25 b1264 2019-02-12 HPO + Wikidata 0.405373\n",
+ "26 b1265 2019-04-15 HPO + Wikidata 0.405370\n",
+ "27 b1266 2019-06-03 HPO + Wikidata 0.405344"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res_melt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Plot your data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "