diff --git a/Replicating_Phenomizer_Results/Replicating_Phenomizer_Results.ipynb b/Replicating_Phenomizer_Results/Replicating_Phenomizer_Results.ipynb new file mode 100644 index 0000000..8ff6363 --- /dev/null +++ b/Replicating_Phenomizer_Results/Replicating_Phenomizer_Results.ipynb @@ -0,0 +1,3289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The goal of this notebook is to replicate Phenomizer results in the paper \"Wikidata as a FAIR knowledge graph for the life sciences\"\n", + "Notes:
\n", + "1) Set up a Python virtual environment and make sure the required software is installed. You can do this in a terminal by running `pip install -r requirements.txt`
\n", + "* Wikidata integrator: https://github.com/SuLab/WikidataIntegrator
\n", + "* Requirements: https://github.com/SuLab/Wikidata-phenomizer/blob/master/Replicating_Phenomizer_Results/requirements.txt
\n", + "\n", + "2) Download \"Wikidata_phenomizer_input_modifier.py\" from github link:
\n", + "* Python script: https://github.com/SuLab/Wikidata-phenomizer/blob/master/Replicating_Phenomizer_Results/Wikidata_phenomizer_input_modifier.py
\n", + "\n", + "\n", + "3) Install BOQA: https://github.com/sulab/boqa
\n", + "\n", + "4) Download hpo.obo files (you can do that in this notebook)
\n", + "5) Download phenotype_annotation.tab files (you can do this in this notebook)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The following steps are done to manipulate the data for BOQA analysis:\n", + "* Download `hpo.obo` and `phenotype_annotation.tab` files.\n", + "* Generate `phenotype_annotation_wd.tab` files (essentially `phenotype_annotation.tab` files with wikidata items appended at the end).\n", + "* Rename each file `DBname` to those found in build 1266. (Makes analysis more consistent).\n", + "* Run BOQA\n", + "* Extract text from BOQA\n", + "* Graph!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import libraries required." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Downloading the annotation_files and obo_files." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer\r\n" + ] + } + ], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# create a directory to download annotation files to.\n", + "!mkdir _annotation_files\n", + "os.chdir(\"_annotation_files\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Download `phenotype_annotation.tab` builds from the Human Phenotype Ontology Jenkins server." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# From Jenkins, Download all `phenotype_annotation.tab` files\n", + "phenotype_annotation_ls = ['1266','1265','1264','1263','1262',\n", + " '1259','1254','1252','1249','1248',\n", + " '1246','1241','1239','1237']\n", + "\n", + "for a_file in phenotype_annotation_ls:\n", + " curCount = phenotype_annotation_ls.index(a_file) # gets index from the list\n", + " fileName = \"http://compbio.charite.de/jenkins/job/hpo.annotations/\" + a_file +\"/artifact/misc/phenotype_annotation.tab\"\n", + " !wget $fileName -q\n", + " reName = 'pa_b' + phenotype_annotation_ls[curCount] + \".tab\"\n", + " !mv phenotype_annotation.tab $reName" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Change out (up) of this directory, create a new directory, and change into (down) that directory." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Get out of current directory, create a new directory, change into the new directory.\n", + "os.chdir(\"..\")\n", + "!mkdir _obo_files\n", + "os.chdir(\"_obo_files\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Download old `hpo.obo` builds from Bioportal into the newly created folder." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# From Bioportal, download `hp.obo` files\n", + "obo_ls = ['577','576','575','574','573',\n", + " '572','571','570','569','568',\n", + " '567','566','564', '563']\n", + "\n", + "for a_file in obo_ls:\n", + " curCount = obo_ls.index(a_file) # gets index from the list\n", + " fileName = \"http://data.bioontology.org/ontologies/HP/submissions/\" + a_file + \"/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb\"\n", + " !wget $fileName -q\n", + " reName = 'hp_b' + phenotype_annotation_ls[curCount] + \".obo\"\n", + " !mv download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb $reName\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Fix phenotype_annotation.tab builds 1248, 1249, 1252 and 1254 because they have extra columns..." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# fix build 1248, 1249, 1252 and 1254 because it has 15 columns vs 14 (why?!?!)\n", + "os.chdir(\"..\")\n", + "os.chdir(\"_annotation_files\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (3,7,11,14) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], + "source": [ + "# read to a dictionary, manipulate files with pandas\n", + "\n", + "toFix = ['b1248', 'b1249', 'b1252', 'b1254']\n", + "toFix_dict = dict()\n", + "\n", + "for item in toFix:\n", + " key = item\n", + " value = pd.read_csv(\"pa_\" + item + \".tab\", header = None, sep = \"\\t\")\n", + " toFix_dict.update({key:value})\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#*If you want to visualize the dataframe before column switching, convert this from markdown → code.*
\n", + "`for item in toFix:\n", + " display(toFix_dict[item].head())`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011121314
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 12 13 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011121314
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 12 13 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011121314
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 12 13 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011121314
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROME2013-05-29HPO:skoehlerNaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 12 13 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME 2013-05-29 HPO:skoehler NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for item in toFix:\n", + " display(toFix_dict[item].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Concatenate col 13 and 12 as 'cat', remove col 13 and 12, and reorder indices.\n", + "for item in toFix:\n", + " df = toFix_dict[item]\n", + " df['cat'] = df[13] + \"[\" + df[12] + \"]\"\n", + " del df[12]\n", + " del df[13]\n", + " value = df[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,'cat' , 14]]\n", + " toFix_dict.update({item:value})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#*If you want to visualize the dataframe after column switching.*
\n", + "\n", + "` for item in toFix:\n", + " display(toFix_dict[item].head()) `" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011cat14
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 cat 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011cat14
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 cat 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011cat14
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 cat 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01234567891011cat14
0DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0000252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
1DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001249DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
2DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001250DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
3DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001252DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
4DECIPHER1Wolf-Hirschhorn SyndromeNaNHP:0001518DECIPHER:1IEANaNNaNNaNOWOLF-HIRSCHHORN SYNDROMEHPO:skoehler[2013-05-29]NaN
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0000252 DECIPHER:1 IEA \n", + "1 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001249 DECIPHER:1 IEA \n", + "2 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001250 DECIPHER:1 IEA \n", + "3 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001252 DECIPHER:1 IEA \n", + "4 DECIPHER 1 Wolf-Hirschhorn Syndrome NaN HP:0001518 DECIPHER:1 IEA \n", + "\n", + " 7 8 9 10 11 cat 14 \n", + "0 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "1 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "2 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "3 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN \n", + "4 NaN NaN NaN O WOLF-HIRSCHHORN SYNDROME HPO:skoehler[2013-05-29] NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for item in toFix:\n", + " display(toFix_dict[item].head())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Export merged files.\n", + "for item in toFix:\n", + " df = toFix_dict[item]\n", + " df.to_csv(\"pa_\" + item + \".tab\", sep = '\\t', header = None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Congrats! You've downloaded both datasets and fixed the files that had too many columns.\n", + "Next, we'll use Phenomizer. Phenomizer (Su Lab) is a program that takes a `phenotype_annotation.tab` file, queries Wikidata for all OMIM entries, appends them to the phenotype annotations, and returns the result as a `phenotype_annotation_wd.tab` file." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get `Wikidata_phenomizer_input_modifier.py` from github to the folder that holding your `_annotation_files`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2019-10-09 17:52:08-- https://raw.githubusercontent.com/turoger/Wikidata-phenomizer/master/Replicating_Phenomizer_Results/Wikidata_phenomizer_input_modifier.py\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.196.133\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.196.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 3742 (3.7K) [text/plain]\n", + "Saving to: ‘Wikidata_phenomizer_input_modifier.py’\n", + "\n", + "Wikidata_phenomizer 100%[===================>] 3.65K --.-KB/s in 0s \n", + "\n", + "2019-10-09 17:52:09 (58.0 MB/s) - ‘Wikidata_phenomizer_input_modifier.py’ saved [3742/3742]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://raw.githubusercontent.com/turoger/Wikidata-phenomizer/master/Replicating_Phenomizer_Results/Wikidata_phenomizer_input_modifier.py" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/_annotation_files\r\n" + ] + } + ], + "source": [ + "!pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Run phenomizer to generate wikidata + HPO `_wd.tab` files." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 131600\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3200\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "NGLY1-deficiency 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 146171\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3173\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "Name: DB_Name, dtype: int64\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 160275\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 274\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 68\n", + "lung cancer 28\n", + "Parkinson disease 20\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 11\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 172366\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 285\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG 68\n", + "lung cancer 28\n", + "Parkinson disease 20\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 11\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "LUNG CANCER 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 150177\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 252\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 143508\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3175\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 158239\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 274\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG 68\n", + "lung cancer 28\n", + "Parkinson disease 20\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 11\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 165700\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 280\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 68\n", + "lung cancer 28\n", + "Parkinson disease 20\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 11\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "LUNG CANCER 7\n", + "Name: DB_Name, dtype: int64\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 145453\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3175\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 12\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 151695\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 259\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "#615273 CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 12\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 159162\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 283\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG 68\n", + "lung cancer 28\n", + "Parkinson disease 20\n", + "type 2 diabetes mellitus 12\n", + "BEHCET SYNDROME 11\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 129155\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3253\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "#615273 CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG;;CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 63\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "%613065 LEUKEMIA, ACUTE LYMPHOBLASTIC; ALLLEUKEMIA, ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;ALL1, INCLUDED;;LEUKEMIA, ACUTE LYMPHOCYTIC, SUSCEPTIBILITY TO, 1, INCLUDED;;LEUKEMIA, B-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, T-CELL ACUTE LYMPHOBLASTIC, SUSCEPTIBILITY TO, INCLUDED;;LEUKEMIA, ACUTE LYMPHOBLASTIC, B-HYPERDIPLOID, SUSCEPTIBILITY TO,INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "Name: DB_Name, dtype: int64\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 128603\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3253\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "#615273 CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG;;CONGENITAL DISORDER OF GLYCOSYLATION, TYPE Iv; CDG1V;;CDG Iv; CDGIv 63\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "#154700 MARFAN SYNDROME; MFS;;MARFAN SYNDROME, TYPE I; MFS1 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n", + "/home/rogertu/Phenomizer/Phen_env/lib/python3.6/site-packages/pandas/core/frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " sort=sort)\n", + "number of hpo annotations: 147417\n", + "number of wikidata annotations: 418\n", + "number overlap annotations: 3177\n", + "\n", + "top unique disease-phenotypes in wd:\n", + "CONGENITAL DISORDER OF DEGLYCOSYLATION 70\n", + "lung cancer 28\n", + "Parkinson disease 24\n", + "BEHCET SYNDROME 12\n", + "type 2 diabetes mellitus 12\n", + "MAJOR AFFECTIVE DISORDER 2 7\n", + "LYMPHOBLASTIC LEUKEMIA, ACUTE, WITH LYMPHOMATOUS FEATURES 7\n", + "LUNG CANCERALVEOLAR CELL CARCINOMA, INCLUDED 7\n", + "MAJOR AFFECTIVE DISORDER 1 7\n", + "#266600 INFLAMMATORY BOWEL DISEASE 1; IBD1REGIONAL ENTERITIS, INCLUDED;;CROHN DISEASE, INCLUDED;;ULCERATIVE COLITIS, INCLUDED;;CROHN DISEASE-ASSOCIATED GROWTH FAILURE, SUSCEPTIBILITY TO, INCLUDED 7\n", + "Name: DB_Name, dtype: int64\n" + ] + } + ], + "source": [ + "for item in os.listdir():\n", + " if item.endswith(\".tab\"):\n", + " !python3 Wikidata_phenomizer_input_modifier.py $item" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pa_b1237_wd.tab pa_b1248_wd.tab pa_b1259_wd.tab pa_b1265_wd.tab\r\n", + "pa_b1239_wd.tab pa_b1249_wd.tab pa_b1262_wd.tab pa_b1266_wd.tab\r\n", + "pa_b1241_wd.tab pa_b1252_wd.tab pa_b1263_wd.tab\r\n", + "pa_b1246_wd.tab pa_b1254_wd.tab pa_b1264_wd.tab\r\n" + ] + } + ], + "source": [ + "!ls *_wd.tab" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Next, change all names in each file to that of build 1266. If you don't do this, the naming is inconsistent between each build over time, which gives erroneous results in BOQA." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Column labels\n", + "tab_colnames = ['DB', 'DB_Object_ID', 'DB_Name', 'Qualifier', 'HPO_ID', 'DB_Reference',\n", + " 'Evidence_Code', 'Onset modifier', 'Frequency', 'Sex', 'Modifier',\n", + " 'Aspect', 'Date_Created', 'Assigned_By']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Import items into a dictionary for unmodified files and wdmodified files\n", + "unmod_ls = []\n", + "unmod_dict = dict()\n", + "disease_label = dict()\n", + "\n", + "wdmod_ls = []\n", + "wdmod_dict = dict()\n", + "\n", + "for item in os.listdir():\n", + " \n", + " if not item.endswith(\"_wd.tab\") and item.startswith(\"pa\"):\n", + " unmod_ls.append(item)\n", + " value = pd.read_csv(item, delimiter = \"\\t\", names=tab_colnames, dtype=str)\n", + " unmod_dict.update({item:value})\n", + " disease_label.update({item:dict(zip(value.DB_Reference, value.DB_Name))})\n", + " \n", + " elif item.endswith(\"_wd.tab\") and item.startswith(\"pa\"):\n", + " wdmod_ls.append(item)\n", + " value = pd.read_csv(item, delimiter = \"\\t\", names=tab_colnames, dtype=str)\n", + " wdmod_dict.update({item:value})" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Change labels for unmodified and wd modified files to build 1266\n", + "# Takes generated dictionary in previous(pd_dict), and change all names to labels found in b1266.\n", + "\n", + "b1266_unmod_dict = dict()\n", + "b1266_wdmod_dict = dict()\n", + "for item in unmod_ls:\n", + " df = unmod_dict[item]\n", + " df['DB_Name_hpo'] = df.DB_Reference.map(disease_label[\"pa_b1266.tab\"].get)\n", + " df['DB_Name'] = df.DB_Name_hpo.combine_first(df.DB_Name)\n", + " b1266_unmod_dict.update({item:df})\n", + " \n", + "for item in wdmod_ls:\n", + " df = wdmod_dict[item]\n", + " df['DB_Name_hpo'] = 
df.DB_Reference.map(disease_label[\"pa_b1266.tab\"].get)\n", + " df['DB_Name'] = df.DB_Name_hpo.combine_first(df.DB_Name)\n", + " b1266_wdmod_dict.update({item:df})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Export files named appropriately\n", + "\n", + "for item in unmod_ls:\n", + " df = b1266_unmod_dict[item]\n", + " df[tab_colnames].to_csv(item, sep=\"\\t\", header=False, index=False)\n", + " \n", + "for item in wdmod_ls:\n", + " df = b1266_wdmod_dict[item]\n", + " df[tab_colnames].to_csv(item, sep=\"\\t\", header=False, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Run BOQA on unmodified and modified files." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/rogertu/Phenomizer/boqa\r\n" + ] + } + ], + "source": [ + "# Run BOQA for each set\n", + "hpo_ids=\"HP:0001263,HP:0001252,HP:0000522,HP:0012804,HP:0000559,HP:0011968,HP:0009830,HP:0001265,HP:0002167,HP:0000970,HP:0040129\"\n", + "os.chdir(\"/home/rogertu/Phenomizer/boqa/\") # Note, you need to change into the directory you installed boqa in\n", + "!pwd\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# First create adict to map your keys to obo files\n", + "obo_Path = \"/home/rogertu/Phenomizer/_obo_files/\" # Note, you need to pick where your obo_files are\n", + "ann_Path = \"/home/rogertu/Phenomizer/_annotation_files/\" # Note, you need to pick where your annotation_tab files are\n", + "annobo_Dict = dict()\n", + "\n", + "for fname in os.listdir(ann_Path):\n", + " if fname.startswith(\"pa\") and not fname.endswith(\"_wd.tab\"):\n", + " splitted = fname.split(\".\")\n", + " name = splitted[0]\n", + " annobo_Val = obo_Path + \"hp_\" + name[3:8] + \".obo\"\n", + " annobo_Key = ann_Path + fname\n", + " \n", + " 
annobo_Dict.update({annobo_Key:annobo_Val})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#*check if your dictionary is mapped properly. You'd want your `ann_Path:obo_Path` id's to match correspondingly.*
\n", + "\n", + "```annobo_Dict```" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Create dictionary for obo files with \"_wd\"\n", + "annobo_Dict2 = dict()\n", + "\n", + "for fname in os.listdir(ann_Path):\n", + " if fname.endswith(\"_wd.tab\"):\n", + " splitted = fname.split(\".\")\n", + " name = splitted[0]\n", + " annobo_Val = obo_Path + \"hp_\" + name[3:8] +\".obo\"\n", + " annobo_Key = ann_Path + fname\n", + " \n", + "\n", + " annobo_Dict2.update({annobo_Key:annobo_Val})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#*check if your dictionary is mapped properly. You'd want your `ann_Path2:obo_Path` id's to match correspondingly.*
\n", + "\n", + "```annobo_Dict2```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Run BOQA on Unmodified and Wdmodified files, pipe the standard out and error to a text file." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pa_b1237.tab\n", + "pa_b1239.tab\n", + "pa_b1241.tab\n", + "pa_b1246.tab\n", + "pa_b1248.tab\n", + "pa_b1249.tab\n", + "pa_b1252.tab\n", + "pa_b1254.tab\n", + "pa_b1259.tab\n", + "pa_b1262.tab\n", + "pa_b1263.tab\n", + "pa_b1264.tab\n", + "pa_b1265.tab\n", + "pa_b1266.tab\n" + ] + } + ], + "source": [ + "# BOQA Unmodified, pipe output to a file named unmod.txt\n", + "directory = ann_Path\n", + "\n", + "for filename in sorted(os.listdir(directory)):\n", + "\n", + " if filename.startswith(\"pa\") and not filename.endswith(\"_wd.tab\"): \n", + " print(filename)\n", + " filePath = os.path.join(directory, filename)\n", + " filePathStr = ('\"'+filePath+'\"') # Takes path obtained, adds quotes\n", + " hp_obo = (annobo_Dict[filePath]) # Pass a Key into annobo_Dict, recieve a value and add quotes\n", + " \n", + " # make sure you choose the right path to output your result text files.\n", + " runThis = !java -jar target/boqa-0.0.3-SNAPSHOT.jar -hpo {hpo_ids} -obo {hp_obo} -af {filePathStr} -n 10 &>> /home/rogertu/Phenomizer/_annotation_files/unmod.txt\n", + " runThis\n", + " continue\n", + " \n", + " else:\n", + " continue" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pa_b1237_wd.tab\n", + "pa_b1239_wd.tab\n", + "pa_b1241_wd.tab\n", + "pa_b1246_wd.tab\n", + "pa_b1248_wd.tab\n", + "pa_b1249_wd.tab\n", + "pa_b1252_wd.tab\n", + "pa_b1254_wd.tab\n", + "pa_b1259_wd.tab\n", + "pa_b1262_wd.tab\n", + "pa_b1263_wd.tab\n", + "pa_b1264_wd.tab\n", + 
"pa_b1265_wd.tab\n", + "pa_b1266_wd.tab\n" + ] + } + ], + "source": [ + "# BOQA WDmodified, pipe output to a file named wdmod.txt\n", + "directory = ann_Path\n", + "\n", + "for filename in sorted(os.listdir(directory)):\n", + "\n", + " if filename.startswith(\"pa\") and filename.endswith(\"_wd.tab\"): # Note the removal of `not`\n", + " print(filename)\n", + " filePath = os.path.join(directory, filename)\n", + " filePathStr = ('\"'+filePath+'\"') # Takes path obtained, adds quotes\n", + " hp_obo = (annobo_Dict2[filePath]) # Pass a Key into annobo_Dict, recieve a value and add quotes\n", + "\n", + " runThis = !java -jar target/boqa-0.0.3-SNAPSHOT.jar -hpo {hpo_ids} -obo {hp_obo} -af {filePathStr} -n 25 &>> /home/rogertu/Phenomizer/_annotation_files/wdmod.txt\n", + " runThis\n", + " continue\n", + " \n", + " else:\n", + " continue" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Text processing of the BOQA outputs for the raw and the Wikidata-modified phenotype_annotation.tab files." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(\"/home/rogertu/Phenomizer/_annotation_files/\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Get indices of where `itemName|score` is located in the unmodified BOQA outputs." 
# %%
# Collect the 0-based line number of every "itemName|score" header line in
# unmod.txt. The `with` block closes the file once it has been scanned.
with open("unmod.txt", "r") as unmod:
    itemName_indices = [index for index, line in enumerate(unmod)
                        if line == "itemName|score\n"]

itemName_indices

# %%
# length? (the recorded run found 14 headers)
len(itemName_indices)

# %% [markdown]
# #### For every index in `itemName_indices`, look 10 lines down for CDDG. If it exists, add an entry to a dictionary mapping the build to its value.
# %%
# Builds in the order BOQA was run on them (sorted file names).
build_ls = ['b1237','b1239','b1241','b1246','b1248',
            'b1249','b1252','b1254','b1259','b1262',
            'b1263','b1264','b1265','b1266']

# Disease whose score is tracked across builds, exactly as BOQA prints it.
TARGET_DISEASE = "CONGENITAL DISORDER OF DEGLYCOSYLATION; CDDG (OMIM:615273)"

# Line that opens every result section in the BOQA output files.
RESULT_HEADER = "itemName|score\n"


def find_result_headers(lines):
    """Return the 0-based positions of all `itemName|score` lines in `lines`."""
    return [position for position, text in enumerate(lines) if text == RESULT_HEADER]


def extract_target_scores(lines, builds, target=TARGET_DISEASE, window=10):
    """Return {build: score} for `target` from the lines of a BOQA output file.

    lines    all lines of the output file, in order
    builds   build names, one per result section, in the order BOQA was run
    target   item name to look for (the text before the `|`)
    window   number of lines inspected per section, counted from the header
             line itself (so header + `window - 1` following lines)

    A build gets no entry when `target` is not inside its window.
    NOTE(review): the Wikidata run asks BOQA for 25 results while only this
    window is inspected here; confirm that is intended before widening it.

    Raises ValueError when the file holds more result sections than `builds`
    has names, since sections could then no longer be matched to builds.
    """
    headers = find_result_headers(lines)
    if len(headers) > len(builds):
        raise ValueError(
            "found {} result sections but only {} builds; was the output file "
            "written more than once?".format(len(headers), len(builds)))

    scores = dict()
    for buildName, start in zip(builds, headers):
        # Walk the section by position. Looking a line up by its content
        # would return its first occurrence in the file, so a build whose
        # score line is identical to an earlier one would be missed.
        for text in lines[start:start + window]:
            name, _, score = text.partition("|")   # name and % semantic similarity
            if name == target:
                scores.update({buildName: float(score.strip('\n'))})
                print(buildName)
                print(text)
    return scores

# %%
# Scores from the unmodified annotation files.
# (unmod_lines, not unmod_ls: the latter still names the list of input files.)
with open("unmod.txt", "r") as unmod:
    unmod_lines = unmod.readlines()

res_un = extract_target_scores(unmod_lines, build_ls)

# %% [markdown]
# #### Do the same thing for the Wikidata-modified BOQA results.
# * Create an index for `itemName|score`.
# * Look for CDDG, store build and value in a dictionary.

# %%
with open("wdmod.txt", "r") as wdmod:
    wdmod_lines = wdmod.readlines()

itemName_indices2 = find_result_headers(wdmod_lines)

# %%
# Expect one header per build in build_ls.
len(itemName_indices2)

# %%
res_wd = extract_target_scores(wdmod_lines, build_ls)

# %% [markdown]
# #### Print both dictionaries to compare the scores per build

# %%
res_un

# %%
res_wd

# %% [markdown]
# #### Create date object for builds 1237 to 1266.
# %%
# One date per entry of `build_ls` (b1237 ... b1266), in the same order.
date_ls = pd.to_datetime([
    '2017/03/09', '2017/04/14', '2017/06/30', '2017/10/06', '2017/12/12',
    '2018/01/26', '2018/03/09', '2018/06/13', '2018/07/27', '2018/10/09',
    '2018/12/21', '2019/02/12', '2019/04/15', '2019/06/03',
])

# %%
date_ls

# %% [markdown]
# #### Create a list from the dictionaries, that way you can create a pandas dataframe.

# %%
# pushing results from the untransformed and wd into a list to create a pandas df.
# A build with no entry in the dictionary is recorded as 0.
res_un_ls = [res_un.get(name, 0) for name in build_ls]

# %%
res_wd_ls = [res_wd.get(name, 0) for name in build_ls]

# %% [markdown]
# #### Construct your dataframe

# %%
data = {'build': build_ls, 'date': date_ls, 'HPO': res_un_ls, 'HPO + Wikidata': res_wd_ls}
res = pd.DataFrame.from_dict(data)

# %%
res

# %%
# Gotta melt your dataframe: one row per (build, date, variable) so seaborn can
# draw one line per variable.
res_melt = pd.melt(res, id_vars=["build", "date"], value_vars=["HPO", "HPO + Wikidata"])

# %%
res_melt

# %% [markdown]
# ## Plot your data

# %%
# Where the exported figure is written.  Built from the current user's home
# directory instead of a hard-coded /home/<user> path; the directory must exist.
FIGURE_PATH = os.path.join(os.path.expanduser("~"), "Phenomizer", "Sem_sim_scores.svg")


def plot_similarity(scores_long, title="", figsize=None):
    """Draw the score of each `variable` against `date` and return the Axes.

    Parameters
    ----------
    scores_long : pandas.DataFrame
        Long-format table with columns ``date``, ``value`` and ``variable``.
    title : str
        Axes title; empty for no title.
    figsize : tuple of (width, height) in inches, optional
        Size of the figure; ``None`` keeps matplotlib's default.

    Notes
    -----
    `sns` and `plt` are used here as seaborn and matplotlib.pyplot.
    NOTE(review): their import is not visible in this part of the notebook --
    confirm an earlier cell imports them.
    """
    # The figure is created BEFORE plotting so that `figsize` applies to the
    # plot itself.  Calling plt.figure(...) after drawing only opens a second,
    # empty figure and leaves the plotted one at the default size.
    fig, ax = plt.subplots(figsize=figsize)
    sns.lineplot(x="date", y="value", hue="variable", style="variable", data=scores_long,
                 markers=True, dashes=False, palette="binary", ax=ax)

    # NOTE(review): the label says "(%)" but the plotted values lie between 0
    # and 0.5 -- confirm whether the unit should be a fraction.
    ax.set(ylim=(0, 0.5), xlabel='', ylabel='Semantic Similarity (%)', title=title)
    ax.tick_params(axis="x", labelrotation=45)

    # Removes Top and Right borders.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Removing legend title and frame.  Depending on the seaborn release the
    # hue name ("variable") may or may not appear as the first legend entry, so
    # it is filtered by label rather than by slicing off position 0.
    handles, labels = ax.get_legend_handles_labels()
    entries = [(handle, label) for handle, label in zip(handles, labels) if label != "variable"]
    ax.legend(handles=[handle for handle, _ in entries],
              labels=[label for _, label in entries],
              loc="upper left", frameon=False)
    return ax


# %%
ax2 = plot_similarity(res_melt, title='Semantic Similarity Scores with/without Wikidata Overtime')

# %% [markdown]
# This plot demonstrates the calculated semantic similarity for the disease
# "Congenital Disorder of Deglycosylation" from the Bayesian Ontology Query
# Algorithm when fed with only the phenotype_annotation.tab dataset, provided by
# HPO, or the same phenotype_annotation.tab dataset supplemented with Wikidata.
# This algorithm integrates knowledge stored in an ontology (hpo.obo) and
# accompanying annotations (phenotype_annotation.tab) in a Bayesian network and
# searches for terms of the ontology to get a list of the best matching
# diseases. (Bauer, Kohler et al PMID: 22843981). The Human Phenotype terms that
# were searched for came from Caglayan and Komu et al (PMID: 25220016).

# %%
# Same plot without a title, at 9x6 inches, exported as SVG.
ax2 = plot_similarity(res_melt, figsize=(9, 6))

#Export
ax2.figure.savefig(FIGURE_PATH, format='svg', bbox_inches='tight', dpi=500)
# The goal of this script is to query OMIM id's from the Wikidata. Running this script will return a GAF files modified with OMIM id's. To see which are the top ranked returned diseases after Wikidata query, use BOQA (https://github.com/sulab/boqa ).
# Note: This is a copy of phenomizer (https://github.com/SuLab/Wikidata-phenomizer). The difference between this and phenomizer is it has an input to accept any hpo.tab file, which makes it easier to load in files. Additionally, it Queries for OMIM id's instead of Orphanet id's. Lastly, the file is more appropriately named making it less confusing.
# To run this script:
# python3 Wikidata_phenomizer_input_modifier.py [Put your phenotype_annotation.tab file name here without brackets]
"""
Get the disease phenotypes from wikidata,
merge them with disease phenotypes HPO GAF file
Feed them into ontologizer
"""
import os
import sys

import pandas as pd

# Annotation file used when no file name is given on the command line.
DEFAULT_ANNOTATION_FILE = "phenotype_annotation.tab"

# WikiData Query: every (disease, symptom) pair where the symptom has an HPO id
# (P3841) and the disease has an OMIM id (P492).
wikidata_all_query = """
SELECT ?hpo_id ?symptom_wdLabel ?omim_id ?disease_wdLabel WHERE {
  ?disease_wd wdt:P780 ?symptom_wd .
  ?symptom_wd wdt:P3841 ?hpo_id .
  ?disease_wd wdt:P492 ?omim_id .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  }"""

# Column names applied, by position, to the header-less annotation file.
tab_colnames = ['DB', 'DB_Object_ID', 'DB_Name', 'Qualifier', 'HPO_ID', 'DB_Reference',
                'Evidence_Code', 'Onset modifier', 'Frequency', 'Sex', 'Modifier',
                'Aspect', 'Date_Created', 'Assigned_By']


def resolve_paths(argv):
    """Return ``(input_path, output_path)`` for the given command line.

    ``argv[1]``, when present, is the annotation file to read; otherwise
    ``phenotype_annotation.tab`` is used.  The output is the input name with
    its extension replaced by ``_wd.tab`` (``x.tab`` -> ``x_wd.tab``).

    The extension is split off with ``os.path.splitext`` rather than by
    dropping the last four characters, so a name that does not end in ``.tab``
    is read as given instead of being truncated.
    """
    input_path = argv[1] if len(argv) > 1 else DEFAULT_ANNOTATION_FILE
    root, _extension = os.path.splitext(input_path)
    return input_path, root + "_wd.tab"


def fetch_wikidata_annotations():
    """Run ``wikidata_all_query`` and return the result as a DataFrame.

    Needs network access.  The third-party import is local so that the other
    helpers can be imported without wikidataintegrator installed.
    """
    from wikidataintegrator import wdi_core

    return wdi_core.WDItemEngine.execute_sparql_query(wikidata_all_query, as_dataframe=True)


def to_gaf_rows(df_wd, disease_label):
    """Return a copy of the Wikidata result with the annotation-file columns added.

    Parameters
    ----------
    df_wd : pandas.DataFrame
        Query result with columns ``hpo_id``, ``symptom_wdLabel``, ``omim_id``
        and ``disease_wdLabel``.
    disease_label : dict
        ``DB_Reference -> DB_Name`` taken from the HPO annotation file.  When a
        disease is present there, its HPO name replaces the Wikidata label.

    The input frame is not modified.

    GAF file example row:
    OMIM 101120 ACROCEPHALOPOLYSYNDACTYLY TYPE III HP:0000303 OMIM:101120 IEA O HPO:iea[2009-02-17]
    """
    rows = df_wd.copy()
    rows['DB'] = 'OMIM'
    # Keep only what precedes the first ":" of the OMIM id.
    rows['DB_Object_ID'] = rows.omim_id.str.split(":").map(lambda parts: parts[0])
    rows['DB_Name'] = rows.disease_wdLabel
    rows['Qualifier'] = ""
    rows['HPO_ID'] = rows.hpo_id
    rows['DB_Reference'] = rows['DB'] + ":" + rows['DB_Object_ID']
    rows['Evidence_Code'] = "IEA"
    rows['Onset modifier'] = ""
    rows['Frequency'] = ""
    rows['Sex'] = ""
    rows['Modifier'] = "O"
    rows['Aspect'] = ""
    rows['Date_Created'] = "wd:xxx[2018-12-10]"
    # Prefer the disease name used by the HPO file; fall back to the Wikidata label.
    rows['DB_Name_hpo'] = rows.DB_Reference.map(disease_label.get)
    rows['DB_Name'] = rows.DB_Name_hpo.combine_first(rows.DB_Name)
    return rows


def merge_annotations(gaf, df_wd):
    """Stack the HPO annotations on top of the Wikidata rows.

    ``pd.concat`` is used because ``DataFrame.append`` is not available in
    current pandas releases.  Row indices of both inputs are kept.
    """
    return pd.concat([gaf, df_wd], sort=False)


def report_annotation_counts(gaf, df_wd):
    """Print annotation counts and return the Wikidata-only annotations.

    Both inputs are de-duplicated on ``(DB_Object_ID, HPO_ID)`` for counting
    purposes.  Printed: the number of HPO annotations, of Wikidata annotations,
    of annotations present in both, and the ten diseases with the most
    annotations that exist only in Wikidata.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df_wd`` whose ``(HPO_ID, DB_Reference)`` pair is absent from
        ``gaf``, with columns ``DB_Name``, ``DB_Reference``, ``HPO_ID`` and
        ``symptom_wdLabel``.
    """
    # drop dupes for counting purposes
    gaf_unique = gaf.drop_duplicates(['DB_Object_ID', 'HPO_ID'])
    wd_unique = df_wd.drop_duplicates(['DB_Object_ID', 'HPO_ID'])
    combined = merge_annotations(gaf_unique, wd_unique)

    print("number of hpo annotations: {}".format(len(gaf_unique)))
    print("number of wikidata annotations: {}".format(len(wd_unique)))

    # keep="last" flags the earlier (HPO) copy of every pair that occurs again later.
    overlap = int(combined.duplicated(subset=['HPO_ID', 'DB_Reference'], keep="last").sum())
    print("number overlap annotations: {}".format(overlap))

    # ones in wikidata, but not in hpo:
    hpo_pairs = set(zip(gaf_unique.HPO_ID, gaf_unique.DB_Reference))
    is_new = pd.Series(
        [pair not in hpo_pairs for pair in zip(wd_unique.HPO_ID, wd_unique.DB_Reference)],
        dtype=bool,
    ).values  # positional boolean mask, independent of the frame's index labels
    new_df = wd_unique.loc[is_new, ['DB_Name', 'DB_Reference', 'HPO_ID', 'symptom_wdLabel']]

    print("\ntop unique disease-phenotypes in wd:")
    print(new_df.DB_Name.value_counts()[:10])
    return new_df


def main(argv=None):
    """Read the annotation file, add the Wikidata annotations, write ``*_wd.tab``.

    ``argv`` defaults to ``sys.argv``.  The annotation file is read before
    Wikidata is queried, so a wrong file name fails before any network call.
    """
    argv = sys.argv if argv is None else argv
    input_path, output_path = resolve_paths(argv)

    gaf = pd.read_csv(input_path, delimiter="\t", names=tab_colnames, dtype=str)
    disease_label = dict(zip(gaf.DB_Reference, gaf.DB_Name))

    df_wd = to_gaf_rows(fetch_wikidata_annotations(), disease_label)

    df_wd_gaf = merge_annotations(gaf, df_wd)
    df_wd_gaf[tab_colnames].to_csv(output_path, sep="\t", header=False, index=False)

    report_annotation_counts(gaf, df_wd)


if __name__ == "__main__":
    main()