From 10b23f4ae5d37784ce2423364a4027ee120a0774 Mon Sep 17 00:00:00 2001
From: gerikson <galina.erikson@gmail.com>
Date: Sat, 8 Nov 2014 13:39:51 -0800
Subject: [PATCH 1/3] Fixed error: added missing comma in gonl_parser dict and
 renamed gonl/_init_.py to __init_.py

---
 src/dataload/contrib/gonl/{_init_.py => __init_.py} | 0
 src/dataload/contrib/gonl/gonl_parser.py            | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename src/dataload/contrib/gonl/{_init_.py => __init_.py} (100%)

diff --git a/src/dataload/contrib/gonl/_init_.py b/src/dataload/contrib/gonl/__init_.py
similarity index 100%
rename from src/dataload/contrib/gonl/_init_.py
rename to src/dataload/contrib/gonl/__init_.py
diff --git a/src/dataload/contrib/gonl/gonl_parser.py b/src/dataload/contrib/gonl/gonl_parser.py
index 99e80cb4..0e02456c 100644
--- a/src/dataload/contrib/gonl/gonl_parser.py
+++ b/src/dataload/contrib/gonl/gonl_parser.py
@@ -128,7 +128,7 @@ def _map_line_to_json(fields):
                 "varType": varType,
                 "rsID": rsID,
                 "QUAL": QUAL,
-                "FILTER": FILTER
+                "FILTER": FILTER,
                 "AC": AC,
                 "AF": AF,
                 "AN": AN

From cba5eea200d1c849f2caed11415dad7461f41f5c Mon Sep 17 00:00:00 2001
From: gerikson <galina.erikson@gmail.com>
Date: Sat, 8 Nov 2014 14:52:33 -0800
Subject: [PATCH 2/3] Added GEUVADIS RNA sequencing project AF

---
 src/dataload/contrib/geuvadis/__init_.py      |   3 +
 .../contrib/geuvadis/geuvadis_parser.py       | 180 ++++++++++++++++++
 2 files changed, 183 insertions(+)
 create mode 100644 src/dataload/contrib/geuvadis/__init_.py
 create mode 100644 src/dataload/contrib/geuvadis/geuvadis_parser.py

diff --git a/src/dataload/contrib/geuvadis/__init_.py b/src/dataload/contrib/geuvadis/__init_.py
new file mode 100644
index 00000000..dcc224d2
--- /dev/null
+++ b/src/dataload/contrib/geuvadis/__init_.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+
+from .geuvadis_parser import load_data
\ No newline at end of file
diff --git a/src/dataload/contrib/geuvadis/geuvadis_parser.py b/src/dataload/contrib/geuvadis/geuvadis_parser.py
new file mode 100644
index 00000000..e3ebc446
--- /dev/null
+++ b/src/dataload/contrib/geuvadis/geuvadis_parser.py
@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+'''
+GEUVADIS Genetic European Variation in Health and Disease, 
+A European Medical Sequencing Consortium
+'''
+import pymongo
+import time
+import gzip
+from utils.common import timesofar
+
+
+
+def list_split(d):
+    """Split ";"-delimited string values of d (recursing into dicts) into lists.
+
+    Trailing whitespace and a trailing ";" are dropped first; d is edited in place.
+    """
+    for name, entry in d.items():
+        if isinstance(entry, dict):
+            list_split(entry)
+        elif hasattr(entry, "split") and ";" in entry:
+            d[name] = entry.rstrip().rstrip(';').split(";")
+    return d
+
+
+def dict_sweep(d):
+    """Drop "." values and emptied dicts/lists from d in place; return d.
+
+    Nested dicts (also inside lists) are swept; non-dict list items are kept.
+    """
+    for key, val in list(d.items()):  # snapshot: safe to delete while looping
+        if isinstance(val, list):
+            val = [dict_sweep(x) if isinstance(x, dict) else x
+                   for x in val if x != "."]
+            d[key] = val
+        elif isinstance(val, dict):
+            dict_sweep(val)
+        if val == "." or (isinstance(val, (list, dict)) and not val):
+            del d[key]
+    return d
+
+
+def value_convert(d):
+    """Convert numeric strings in d (and nested dicts) to int or float, in place.
+
+    Existing floats are kept as they are, since int() would truncate them.
+    A list is converted all-or-nothing: to ints, else floats, else unchanged.
+    """
+    def cast(values, kind):
+        return [v if isinstance(v, float) else kind(v) for v in values]
+    for key, val in d.items():
+        if isinstance(val, dict):
+            value_convert(val)
+            continue
+        items = val if isinstance(val, list) else [val]
+        for kind in (int, float):
+            try:
+                items = cast(items, kind)
+                break
+            except (ValueError, TypeError):
+                pass
+        d[key] = items if isinstance(val, list) else items[0]
+    return d
+
+
+def unlist(d):
+    """Replace each one-element list value in d by its sole element (in place);
+    nested dicts are handled recursively."""
+    for name, entry in d.items():
+        if isinstance(entry, dict):
+            unlist(entry)
+        elif isinstance(entry, list) and len(entry) == 1:
+            d[name] = entry[0]
+    return d
+
+
+def _map_line_to_json(fields):
+    """Convert one tab-split VCF data line into a variant document (dict).
+
+    fields: column values -- chrom, position, rsID, allele1, allele2, QUAL,
+    FILTER and the ";"-separated INFO string (indexes 0-7).
+    """
+    chrom = fields[0]
+    chromStart = int(fields[1])
+    allele1 = fields[3]
+    allele2 = fields[4]
+    HGVS = "chr%s:g.%d%s>%s" % (chrom, chromStart, allele1, allele2)
+    chromEnd = chromStart + len(allele1)
+    rsID = fields[2]
+    QUAL = fields[5]
+    FILTER = fields[6]
+    # "." marks a missing value; dict_sweep() drops such entries afterwards.
+    varType = "."
+    info = dict.fromkeys(("AC", "AF", "GTS", "GTC"), ".")
+    for item in fields[7].split(";"):
+        # partition() cuts at the first "=" only, whereas str.strip("AC=")
+        # removes any leading/trailing "A", "C" and "=" characters.
+        name, sep, value = item.strip().partition("=")
+        if sep and name in info:
+            info[name] = value
+    # GTS/GTC were parsed into AN/varType before, so the dict below raised
+    # NameError on the undefined names GTS and GTC.
+    AC = info["AC"]
+    AF = info["AF"]
+    GTS = info["GTS"]
+    GTC = info["GTC"]
+    one_snp_json = {
+
+        "_id": HGVS,
+        "gonl":
+            {
+                "chrom": chrom,
+                "hg19":
+                    {
+                        "start": chromStart,
+                        "end": chromEnd
+                    },
+
+                "allele1": allele1,
+                "allele2": allele2,
+                "varType": varType,
+                "rsID": rsID,
+                "QUAL": QUAL,
+                "FILTER": FILTER,
+                "AC": AC,
+                "AF": AF,
+                "GTS": GTS,
+                "GTC": GTC
+            }
+    }
+
+    one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json))))
+    one_snp_json["gonl"]["chrom"] = str(one_snp_json["gonl"]["chrom"])
+    return one_snp_json
+    
+
+def data_generator(input_file):
+    """Yield one variant document per data line of the VCF file input_file.
+
+    Fixes the old body, which parsed every row but never yielded it. ".gz"
+    paths are read via gzip; reading stops at the first blank line.
+    """
+    opener = gzip.open if input_file.endswith(".gz") else open
+    with opener(input_file) as vcf:
+        for line in vcf:
+            if line.strip() == "":
+                break
+            if line.startswith("#"):
+                print("HEADER LINE")
+            else:
+                yield _map_line_to_json(line.split("\t"))
+
+
+# load path and find files, pass to data_generator
+def load_data(path):
+    input_file = "/gpfs/home/gerikson/GEEVS_aggregation_v2.vcf.gz"
+    data = data_generator(input_file)
+
+    
+
+
+def load_collection(database, input_file_list, collection_name):
+    """Load the parsed variant documents into a MongoDB collection.
+
+    : param database: mongodb url
+    : param input_file_list: variant docs, path to file
+    : param collection_name: annotation source name
+    """
+    conn = pymongo.MongoClient(database)
+    db = conn.variantdoc
+    posts = db[collection_name]
+    t1 = time.time()
+    # The caller's input_file_list is used as given; it used to be replaced
+    # by getFileList(), which is not defined in this module (NameError).
+    # NOTE(review): requires load_data() to return an iterable of documents.
+    for cnt, doc in enumerate(load_data(input_file_list), 1):
+        posts.insert(doc, manipulate=False, check_keys=False, w=0)
+        if cnt % 100000 == 0:
+            print("%s %s" % (cnt, timesofar(t1)))
+    print("successfully loaded %s into mongodb" % collection_name)

From ebdda499ede1becb57e9740bb0043393c5109861 Mon Sep 17 00:00:00 2001
From: gerikson <galina.erikson@gmail.com>
Date: Sat, 8 Nov 2014 14:59:44 -0800
Subject: [PATCH 3/3] Bug fix: store documents under the "geuvadis" key and
 read the VCF from the EBI FTP URL

---
 src/dataload/contrib/geuvadis/geuvadis_parser.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/dataload/contrib/geuvadis/geuvadis_parser.py b/src/dataload/contrib/geuvadis/geuvadis_parser.py
index e3ebc446..5753a281 100644
--- a/src/dataload/contrib/geuvadis/geuvadis_parser.py
+++ b/src/dataload/contrib/geuvadis/geuvadis_parser.py
@@ -107,7 +107,7 @@ def _map_line_to_json(fields):
     one_snp_json = {
 
         "_id": HGVS,
-        "gonl":
+        "geuvadis":
             {
                 "chrom": chrom,
                 "hg19":
@@ -130,7 +130,7 @@ def _map_line_to_json(fields):
     }
 
     one_snp_json = list_split(dict_sweep(unlist(value_convert(one_snp_json))))
-    one_snp_json["gonl"]["chrom"] = str(one_snp_json["gonl"]["chrom"])
+    one_snp_json["geuvadis"]["chrom"] = str(one_snp_json["geuvadis"]["chrom"])
     return one_snp_json
     
 
@@ -153,7 +153,7 @@ def data_generator(input_file):
 
 # load path and find files, pass to data_generator
 def load_data(path):
-    input_file = "/gpfs/home/gerikson/GEEVS_aggregation_v2.vcf.gz"
+    input_file = "ftp://ftp.ebi.ac.uk/pub/databases/eva/PRJEB6042/ERZX00026/GEEVS_aggregation_v2.vcf.gz"
     data = data_generator(input_file)