Merge pull request #12 from timgdavies/master

Updating naming process
NRGI · Jun 12, 2015 · f2d6395 · f2d6395
2 parents e4bb07a + cb54dc2
commit f2d6395
Show file tree

Hide file tree

Showing 4 changed files with 8,065 additions and 22 deletions.
diff --git a/data/indonesia/3-openoil-concessions-indonesia.csv b/data/indonesia/3-openoil-concessions-indonesia.csv
diff --git a/onto-rdf-builder.ipynb b/onto-rdf-builder.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:f62ff933dc605fa41870a63790327e74cc5592b9ee4a27bc5e172da0d90ff690"
+  "signature": "sha256:d36f8bb38e36d7f12996f16fd48d2ecda0aa858bd8d4491244a6e7b9510dd91e"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -52,7 +52,10 @@
       "from rdflib import Graph, URIRef, Literal\n",
       "from rdflib.namespace import FOAF, RDF, SKOS, OWL, RDFS\n",
       "from rdflib.namespace import Namespace\n",
+      "import random\n",
       "prov = Namespace('http://www.w3.org/ns/prov#')\n",
+      "ontology = 'http://resourceprojects.org/def/'\n",
+      "base_uri = 'http://resourceprojects.org/'\n",
       "\n",
       "def clean_string(string):\n",
       "    invalid_chars = ''.join(c for c in map(chr, range(256)) if not c.isalnum())\n",
@@ -95,9 +98,32 @@
       "        country = \"xx\"\n",
       "    return country.lower()\n",
       "\n",
+      "\n",
+      "   \n",
+      "    \n",
+      "\n",
+      "def generate_project_identifier(name):\n",
+      "    \"\"\"Generates a project identifier.\n",
+      "    \n",
+      "    If the project name is a single word, use its first 4 characters, then a hyphen and a 6 character random alphanumeric string.\n",
+      "    If the project name has two or more words, use the first two characters of each of the first two words, followed by the same hyphen and random suffix.\n",
+      "    \n",
+      "    Uses clean_string to strip non-alphanumeric ASCII characters from each word before taking its prefix.\n",
+      "    \n",
+      "    \"\"\"\n",
+      "    name = name.lower().split(\" \")\n",
+      "    if len(name) == 1:\n",
+      "        start = clean_string(name[0])[:4]\n",
+      "    else:\n",
+      "        start = clean_string(name[0])[:2] + clean_string(name[1])[:2]\n",
+      "    \n",
+      "    \n",
+      "    suffix = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(6))\n",
+      "    return start + \"-\" + suffix\n",
+      "\n",
+      "\n",
       "id_cache = {}\n",
       "def generate_identifier(path,row,country = \"xx\"):\n",
-      "\n",
       "    if \"#\" + path + \"+identifier\" in row.keys(): #Check if this entity already has an identifier given in a column\n",
       "        if not row[\"#\" + path + \"+identifier\"].strip() == \"\":\n",
       "            return row[\"#\" + path + \"+identifier\"].strip()\n",
@@ -110,11 +136,10 @@
       "        if cache_key in id_cache.keys() and len(clean_string(row[\"#\"+path]).strip()) > 1:\n",
       "            return id_cache[cache_key]\n",
       "\n",
-      "        \n",
       "        if entity == \"project\":\n",
-      "            identifier = country + \"/\" + clean_string(row[\"#\"+path]).lower()[:3] + \"-\"+ str(uuid64.hex())[-4:].lower()\n",
+      "            identifier = country + \"/\" + generate_project_identifier(row[\"#\"+path])\n",
       "        elif entity == \"country\":\n",
-      "            identifier = get_country(row)\n",
+      "            identifier = get_country(row).upper()\n",
       "        else:\n",
       "            identifier = country + \"/\" + uuid64.hex()\n",
       "\n",
@@ -124,9 +149,6 @@
       "  \n",
       "    return identifier\n",
       "\n",
-      "    \n",
-      "ontology = 'http://resourceprojects.org/def/'\n",
-      "base_uri = 'http://resourceprojects.org/data/'\n",
       "\n",
       "def get_tag_type(tag,onto):\n",
       "    if ( URIRef(ontology+tag.title()), RDF.type, OWL.Class ) in onto:\n",
@@ -176,7 +198,7 @@
       "    g = Graph()\n",
       "    data = data.fillna(\"\") # Make sure we set any NA values to blank\n",
       "    ## Remove row limit when in production\n",
-      "    for line, row in data[0:5].iterrows():\n",
+      "    for line, row in data[0:300].iterrows():\n",
       "        #First we create the row entity.\n",
       "        entity = URIRef(base_uri + \"sources/\"+filename+\"/row/\"+ str(line))\n",
       "        g.add((entity,RDF.type,URIRef(ontology + \"Row\")))\n",
@@ -240,19 +262,25 @@
       "           \n",
       "    return g\n",
       "\n",
-      "\n",
-      "datasets = OrderedDict()\n",
-      "datasets['eiti-project-level'] = pd.read_csv('eiti-project-level.csv')\n",
-      "data = map_tags(datasets['eiti-project-level'])\n",
-      "\n",
       "onto = Graph()\n",
       "onto.parse(\"../../code/resource-projects-etl/ontology.rdf\", format=\"xml\")\n",
       "\n",
-      "g = generate_graph(data, onto,\"eiti-project-level.csv\")\n",
       "\n",
-      "# print g.serialize(format='turtle')\n",
-      "g.serialize(format='turtle', destination='resource-projects.ttl')\n",
-      "print \"Written output to resource-projects.ttl\""
+      "#datasets = OrderedDict()\n",
+      "#datasets['eiti-project-level'] = pd.read_csv('eiti-project-level.csv')\n",
+      "\n",
+      "datasets = OrderedDict()\n",
+      "datasets['openoil-concessions'] = pd.read_csv('data/indonesia/3-openoil-concessions-indonesia.csv')\n",
+      "\n",
+      "\n",
+      "for dataset in datasets:\n",
+      "    print \"Mapping tags for \" + dataset\n",
+      "    data = map_tags(datasets[dataset])\n",
+      "    print \"Generating graph for \" + dataset\n",
+      "    g = generate_graph(data, onto,dataset)\n",
+      "    print \"Writing out dataset\"\n",
+      "    g.serialize(format='turtle', destination=\"rdf/\"+dataset+\".ttl\")\n",
+      "    print \"Written output to rdf/\"+dataset+\".ttl\""
      ],
      "language": "python",
      "metadata": {},
@@ -261,19 +289,54 @@
        "output_type": "stream",
        "stream": "stdout",
        "text": [
-        "Written output to resource-projects.ttl\n"
+        "Mapping tags for openoil-concessions\n",
+        "Generating graph for openoil-concessions\n",
+        "Writing out dataset"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "Written output to rdf/openoil-concessions.ttl"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n"
        ]
       }
      ],
-     "prompt_number": 26
+     "prompt_number": 50
     },
     {
      "cell_type": "code",
      "collapsed": false,
-     "input": [],
+     "input": [
+      "\n",
+      "\n",
+      "\n",
+      "\n",
+      "    \n",
+      "print generate_project_identifier(\"Jubilee\")\n",
+      "print generate_project_identifier(\"East Kalaman Fields\")"
+     ],
      "language": "python",
      "metadata": {},
-     "outputs": []
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "jubi-gzgzz3\n",
+        "eaka-9qbtcx\n"
+       ]
+      }
+     ],
+     "prompt_number": 41
     }
    ],
    "metadata": {}