Skip to content

Commit

Permalink
Merge pull request #12 from timgdavies/master
Browse files Browse the repository at this point in the history
Updating naming process
  • Loading branch information
Bjwebb committed Jun 12, 2015
2 parents e4bb07a + cb54dc2 commit f2d6395
Show file tree
Hide file tree
Showing 4 changed files with 8,065 additions and 22 deletions.
1 change: 1 addition & 0 deletions data/indonesia/3-openoil-concessions-indonesia.csv

Large diffs are not rendered by default.

107 changes: 85 additions & 22 deletions onto-rdf-builder.ipynb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"metadata": {
"name": "",
"signature": "sha256:f62ff933dc605fa41870a63790327e74cc5592b9ee4a27bc5e172da0d90ff690"
"signature": "sha256:d36f8bb38e36d7f12996f16fd48d2ecda0aa858bd8d4491244a6e7b9510dd91e"
},
"nbformat": 3,
"nbformat_minor": 0,
Expand Down Expand Up @@ -52,7 +52,10 @@
"from rdflib import Graph, URIRef, Literal\n",
"from rdflib.namespace import FOAF, RDF, SKOS, OWL, RDFS\n",
"from rdflib.namespace import Namespace\n",
"import random\n",
"prov = Namespace('http://www.w3.org/ns/prov#')\n",
"ontology = 'http://resourceprojects.org/def/'\n",
"base_uri = 'http://resourceprojects.org/'\n",
"\n",
"def clean_string(string):\n",
" invalid_chars = ''.join(c for c in map(chr, range(256)) if not c.isalnum())\n",
Expand Down Expand Up @@ -95,9 +98,32 @@
" country = \"xx\"\n",
" return country.lower()\n",
"\n",
"\n",
" \n",
" \n",
"\n",
"def generate_project_identifier(name):\n",
" \"\"\"Generates a project identifier.\n",
" \n",
" If the project name is a single word, use the first 4 digits, then a 6 characther random alphanumeric string.\n",
" If the project name is two words, use the first two digits of each word. \n",
" \n",
" Uses clean_string to strip non alphanumeric ascii characters before processing. \n",
" \n",
" \"\"\"\n",
" name = name.lower().split(\" \")\n",
" if len(name) == 1:\n",
" start = clean_string(name[0])[:4]\n",
" else:\n",
" start = clean_string(name[0])[:2] + clean_string(name[1])[:2]\n",
" \n",
" \n",
" suffix = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(6))\n",
" return start + \"-\" + suffix\n",
"\n",
"\n",
"id_cache = {}\n",
"def generate_identifier(path,row,country = \"xx\"):\n",
"\n",
" if \"#\" + path + \"+identifier\" in row.keys(): #Check if this entity already has an identifier given in a column\n",
" if not row[\"#\" + path + \"+identifier\"].strip() == \"\":\n",
" return row[\"#\" + path + \"+identifier\"].strip()\n",
Expand All @@ -110,11 +136,10 @@
" if cache_key in id_cache.keys() and len(clean_string(row[\"#\"+path]).strip()) > 1:\n",
" return id_cache[cache_key]\n",
"\n",
" \n",
" if entity == \"project\":\n",
" identifier = country + \"/\" + clean_string(row[\"#\"+path]).lower()[:3] + \"-\"+ str(uuid64.hex())[-4:].lower()\n",
" identifier = country + \"/\" + generate_project_identifier(row[\"#\"+path])\n",
" elif entity == \"country\":\n",
" identifier = get_country(row)\n",
" identifier = get_country(row).upper()\n",
" else:\n",
" identifier = country + \"/\" + uuid64.hex()\n",
"\n",
Expand All @@ -124,9 +149,6 @@
" \n",
" return identifier\n",
"\n",
" \n",
"ontology = 'http://resourceprojects.org/def/'\n",
"base_uri = 'http://resourceprojects.org/data/'\n",
"\n",
"def get_tag_type(tag,onto):\n",
" if ( URIRef(ontology+tag.title()), RDF.type, OWL.Class ) in onto:\n",
Expand Down Expand Up @@ -176,7 +198,7 @@
" g = Graph()\n",
" data = data.fillna(\"\") # Make sure we set any NA values to blank\n",
" ## Remove row limit when in production\n",
" for line, row in data[0:5].iterrows():\n",
" for line, row in data[0:300].iterrows():\n",
" #First we create the row entity.\n",
" entity = URIRef(base_uri + \"sources/\"+filename+\"/row/\"+ str(line))\n",
" g.add((entity,RDF.type,URIRef(ontology + \"Row\")))\n",
Expand Down Expand Up @@ -240,19 +262,25 @@
" \n",
" return g\n",
"\n",
"\n",
"datasets = OrderedDict()\n",
"datasets['eiti-project-level'] = pd.read_csv('eiti-project-level.csv')\n",
"data = map_tags(datasets['eiti-project-level'])\n",
"\n",
"onto = Graph()\n",
"onto.parse(\"../../code/resource-projects-etl/ontology.rdf\", format=\"xml\")\n",
"\n",
"g = generate_graph(data, onto,\"eiti-project-level.csv\")\n",
"\n",
"# print g.serialize(format='turtle')\n",
"g.serialize(format='turtle', destination='resource-projects.ttl')\n",
"print \"Written output to resource-projects.ttl\""
"#datasets = OrderedDict()\n",
"#datasets['eiti-project-level'] = pd.read_csv('eiti-project-level.csv')\n",
"\n",
"datasets = OrderedDict()\n",
"datasets['openoil-concessions'] = pd.read_csv('data/indonesia/3-openoil-concessions-indonesia.csv')\n",
"\n",
"\n",
"for dataset in datasets:\n",
" print \"Mapping tags for \" + dataset\n",
" data = map_tags(datasets[dataset])\n",
" print \"Generating graph for \" + dataset\n",
" g = generate_graph(data, onto,dataset)\n",
" print \"Writing out dataset\"\n",
" g.serialize(format='turtle', destination=\"rdf/\"+dataset+\".ttl\")\n",
" print \"Written output to rdf/\"+dataset+\".ttl\""
],
"language": "python",
"metadata": {},
Expand All @@ -261,19 +289,54 @@
"output_type": "stream",
"stream": "stdout",
"text": [
"Written output to resource-projects.ttl\n"
"Mapping tags for openoil-concessions\n",
"Generating graph for openoil-concessions\n",
"Writing out dataset"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Written output to rdf/openoil-concessions.ttl"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 26
"prompt_number": 50
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"input": [
"\n",
"\n",
"\n",
"\n",
" \n",
"print generate_project_identifier(\"Jubilee\")\n",
"print generate_project_identifier(\"East Kalaman Fields\")"
],
"language": "python",
"metadata": {},
"outputs": []
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"jubi-gzgzz3\n",
"eaka-9qbtcx\n"
]
}
],
"prompt_number": 41
}
],
"metadata": {}
Expand Down
Loading

0 comments on commit f2d6395

Please sign in to comment.