Skip to content

Commit

Permalink
[#40] Move resourceprojects specific cove code to this repo
Browse files Browse the repository at this point in the history
  • Loading branch information
Bjwebb committed Sep 30, 2015
1 parent 55db314 commit 749e952
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 11 deletions.
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

ADD requirements_taglifter.txt requirements_taglifter.txt
# These are also installed by requirements.txt, but installing them explicitly
# first lets Docker reuse a cached layer when other requirements change.
RUN pip3 install -r requirements_taglifter.txt
ADD requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

Expand All @@ -32,5 +36,7 @@ RUN manage.py migrate --noinput
RUN manage.py compilemessages
RUN manage.py collectstatic --noinput

ADD ontology ontology

EXPOSE 80
CMD gunicorn cove.wsgi -b 0.0.0.0:80
145 changes: 145 additions & 0 deletions modules/cove_resourceprojects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from taglifter import TagLifter
from collections import OrderedDict
from functools import partial
##requests doesn't work with large files, see below
#import requests
#from requests.auth import HTTPDigestAuth
import subprocess
import os
import urllib.parse


def fetch(dataset):
    """Placeholder fetch step; retrieval of *dataset* is not implemented yet."""
    return None


def convert(dataset):
    """Convert the dataset's uploaded source file to Turtle.

    Runs TagLifter over the supplied file against the resource-projects
    ontology and serializes the resulting graph to ``output.ttl`` inside
    the dataset's upload directory.
    """
    output_path = os.path.join(dataset.supplied_data.upload_dir(), 'output.ttl')
    lifter = TagLifter(
        ontology="ontology/resource-projects-ontology.rdf",
        source=dataset.supplied_data.original_file.file.name,
        base="http://resourceprojects.org/",
        source_meta={"author": "TODO", "Source_Type": "official", "Converted": "Today"}
    )
    lifter.build_graph()
    lifter.graph.serialize(format='turtle', destination=output_path)


def put_to_virtuoso(dataset, staging):
    """Upload the dataset's converted Turtle file to Virtuoso.

    Uploads ``output.ttl`` from the dataset's upload directory into a
    per-dataset named graph via the SPARQL Graph CRUD endpoint, then
    registers that graph in the site-wide graph group so it is visible
    in the combined data graph.

    dataset -- object exposing ``supplied_data`` (``upload_dir()``, ``pk``)
    staging -- if True, target the staging hostname/graph URIs instead of live
    """
    ttl_filename = os.path.join(dataset.supplied_data.upload_dir(), 'output.ttl')
    prefix = 'staging.' if staging else ''
    graphuri = 'http://{}resourceprojects.org/{}'.format(prefix, dataset.supplied_data.pk)

    # Call curl in a subprocess, as requests doesn't work with files larger
    # than about 1MB.
    #
    # Security considerations:
    # Beware adding user input to this call. check_call has shell=False by
    # default, which means it's not possible to escape to the shell. However,
    # user input could pass extra arguments / sensitive files to curl, so we
    # should be careful:
    # * ttl_filename is not from user input, so should be safe
    # * graphuri is urlencoded, so should be safe
    subprocess.check_call([
        'curl',
        '-T',
        ttl_filename,
        'http://virtuoso:8890/sparql-graph-crud-auth?' + urllib.parse.urlencode({'graph': graphuri}),
        '--digest',
        '--user',
        'dba:{}'.format(os.environ['DBA_PASS'])
    ])

    # Register the new graph in the graph group by piping SQL to isql on
    # stdin, using an argument list with shell=False so that prefix,
    # graphuri and DBA_PASS can never escape to the shell. They are still
    # interpolated into the SQL statement itself, so they must continue to
    # come only from trusted sources (currently the pk and environment, not
    # user input) -- we must ensure this continues to be the case.
    sql = "DB.DBA.RDF_GRAPH_GROUP_INS('http://{}resourceprojects.org/data/', '{}');".format(
        prefix, graphuri)
    isql = subprocess.Popen(
        ['isql', 'virtuoso', 'dba', os.environ['DBA_PASS']],
        stdin=subprocess.PIPE
    )
    isql.communicate(sql.encode('utf-8'))
    if isql.returncode != 0:
        raise subprocess.CalledProcessError(isql.returncode, 'isql')


def delete_from_virtuoso(dataset, staging):
    """Delete the dataset's named graph from Virtuoso via the CRUD endpoint."""
    prefix = 'staging.' if staging else ''
    graphuri = 'http://{}resourceprojects.org/{}'.format(prefix, dataset.supplied_data.pk)

    # Using curl here because we're already using it for putting.
    # If we want to switch to e.g. requests this part should work fine.
    endpoint = 'http://virtuoso:8890/sparql-graph-crud-auth?' + urllib.parse.urlencode({'graph': graphuri})
    credentials = 'dba:{}'.format(os.environ['DBA_PASS'])
    subprocess.check_call(
        ['curl', '-X', 'DELETE', endpoint, '--digest', '--user', credentials])


# Ordered registry of the ETL pipeline steps. Each entry maps a process id
# to a config dict with:
#   name           -- past-tense label for when the step has been run
#   action_name    -- label used to trigger the step
#   more_info_name -- optional label for a details/more-info link
#   depends        -- id of the process that must run first (None for the root)
#   function       -- callable taking the dataset, implementing the step
#   reverse_id     -- optional id of the process that undoes this one
#   main           -- True if the step belongs to the main pipeline sequence
PROCESSES = OrderedDict([
    ('fetch', {
        'name': 'Fetched',
        'action_name': 'Fetch',
        'depends': None,
        'function': fetch,
        'main': True,
    }),
    ('convert', {
        'name': 'Converted',
        'action_name': 'Convert',
        'more_info_name': 'Conversion messages',
        'depends': 'fetch',
        'function': convert,
        'main': True
    }),
    ('staging', {
        'name': 'Pushed to staging',
        'action_name': 'Push to staging',
        'more_info_name': 'View on staging',
        'depends': 'convert',
        'function': partial(put_to_virtuoso, staging=True),
        'reverse_id': 'rm_staging',
        'main': True
    }),
    ('live', {
        'name': 'Pushed to live',
        'action_name': 'Push to live',
        # NOTE(review): depends on 'fetch', unlike 'staging' which depends on
        # 'convert' -- confirm pushing to live without converting is intended.
        'depends': 'fetch',
        'more_info_name': 'View on live',
        'function': partial(put_to_virtuoso, staging=False),
        'reverse_id': 'rm_live',
        'main': True
    }),
    ('rm_staging', {
        'name': 'Removed from staging',
        'action_name': 'Remove from staging',
        'depends': 'staging',
        'function': partial(delete_from_virtuoso, staging=True),
        'main': False
    }),
    ('rm_live', {
        'name': 'Removed from live',
        'action_name': 'Remove from live',
        'depends': 'live',
        'function': partial(delete_from_virtuoso, staging=False),
        'main': False
    }),
])

# Augment each process config with its own id and a direct reference to the
# process that reverses it (or None). The generator is consumed before the
# rebind, so 'reverse' points at the original, pre-augmentation config dicts.
PROCESSES = OrderedDict(
    (pid, dict(
        proc,
        id=pid,
        reverse=PROCESSES[proc['reverse_id']] if 'reverse_id' in proc else None,
    ))
    for pid, proc in PROCESSES.items()
)
12 changes: 2 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
-e git+https://github.com/OpenDataServices/cove.git@7143b59587e077e4b4c79b935b3ee6e7017e5500#egg=cove
-e git+https://github.com/OpenDataServices/cove.git@ccd039c22836a4abc150dd8ed5e3a1493f651fcb#egg=cove
-e git+https://github.com/OpenDataServices/flatten-tool.git@61d8404b444f10384363cde1cad542a0d04af004#egg=flattentool
-r requirements_taglifter.txt
gunicorn==19.3.0
pandas==0.16.2
rdflib==4.2.1
countrycode==0.2

numpy==1.9.3
pyparsing==2.0.3
isodate==0.5.4
python-dateutil==2.4.2

8 changes: 8 additions & 0 deletions requirements_taglifter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pandas==0.16.2
rdflib==4.2.1
countrycode==0.2

numpy==1.9.3
pyparsing==2.0.3
isodate==0.5.4
python-dateutil==2.4.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
author='Open Data Services',
author_email='[email protected]',
package_dir = {'': 'modules'},
py_modules=['taglifter', 'settings'],
py_modules=['taglifter', 'settings', 'cove_resourceprojects'],
url='https://github.com/NRGI/resource-projects-etl',
description='',
classifiers=[
Expand Down

0 comments on commit 749e952

Please sign in to comment.