Skip to content

Commit

Permalink
[#40] Move resourceprojects specific cove code to this repo
Browse files Browse the repository at this point in the history
  • Loading branch information
Bjwebb committed Sep 30, 2015
1 parent 55db314 commit 749e952
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 11 deletions.
6 changes: 6 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

ADD requirements_taglifter.txt requirements_taglifter.txt
# These are also installed by requirements.txt, but installing them explicitly
# first lets Docker reuse a cached layer when other requirements change.
RUN pip3 install -r requirements_taglifter.txt
ADD requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

Expand All @@ -32,5 +36,7 @@ RUN manage.py migrate --noinput
RUN manage.py compilemessages
RUN manage.py collectstatic --noinput

ADD ontology ontology

EXPOSE 80
CMD gunicorn cove.wsgi -b 0.0.0.0:80
145 changes: 145 additions & 0 deletions modules/cove_resourceprojects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from taglifter import TagLifter
from collections import OrderedDict
from functools import partial
##requests doesn't work with large files, see below
#import requests
#from requests.auth import HTTPDigestAuth
import subprocess
import os
import urllib.parse


def fetch(dataset):
    """Placeholder fetch step; retrieval of *dataset* is not implemented yet."""
    return None


def convert(dataset):
    """Convert the dataset's uploaded source file to Turtle.

    Runs TagLifter over the supplied file against the resource-projects
    ontology and serializes the resulting graph to ``output.ttl`` inside
    the dataset's upload directory.
    """
    output_path = os.path.join(dataset.supplied_data.upload_dir(), 'output.ttl')
    lifter = TagLifter(
        ontology="ontology/resource-projects-ontology.rdf",
        source=dataset.supplied_data.original_file.file.name,
        base="http://resourceprojects.org/",
        source_meta={"author": "TODO", "Source_Type": "official", "Converted": "Today"}
    )
    lifter.build_graph()
    lifter.graph.serialize(format='turtle', destination=output_path)


def put_to_virtuoso(dataset, staging):
    """Upload the dataset's converted Turtle file to Virtuoso.

    Uploads ``output.ttl`` from the dataset's upload directory into a
    per-dataset named graph via the SPARQL Graph CRUD endpoint, then
    registers that graph in the site-wide graph group so it is visible
    in the combined data graph.

    dataset -- object exposing ``supplied_data`` (``upload_dir()``, ``pk``)
    staging -- if True, target the staging hostname/graph URIs instead of live
    """
    ttl_filename = os.path.join(dataset.supplied_data.upload_dir(), 'output.ttl')
    prefix = 'staging.' if staging else ''
    graphuri = 'http://{}resourceprojects.org/{}'.format(prefix, dataset.supplied_data.pk)

    # Call curl in a subprocess, as requests doesn't work with files larger
    # than about 1MB.
    #
    # Security considerations:
    # Beware adding user input to this call. check_call has shell=False by
    # default, which means it's not possible to escape to the shell. However,
    # user input could pass extra arguments / sensitive files to curl, so we
    # should be careful:
    # * ttl_filename is not from user input, so should be safe
    # * graphuri is urlencoded, so should be safe
    subprocess.check_call([
        'curl',
        '-T',
        ttl_filename,
        'http://virtuoso:8890/sparql-graph-crud-auth?' + urllib.parse.urlencode({'graph': graphuri}),
        '--digest',
        '--user',
        'dba:{}'.format(os.environ['DBA_PASS'])
    ])

    # Register the new graph in the graph group by piping SQL to isql on
    # stdin, using an argument list with shell=False so that prefix,
    # graphuri and DBA_PASS can never escape to the shell. They are still
    # interpolated into the SQL statement itself, so they must continue to
    # come only from trusted sources (currently the pk and environment, not
    # user input) -- we must ensure this continues to be the case.
    sql = "DB.DBA.RDF_GRAPH_GROUP_INS('http://{}resourceprojects.org/data/', '{}');".format(
        prefix, graphuri)
    isql = subprocess.Popen(
        ['isql', 'virtuoso', 'dba', os.environ['DBA_PASS']],
        stdin=subprocess.PIPE
    )
    isql.communicate(sql.encode('utf-8'))
    if isql.returncode != 0:
        raise subprocess.CalledProcessError(isql.returncode, 'isql')


def delete_from_virtuoso(dataset, staging):
    """Delete the dataset's named graph from Virtuoso via the CRUD endpoint."""
    prefix = 'staging.' if staging else ''
    graphuri = 'http://{}resourceprojects.org/{}'.format(prefix, dataset.supplied_data.pk)

    # Using curl here because we're already using it for putting.
    # If we want to switch to e.g. requests this part should work fine.
    endpoint = 'http://virtuoso:8890/sparql-graph-crud-auth?' + urllib.parse.urlencode({'graph': graphuri})
    credentials = 'dba:{}'.format(os.environ['DBA_PASS'])
    subprocess.check_call(
        ['curl', '-X', 'DELETE', endpoint, '--digest', '--user', credentials])


# Ordered registry of the ETL pipeline steps. Each entry maps a process id
# to a config dict with:
#   name           -- past-tense label for when the step has been run
#   action_name    -- label used to trigger the step
#   more_info_name -- optional label for a details/more-info link
#   depends        -- id of the process that must run first (None for the root)
#   function       -- callable taking the dataset, implementing the step
#   reverse_id     -- optional id of the process that undoes this one
#   main           -- True if the step belongs to the main pipeline sequence
PROCESSES = OrderedDict([
    ('fetch', {
        'name': 'Fetched',
        'action_name': 'Fetch',
        'depends': None,
        'function': fetch,
        'main': True,
    }),
    ('convert', {
        'name': 'Converted',
        'action_name': 'Convert',
        'more_info_name': 'Conversion messages',
        'depends': 'fetch',
        'function': convert,
        'main': True
    }),
    ('staging', {
        'name': 'Pushed to staging',
        'action_name': 'Push to staging',
        'more_info_name': 'View on staging',
        'depends': 'convert',
        'function': partial(put_to_virtuoso, staging=True),
        'reverse_id': 'rm_staging',
        'main': True
    }),
    ('live', {
        'name': 'Pushed to live',
        'action_name': 'Push to live',
        # NOTE(review): depends on 'fetch', unlike 'staging' which depends on
        # 'convert' -- confirm pushing to live without converting is intended.
        'depends': 'fetch',
        'more_info_name': 'View on live',
        'function': partial(put_to_virtuoso, staging=False),
        'reverse_id': 'rm_live',
        'main': True
    }),
    ('rm_staging', {
        'name': 'Removed from staging',
        'action_name': 'Remove from staging',
        'depends': 'staging',
        'function': partial(delete_from_virtuoso, staging=True),
        'main': False
    }),
    ('rm_live', {
        'name': 'Removed from live',
        'action_name': 'Remove from live',
        'depends': 'live',
        'function': partial(delete_from_virtuoso, staging=False),
        'main': False
    }),
])

# Augment each process config with its own id and a direct reference to the
# process that reverses it (or None). The generator is consumed before the
# rebind, so 'reverse' points at the original, pre-augmentation config dicts.
PROCESSES = OrderedDict(
    (pid, dict(
        proc,
        id=pid,
        reverse=PROCESSES[proc['reverse_id']] if 'reverse_id' in proc else None,
    ))
    for pid, proc in PROCESSES.items()
)
12 changes: 2 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
-e git+https://github.com/OpenDataServices/cove.git@7143b59587e077e4b4c79b935b3ee6e7017e5500#egg=cove
-e git+https://github.com/OpenDataServices/cove.git@ccd039c22836a4abc150dd8ed5e3a1493f651fcb#egg=cove
-e git+https://github.com/OpenDataServices/flatten-tool.git@61d8404b444f10384363cde1cad542a0d04af004#egg=flattentool
-r requirements_taglifter.txt
gunicorn==19.3.0
pandas==0.16.2
rdflib==4.2.1
countrycode==0.2

numpy==1.9.3
pyparsing==2.0.3
isodate==0.5.4
python-dateutil==2.4.2

8 changes: 8 additions & 0 deletions requirements_taglifter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
pandas==0.16.2
rdflib==4.2.1
countrycode==0.2

numpy==1.9.3
pyparsing==2.0.3
isodate==0.5.4
python-dateutil==2.4.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
author='Open Data Services',
author_email='[email protected]',
package_dir = {'': 'modules'},
py_modules=['taglifter', 'settings'],
py_modules=['taglifter', 'settings', 'cove_resourceprojects'],
url='https://github.com/NRGI/resource-projects-etl',
description='',
classifiers=[
Expand Down

0 comments on commit 749e952

Please sign in to comment.