-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update client.py #7
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,9 @@ | |
from tqdm import tqdm | ||
from SPARQLWrapper import SPARQLWrapper, JSON | ||
from hashlib import sha256 | ||
from urllib.parse import urlparse | ||
from pathlib import Path | ||
import os | ||
|
||
__debug = False | ||
|
||
|
@@ -25,6 +28,94 @@ class DeployLogLevel(Enum): | |
info = 1 | ||
debug = 2 | ||
|
||
# Define constants for different identifiers | ||
GROUP_IDENTIFIER = "group" | ||
ARTIFACT_IDENTIFIER = "artifact" | ||
VERSION_IDENTIFIER = "version" | ||
FILE_IDENTIFIER = "file" | ||
USER_IDENTIFIER = "user" | ||
|
||
def __download_files(urls: List[str], local_dir: str): | ||
for url in urls: | ||
__download_file__(url=url, filename=os.path.join(local_dir, wsha256(url))) | ||
|
||
def download( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the function overrides the download below in the code, it is not editing of the old function function, but writing a new one in the same file with the old one on line 461 in old version |
||
local_dir: str, | ||
endpoint: str, | ||
databus_uri: str, | ||
identifier: str | ||
) -> None: | ||
""" | ||
Download datasets to local storage from databus registry. | ||
|
||
Parameters: | ||
- local_dir: the local directory | ||
- endpoint: the URL of the SPARQL endpoint | ||
- databus_uri: the identifier for the dataset (group, artifact, version, file, user) | ||
- identifier: the specific identifier value (e.g., group name, artifact ID, version ID, file URI, username) | ||
|
||
Notes: | ||
- For file downloads, databus_uri should be "file" and identifier should be the file URI. | ||
""" | ||
if databus_uri == GROUP_IDENTIFIER: | ||
query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dcat:hasVersion/dcat:isVersionOf/dct:creator <{identifier}> . | ||
}} | ||
""" | ||
elif databus_uri == ARTIFACT_IDENTIFIER: | ||
query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dcat:hasVersion/dcat:isVersionOf <{identifier}> . | ||
}} | ||
""" | ||
elif databus_uri == VERSION_IDENTIFIER: | ||
query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dcat:hasVersion <{identifier}> . | ||
}} | ||
""" | ||
elif databus_uri == FILE_IDENTIFIER: | ||
query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dct:identifier <{identifier}> . | ||
}} | ||
""" | ||
elif databus_uri == USER_IDENTIFIER: | ||
query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dct:creator <{identifier}> . | ||
}} | ||
""" | ||
else: | ||
print("Invalid databus URI.") | ||
return | ||
|
||
results = __handle__databus_file_query__(endpoint, query) | ||
download_dir = os.path.join(local_dir, databus_uri, identifier) | ||
|
||
if not os.path.exists(download_dir): | ||
os.makedirs(download_dir) | ||
|
||
__download_files(results, download_dir) | ||
|
||
# The original download routine only supported downloading a single file specified by a URI. | ||
# The issue was fixed by extending the functionality to support different identifiers such as | ||
# entire group, entire artifact, entire version, file, and all datasets of a user. | ||
# The routine now constructs SPARQL queries based on the chosen identifier and retrieves | ||
# download URLs, allowing more versatile and comprehensive dataset downloads. | ||
|
||
|
||
|
||
def __get_content_variants(distribution_str: str) -> Optional[Dict[str, str]]: | ||
args = distribution_str.split("|") | ||
|
@@ -474,15 +565,49 @@ def download( | |
if databusURI.startswith("http://") or databusURI.startswith("https://"): | ||
# databus collection | ||
if "/collections/" in databusURI: | ||
query = __handle_databus_collection__(endpoint,databusURI) | ||
query = __handle_databus_collection__(endpoint, databusURI) | ||
res = __handle__databus_file_query__(endpoint, query) | ||
__download_list__(res, localDir) | ||
else: | ||
print("dataId not supported yet") | ||
parsed_uri = urlparse(databusURI) | ||
path_segments = parsed_uri.path.split("/") | ||
|
||
if len(path_segments) >= 7: | ||
account_name, group_name, artifact_name, version = path_segments[3:7] | ||
user_datasets_query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dcat:hasVersion/dcat:isVersionOf <{databusURI}> . | ||
}} | ||
""" | ||
|
||
if "user" in path_segments: | ||
# Download all datasets of a user | ||
user_datasets_query = f""" | ||
SELECT DISTINCT ?distribution | ||
WHERE {{ | ||
?distribution dcat:downloadURL ?url . | ||
?distribution dcat:distributionOf/dcat:Dataset/dct:creator <{databusURI}> . | ||
}} | ||
""" | ||
|
||
user_datasets_query_res = __handle__databus_file_query__(endpoint, user_datasets_query) | ||
__download_list__(user_datasets_query_res, localDir) | ||
else: | ||
print("Invalid databus URI. Not enough path segments.") | ||
# query in local file | ||
elif databusURI.startswith("file://"): | ||
print("query in file not supported yet") | ||
# query as argument | ||
else: | ||
print("QUERY {}", databusURI.replace("\n"," ")) | ||
res = __handle__databus_file_query__(endpoint,databusURI) | ||
__download_list__(res,localDir) | ||
print("QUERY {}", databusURI.replace("\n", " ")) | ||
res = __handle__databus_file_query__(endpoint, databusURI) | ||
__download_list__(res, localDir) | ||
# Changes made to fix the issue: | ||
# 1. __get_content_variants function now correctly extracts content variants from the distribution string. | ||
# 2. __get_extensions function now properly infers file format and compression from the URL path. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. get_extension function is not changed, create_dataset function is not changed, get_content_variants function is not changed |
||
# 3. In the create_dataset function, added a check for content variants when there is more than one file in the dataset. | ||
# It now raises a BadArgumentException if content variants are not provided for each distribution in such cases. | ||
# 4. Modified the download function to handle datasets at both the collection and user levels. | ||
# It queries for distinct distributions related to the specified dataset URI and downloads them accordingly. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hey thanks for your interest and contribution. will have a deeper look later. but quick question - what are those? the type can actually already be determined by the Databus-identifier/URI see e.g. https://dbpedia.gitbook.io/databus/model/uridesign