Skip to content

Commit

Permalink
fixing encoding for compounds file - default/utf cannot read all characters, cleaning up metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
EvanDietzMorris committed Jan 29, 2024
1 parent deae0c7 commit 71ab0be
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions parsers/chebi/src/loadChebiProperties.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import argparse
from gzip import GzipFile
import gzip

from collections import defaultdict
from Common.utils import GetData
Expand Down Expand Up @@ -30,13 +30,14 @@
class ChebiPropertiesLoader(SourceDataLoader):

# Setting the class level variables for the source ID and provenance
source_id: str = 'ChebiProperties'
provenance_id: str = 'infores:chebi-properties'
description = ""
source_data_url = ""
source_id: str = 'CHEBIProps'
provenance_id: str = None # there aren't edges coming from this source currently
description = "Chemical Entities of Biological Interest (ChEBI) is a freely available dictionary of molecular " \
"entities focused on ‘small’ chemical compounds."
source_data_url = "https://www.ebi.ac.uk/chebi/"
license = ""
attribution = ""
parsing_version = '1.1'
parsing_version = '1.2'
preserve_unconnected_nodes = True

def __init__(self, test_mode: bool = False, source_data_dir: str = None):
Expand Down Expand Up @@ -87,18 +88,17 @@ def parse_data(self):
names = {}
skipped_header = False
archive_file_path = os.path.join(self.data_path, self.compounds_file)
with GzipFile(archive_file_path) as zf:
for bytesline in zf:
with gzip.open(archive_file_path, mode="rt", encoding="iso-8859-1") as zf:
for line in zf:

# skip the header
if not skipped_header:
skipped_header = True
continue

lines = bytesline.decode('utf-8')
line = lines.strip().split('\t')
chebi_id = line[COMPOUNDS_CHEBI_ID_COLUMN]
cname = line[COMPOUNDS_CHEBI_NAME_COLUMN]
compounds_line = line.strip().split('\t')
chebi_id = compounds_line[COMPOUNDS_CHEBI_ID_COLUMN]
cname = compounds_line[COMPOUNDS_CHEBI_NAME_COLUMN]
names[chebi_id] = cname

# init the record counters
Expand Down

0 comments on commit 71ab0be

Please sign in to comment.