Skip to content

Commit

Permalink
Merge pull request #1017 from microbiomedata/1010-award-dois
Browse files — browse the repository at this point in the history
  • Loading branch information
naglepuff authored Sep 14, 2023
2 parents 631c823 + 79657da commit 110e469
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 85 deletions.
11 changes: 5 additions & 6 deletions nmdc_server/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,15 @@ def get_study_image(db: Session, study_id: str) -> Optional[bytes]:
return None


def get_doi(db: Session, doi_id: str) -> Optional[models.DOIInfo]:
    """Look up a cached DOI record by its identifier.

    Returns the matching DOIInfo row, or None when no row with that
    primary key exists.
    """
    return db.query(models.DOIInfo).get(doi_id)


def create_study(db: Session, study: schemas.StudyCreate) -> models.Study:
study_dict = study.dict()

websites = study_dict.pop("principal_investigator_websites")
publications = study_dict.pop("publication_dois")

db_study = models.Study(**study_dict)

Expand All @@ -114,11 +118,6 @@ def create_study(db: Session, study: schemas.StudyCreate) -> models.Study:
study_website = models.StudyWebsite(website=website)
db_study.principal_investigator_websites.append(study_website) # type: ignore

for doi in publications:
publication, _ = get_or_create(db, models.Publication, doi=doi)
study_publication = models.StudyPublication(publication=publication)
db_study.publication_dois.append(study_publication) # type: ignore

db.add(db_study)
db.commit()
db.refresh(db_study)
Expand Down
38 changes: 9 additions & 29 deletions nmdc_server/fakes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ def uuid(self):
return uuid4()


class EnumProvider(BaseProvider):
    """Faker provider that draws a random member from an Enum class."""

    def enum_value(self, enum_class):
        # Iterating an Enum class yields its members; pick one at random.
        return self.random_element(list(enum_class))


db = scoped_session(SessionLocal)
Faker.add_provider(DoiProvider)
Faker.add_provider(date_time)
Expand All @@ -33,6 +39,7 @@ def uuid(self):
Faker.add_provider(misc)
Faker.add_provider(person)
Faker.add_provider(python)
Faker.add_provider(EnumProvider)


class TokenFactory(Factory):
Expand All @@ -56,6 +63,7 @@ class Meta:

id = Faker("doi")
info = Faker("pydict", value_types=["str"])
doi_type = Faker("enum_value", enum_class=models.DOIType)


class AnnotatedFactory(SQLAlchemyModelFactory):
Expand All @@ -78,15 +86,6 @@ class Meta:
sqlalchemy_session = db


# NOTE(review): deleted by this commit — Publication rows are replaced by
# typed DOIInfo records attached directly to studies.
class PublicationFactory(SQLAlchemyModelFactory):
    # Surrogate UUID key; the DOI data itself lives on the generated doi_object.
    id = Faker("uuid")
    doi_object = SubFactory(DOIInfoFactory)

    class Meta:
        model = models.Publication
        sqlalchemy_session = db


class EnvoTermFactory(SQLAlchemyModelFactory):
id = Faker("pystr")
label = Faker("word")
Expand Down Expand Up @@ -130,8 +129,8 @@ class StudyFactory(AnnotatedFactory):
gold_description = Faker("sentence")
scientific_objective = Faker("sentence")
principal_investigator = SubFactory(PrincipalInvestigator)
doi_object = SubFactory(DOIInfoFactory)
image = Faker("binary", length=64)
dois: List[models.DOIInfo] = []

class Meta:
model = models.Study
Expand All @@ -148,17 +147,6 @@ def principal_investigator_websites(self, create, extracted, **kwargs):
for website in extracted:
self.principal_investigator_websites.append(website)

@post_generation
def publication_dois(self, create, extracted, **kwargs):
if not create:
return

if not extracted:
extracted = [StudyPublicationFactory(), StudyPublicationFactory()]

for publication in extracted:
self.publication_dois.append(publication)


class StudyWebsiteFactory(SQLAlchemyModelFactory):
website = SubFactory(WebsiteFactory)
Expand All @@ -168,14 +156,6 @@ class Meta:
sqlalchemy_session = db


# NOTE(review): deleted by this commit together with the StudyPublication model.
class StudyPublicationFactory(SQLAlchemyModelFactory):
    # Association row linking a Study to a Publication; the Publication
    # (and its DOIInfo) is generated via SubFactory.
    publication = SubFactory(PublicationFactory)

    class Meta:
        model = models.StudyPublication
        sqlalchemy_session = db


class BiosampleFactory(AnnotatedFactory):
class Meta:
model = models.Biosample
Expand Down
6 changes: 3 additions & 3 deletions nmdc_server/ingest/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sqlalchemy.orm import Session

from nmdc_server.logger import get_logger
from nmdc_server.models import DOIInfo
from nmdc_server.models import DOIInfo, DOIType

retry_strategy = Retry(total=10)
adapter = HTTPAdapter(max_retries=retry_strategy)
Expand All @@ -24,7 +24,7 @@ def get_doi_info(doi: str) -> Response:
return requests.get(url, headers=headers, timeout=60)


def upsert_doi(db: Session, doi: str):
def upsert_doi(db: Session, doi: str, doi_type: DOIType):
logger = get_logger(__name__)
# Try really hard to get doi data... the doi.org service is very unreliable.
try:
Expand All @@ -43,7 +43,7 @@ def upsert_doi(db: Session, doi: str):
return
info = {}

statement = insert(DOIInfo.__table__).values(id=doi, info=info)
statement = insert(DOIInfo.__table__).values(id=doi, info=info, doi_type=doi_type)
statement = statement.on_conflict_do_update(constraint="pk_doi_info", set_=dict(info=info))
db.execute(statement)
db.flush()
33 changes: 23 additions & 10 deletions nmdc_server/ingest/study.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from pymongo.cursor import Cursor
from sqlalchemy.orm import Session

from nmdc_server.crud import create_study
from nmdc_server.crud import create_study, get_doi
from nmdc_server.ingest.common import extract_extras, extract_value
from nmdc_server.ingest.doi import upsert_doi
from nmdc_server.models import PrincipalInvestigator
from nmdc_server.models import DOIType, PrincipalInvestigator
from nmdc_server.schemas import StudyCreate


Expand Down Expand Up @@ -65,13 +65,26 @@ def load(db: Session, cursor: Cursor):
obj["principal_investigator_websites"] = obj.pop("websites", [])
obj["image"] = get_study_image_data(obj.pop("study_image", []))

obj["publication_dois"] = [transform_doi(d) for d in obj.pop("publications", [])]
if "doi" in obj:
obj["doi"]["has_raw_value"] = transform_doi(obj["doi"]["has_raw_value"])
publication_dois = [transform_doi(d) for d in obj.pop("publications", [])] + [
transform_doi(d) for d in obj.pop("publication_dois", [])
]
award_dois = [transform_doi(doi) for doi in obj.pop("award_dois", [])] + [
transform_doi(d) for d in obj.pop("emsl_project_dois", [])
]
dataset_dois = [transform_doi(doi) for doi in obj.pop("dataset_dois", [])]

if "doi" in obj:
upsert_doi(db, obj["doi"]["has_raw_value"])
for doi in obj.get("publication_dois", []):
upsert_doi(db, doi)
for doi in publication_dois:
upsert_doi(db, doi, DOIType.PUBLICATION)

create_study(db, Study(**obj))
for doi in award_dois:
upsert_doi(db, doi, DOIType.AWARD)

for doi in dataset_dois:
upsert_doi(db, doi, DOIType.DATASET)

new_study = create_study(db, Study(**obj))

for doi_id in publication_dois + award_dois + dataset_dois:
doi_object = get_doi(db, doi_id)
if doi_object:
new_study.dois.append(doi_object) # type: ignore
80 changes: 80 additions & 0 deletions nmdc_server/migrations/versions/1de891717fc0_multivalued_dois.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""multivalued DOIs
Revision ID: 1de891717fc0
Revises: dad555bb9212
Create Date: 2023-08-23 19:22:15.660679
"""
from typing import Optional

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision: str = "1de891717fc0"
down_revision: Optional[str] = "dad555bb9212"
branch_labels: Optional[str] = None
depends_on: Optional[str] = None


def upgrade():
    """Replace the publication tables with a typed study<->DOI association.

    Statement order matters here: study_publication is dropped before
    publication (it holds a foreign key into publication), and the
    "doitype" enum type is created before the doi_type column that uses it.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "study_doi_association",
        sa.Column("study_id", sa.String(), nullable=False),
        sa.Column("doi_id", sa.String(), nullable=False),
        sa.ForeignKeyConstraint(
            ["doi_id"], ["doi_info.id"], name=op.f("fk_study_doi_association_doi_id_doi_info")
        ),
        sa.ForeignKeyConstraint(
            ["study_id"], ["study.id"], name=op.f("fk_study_doi_association_study_id_study")
        ),
        sa.PrimaryKeyConstraint("study_id", "doi_id", name=op.f("pk_study_doi_association")),
    )
    op.drop_table("study_publication")
    op.drop_table("publication")

    # Create the PostgreSQL enum type explicitly so the column below can
    # reference an existing type.
    doitype = postgresql.ENUM("AWARD", "DATASET", "PUBLICATION", name="doitype")
    doitype.create(op.get_bind())

    op.add_column(
        "doi_info",
        sa.Column(
            "doi_type", sa.Enum("AWARD", "DATASET", "PUBLICATION", name="doitype"), nullable=True
        ),
    )
    # The single-valued study.doi column is superseded by the association table.
    op.drop_constraint("fk_study_doi_doi_info", "study", type_="foreignkey")
    op.drop_column("study", "doi")
    # ### end Alembic commands ###


def downgrade():
    """Revert to the single-DOI study schema and the publication tables.

    Restores the schema only; rows from study_doi_association are not
    migrated back into publication/study_publication.
    """
    op.add_column("study", sa.Column("doi", sa.VARCHAR(), autoincrement=False, nullable=True))
    op.create_foreign_key("fk_study_doi_doi_info", "study", "doi_info", ["doi"], ["id"])
    op.drop_column("doi_info", "doi_type")

    # Drop the enum type created by upgrade(); leaving it behind would make a
    # subsequent upgrade fail because CREATE TYPE "doitype" already exists.
    doitype = postgresql.ENUM("AWARD", "DATASET", "PUBLICATION", name="doitype")
    doitype.drop(op.get_bind())

    # "publication" must be created before "study_publication", whose foreign
    # key references publication.id — the auto-generated order created them the
    # other way around, which fails in PostgreSQL.
    op.create_table(
        "publication",
        sa.Column("id", postgresql.UUID(), autoincrement=False, nullable=False),
        sa.Column("doi", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.ForeignKeyConstraint(["doi"], ["doi_info.id"], name="fk_publication_doi_doi_info"),
        sa.PrimaryKeyConstraint("id", name="pk_publication"),
        sa.UniqueConstraint("doi", name="uq_publication_doi"),
    )
    op.create_table(
        "study_publication",
        sa.Column("study_id", sa.VARCHAR(), autoincrement=False, nullable=False),
        sa.Column("publication_id", postgresql.UUID(), autoincrement=False, nullable=False),
        sa.ForeignKeyConstraint(
            ["publication_id"],
            ["publication.id"],
            name="fk_study_publication_publication_id_publication",
        ),
        sa.ForeignKeyConstraint(
            ["study_id"], ["study.id"], name="fk_study_publication_study_id_study"
        ),
        sa.PrimaryKeyConstraint("study_id", "publication_id", name="pk_study_publication"),
    )
    op.drop_table("study_doi_association")
67 changes: 38 additions & 29 deletions nmdc_server/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import enum
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Type, Union
from uuid import uuid4
Expand All @@ -8,6 +9,7 @@
CheckConstraint,
Column,
DateTime,
Enum,
Float,
ForeignKey,
Integer,
Expand Down Expand Up @@ -178,6 +180,20 @@ class PrincipalInvestigator(Base):
image = Column(LargeBinary, nullable=True)


class DOIType(enum.Enum):
    """Category of a DOI attached to a study (stored in doi_info.doi_type)."""

    AWARD = "award"
    DATASET = "dataset"
    PUBLICATION = "publication"


# Many-to-many join table between studies and their cached DOI records.
study_doi_association = Table(
    "study_doi_association",
    Base.metadata,
    Column("study_id", ForeignKey("study.id"), primary_key=True),
    Column("doi_id", ForeignKey("doi_info.id"), primary_key=True),
)


# Caches information from doi.org
class DOIInfo(Base):
__tablename__ = "doi_info"
Expand All @@ -188,6 +204,8 @@ class DOIInfo(Base):
primary_key=True,
)
info = Column(JSONB, nullable=False, default=dict)
doi_type = Column(Enum(DOIType))
studies = relationship("Study", secondary=study_doi_association, back_populates="dois")


class AnnotatedModel:
Expand All @@ -207,7 +225,7 @@ class Study(Base, AnnotatedModel):
gold_name = Column(String, nullable=False, default="")
gold_description = Column(String, nullable=False, default="")
scientific_objective = Column(String, nullable=False, default="")
doi = Column(String, ForeignKey("doi_info.id"), nullable=True)
dois = relationship("DOIInfo", secondary=study_doi_association, back_populates="studies")
multiomics = Column(Integer, nullable=False, default=0)

# TODO migrate these into relations or something
Expand Down Expand Up @@ -244,10 +262,6 @@ def image_url(self):
return ""

principal_investigator_websites = relationship("StudyWebsite", cascade="all", lazy="joined")
publication_dois = relationship("StudyPublication", cascade="all", lazy="joined")
doi_object = relationship("DOIInfo", cascade="all", lazy="joined")

doi_info = association_proxy("doi_object", "info")

@property
def open_in_gold(self) -> Optional[str]:
Expand All @@ -257,14 +271,27 @@ def open_in_gold(self) -> Optional[str]:
self.gold_study_identifiers, # type: ignore
)

@property
def award_dois(self) -> list[DOIInfo]:
return [d for d in self.dois if d.doi_type == DOIType.AWARD] # type: ignore

@property
def publication_dois(self) -> list[DOIInfo]:
return [d for d in self.dois if d.doi_type == DOIType.PUBLICATION] # type: ignore

@property
def dataset_dois(self) -> list[DOIInfo]:
return [d for d in self.dois if d.doi_type == DOIType.DATASET] # type: ignore

@property
def doi_map(self) -> Dict[str, Any]:
doi_info = {
d.publication.doi: d.publication.doi_object.info
for d in self.publication_dois # type: ignore
}
if self.doi:
doi_info[self.doi] = self.doi_info
doi_info = {}
for award_doi in self.award_dois:
doi_info[award_doi.id] = award_doi.info
for publication_doi in self.publication_dois:
doi_info[publication_doi.id] = publication_doi.info
for dataset_doi in self.dataset_dois:
doi_info[dataset_doi.id] = dataset_doi.info
return doi_info


Expand Down Expand Up @@ -598,24 +625,6 @@ class StudyWebsite(Base):
website = relationship(Website, cascade="all")


# NOTE(review): deleted by this commit — replaced by DOIInfo rows carrying a
# doi_type and linked to studies through study_doi_association.
class Publication(Base):
    __tablename__ = "publication"

    # Surrogate UUID primary key; the DOI string itself is unique.
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid4)
    doi = Column(String, ForeignKey("doi_info.id"), nullable=False, unique=True)

    # Eagerly-loaded cached doi.org metadata for this DOI.
    doi_object = relationship("DOIInfo", cascade="all", lazy="joined")


# NOTE(review): deleted by this commit together with the Publication model.
class StudyPublication(Base):
    __tablename__ = "study_publication"

    # Composite primary key: one row per (study, publication) pair.
    study_id = Column(String, ForeignKey("study.id"), primary_key=True)
    publication_id = Column(UUID(as_uuid=True), ForeignKey("publication.id"), primary_key=True)

    publication = relationship(Publication, cascade="all")


# This table contains KO terms detected in metagenome and metaproteomic workflow
# activities
class GeneFunction(Base):
Expand Down
Loading

0 comments on commit 110e469

Please sign in to comment.