diff --git a/nmdc_server/crud.py b/nmdc_server/crud.py
index b929dc7a..9d0caffd 100644
--- a/nmdc_server/crud.py
+++ b/nmdc_server/crud.py
@@ -101,11 +101,15 @@ def get_study_image(db: Session, study_id: str) -> Optional[bytes]:
     return None
 
 
+def get_doi(db: Session, doi_id: str) -> Optional[models.DOIInfo]:
+    """Return the DOIInfo row whose primary key is ``doi_id``, or None if there is none."""
+    return db.query(models.DOIInfo).get(doi_id)
+
+
 def create_study(db: Session, study: schemas.StudyCreate) -> models.Study:
     study_dict = study.dict()
 
     websites = study_dict.pop("principal_investigator_websites")
-    publications = study_dict.pop("publication_dois")
 
     db_study = models.Study(**study_dict)
 
@@ -114,11 +118,6 @@ def create_study(db: Session, study: schemas.StudyCreate) -> models.Study:
         study_website = models.StudyWebsite(website=website)
         db_study.principal_investigator_websites.append(study_website)  # type: ignore
 
-    for doi in publications:
-        publication, _ = get_or_create(db, models.Publication, doi=doi)
-        study_publication = models.StudyPublication(publication=publication)
-        db_study.publication_dois.append(study_publication)  # type: ignore
-
     db.add(db_study)
     db.commit()
     db.refresh(db_study)
diff --git a/nmdc_server/fakes.py b/nmdc_server/fakes.py
index 22fede1c..2fd6dd4b 100644
--- a/nmdc_server/fakes.py
+++ b/nmdc_server/fakes.py
@@ -24,6 +24,12 @@ def uuid(self):
         return uuid4()
 
 
+class EnumProvider(BaseProvider):
+    def enum_value(self, enum_class):
+        """Return a randomly chosen member of ``enum_class``."""
+        return self.random_element(list(enum_class))
+
+
 db = scoped_session(SessionLocal)
 Faker.add_provider(DoiProvider)
 Faker.add_provider(date_time)
@@ -33,6 +39,7 @@ def uuid(self):
 Faker.add_provider(misc)
 Faker.add_provider(person)
 Faker.add_provider(python)
+Faker.add_provider(EnumProvider)
 
 
 class TokenFactory(Factory):
@@ -56,6 +63,7 @@ class Meta:
 
     id = Faker("doi")
     info = Faker("pydict", value_types=["str"])
+    doi_type = Faker("enum_value", enum_class=models.DOIType)
 
 
 class AnnotatedFactory(SQLAlchemyModelFactory):
@@ -78,15 +86,6 @@ class Meta:
         sqlalchemy_session = db
 
 
-class PublicationFactory(SQLAlchemyModelFactory):
-    id = Faker("uuid")
-    doi_object = SubFactory(DOIInfoFactory)
-
-    class Meta:
-        model = models.Publication
-        sqlalchemy_session = db
-
-
 class EnvoTermFactory(SQLAlchemyModelFactory):
     id = Faker("pystr")
     label = Faker("word")
@@ -130,8 +129,8 @@ class StudyFactory(AnnotatedFactory):
     gold_description = Faker("sentence")
     scientific_objective = Faker("sentence")
     principal_investigator = SubFactory(PrincipalInvestigator)
-    doi_object = SubFactory(DOIInfoFactory)
     image = Faker("binary", length=64)
+    dois: List[models.DOIInfo] = []
 
     class Meta:
         model = models.Study
@@ -148,17 +147,6 @@ def principal_investigator_websites(self, create, extracted, **kwargs):
         for website in extracted:
             self.principal_investigator_websites.append(website)
 
-    @post_generation
-    def publication_dois(self, create, extracted, **kwargs):
-        if not create:
-            return
-
-        if not extracted:
-            extracted = [StudyPublicationFactory(), StudyPublicationFactory()]
-
-        for publication in extracted:
-            self.publication_dois.append(publication)
-
 
 class StudyWebsiteFactory(SQLAlchemyModelFactory):
     website = SubFactory(WebsiteFactory)
@@ -168,14 +156,6 @@ class Meta:
         sqlalchemy_session = db
 
 
-class StudyPublicationFactory(SQLAlchemyModelFactory):
-    publication = SubFactory(PublicationFactory)
-
-    class Meta:
-        model = models.StudyPublication
-        sqlalchemy_session = db
-
-
 class BiosampleFactory(AnnotatedFactory):
     class Meta:
         model = models.Biosample
diff --git a/nmdc_server/ingest/doi.py b/nmdc_server/ingest/doi.py
index eb0d3a95..ac8ce753 100644
--- a/nmdc_server/ingest/doi.py
+++ b/nmdc_server/ingest/doi.py
@@ -8,7 +8,7 @@
 from sqlalchemy.orm import Session
 
 from nmdc_server.logger import get_logger
-from nmdc_server.models import DOIInfo
+from nmdc_server.models import DOIInfo, DOIType
 
 retry_strategy = Retry(total=10)
 adapter = HTTPAdapter(max_retries=retry_strategy)
@@ -24,7 +24,7 @@ def get_doi_info(doi: str) -> Response:
     return requests.get(url, headers=headers, timeout=60)
 
 
-def upsert_doi(db: Session, doi: str):
+def upsert_doi(db: Session, doi: str, doi_type: DOIType):
     logger = get_logger(__name__)
     # Try really hard to get doi data... the doi.org service is very unreliable.
     try:
@@ -43,7 +43,12 @@ def upsert_doi(db: Session, doi: str):
             return
         info = {}
 
-    statement = insert(DOIInfo.__table__).values(id=doi, info=info)
-    statement = statement.on_conflict_do_update(constraint="pk_doi_info", set_=dict(info=info))
+    statement = insert(DOIInfo.__table__).values(id=doi, info=info, doi_type=doi_type)
+    # Update doi_type as well as info on conflict; rows that predate the doi_type column
+    # would otherwise keep a NULL type.
+    statement = statement.on_conflict_do_update(
+        constraint="pk_doi_info",
+        set_=dict(info=info, doi_type=doi_type),
+    )
     db.execute(statement)
     db.flush()
diff --git a/nmdc_server/ingest/study.py b/nmdc_server/ingest/study.py
index 606f2354..8f059ffb 100644
--- a/nmdc_server/ingest/study.py
+++ b/nmdc_server/ingest/study.py
@@ -6,10 +6,10 @@
 from pymongo.cursor import Cursor
 from sqlalchemy.orm import Session
 
-from nmdc_server.crud import create_study
+from nmdc_server.crud import create_study, get_doi
 from nmdc_server.ingest.common import extract_extras, extract_value
 from nmdc_server.ingest.doi import upsert_doi
-from nmdc_server.models import PrincipalInvestigator
+from nmdc_server.models import DOIType, PrincipalInvestigator
 from nmdc_server.schemas import StudyCreate
 
 
@@ -65,13 +65,28 @@ def load(db: Session, cursor: Cursor):
         obj["principal_investigator_websites"] = obj.pop("websites", [])
         obj["image"] = get_study_image_data(obj.pop("study_image", []))
 
-        obj["publication_dois"] = [transform_doi(d) for d in obj.pop("publications", [])]
-        if "doi" in obj:
-            obj["doi"]["has_raw_value"] = transform_doi(obj["doi"]["has_raw_value"])
+        publication_dois = [transform_doi(d) for d in obj.pop("publications", [])] + [
+            transform_doi(d) for d in obj.pop("publication_dois", [])
+        ]
+        award_dois = [transform_doi(doi) for doi in obj.pop("award_dois", [])] + [
+            transform_doi(d) for d in obj.pop("emsl_project_dois", [])
+        ]
+        dataset_dois = [transform_doi(doi) for doi in obj.pop("dataset_dois", [])]
 
-        if "doi" in obj:
-            upsert_doi(db, obj["doi"]["has_raw_value"])
-        for doi in obj.get("publication_dois", []):
-            upsert_doi(db, doi)
+        for doi in publication_dois:
+            upsert_doi(db, doi, DOIType.PUBLICATION)
 
-        create_study(db, Study(**obj))
+        for doi in award_dois:
+            upsert_doi(db, doi, DOIType.AWARD)
+
+        for doi in dataset_dois:
+            upsert_doi(db, doi, DOIType.DATASET)
+
+        new_study = create_study(db, Study(**obj))
+
+        # The same DOI may arrive under more than one source field; link it only once so the
+        # (study_id, doi_id) association row is not inserted twice.
+        for doi_id in dict.fromkeys(publication_dois + award_dois + dataset_dois):
+            doi_object = get_doi(db, doi_id)
+            if doi_object:
+                new_study.dois.append(doi_object)  # type: ignore
diff --git a/nmdc_server/migrations/versions/1de891717fc0_multivalued_dois.py b/nmdc_server/migrations/versions/1de891717fc0_multivalued_dois.py
new file mode 100644
index 00000000..3de54732
--- /dev/null
+++ b/nmdc_server/migrations/versions/1de891717fc0_multivalued_dois.py
@@ -0,0 +1,84 @@
+"""multivalued DOIs
+
+Revision ID: 1de891717fc0
+Revises: dad555bb9212
+Create Date: 2023-08-23 19:22:15.660679
+
+"""
+from typing import Optional
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = "1de891717fc0"
+down_revision: Optional[str] = "dad555bb9212"
+branch_labels: Optional[str] = None
+depends_on: Optional[str] = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "study_doi_association",
+        sa.Column("study_id", sa.String(), nullable=False),
+        sa.Column("doi_id", sa.String(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["doi_id"], ["doi_info.id"], name=op.f("fk_study_doi_association_doi_id_doi_info")
+        ),
+        sa.ForeignKeyConstraint(
+            ["study_id"], ["study.id"], name=op.f("fk_study_doi_association_study_id_study")
+        ),
+        sa.PrimaryKeyConstraint("study_id", "doi_id", name=op.f("pk_study_doi_association")),
+    )
+    op.drop_table("study_publication")
+    op.drop_table("publication")
+
+    doitype = postgresql.ENUM("AWARD", "DATASET", "PUBLICATION", name="doitype")
+    doitype.create(op.get_bind())
+
+    op.add_column(
+        "doi_info",
+        sa.Column(
+            "doi_type", sa.Enum("AWARD", "DATASET", "PUBLICATION", name="doitype"), nullable=True
+        ),
+    )
+    op.drop_constraint("fk_study_doi_doi_info", "study", type_="foreignkey")
+    op.drop_column("study", "doi")
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column("study", sa.Column("doi", sa.VARCHAR(), autoincrement=False, nullable=True))
+    op.create_foreign_key("fk_study_doi_doi_info", "study", "doi_info", ["doi"], ["id"])
+    op.drop_column("doi_info", "doi_type")
+    # upgrade() creates the "doitype" enum explicitly, so drop it explicitly here; dropping
+    # the column alone would leave the type behind in the database.
+    postgresql.ENUM("AWARD", "DATASET", "PUBLICATION", name="doitype").drop(op.get_bind())
+    # "publication" must be created before "study_publication", whose foreign key references it.
+    op.create_table(
+        "publication",
+        sa.Column("id", postgresql.UUID(), autoincrement=False, nullable=False),
+        sa.Column("doi", sa.VARCHAR(), autoincrement=False, nullable=False),
+        sa.ForeignKeyConstraint(["doi"], ["doi_info.id"], name="fk_publication_doi_doi_info"),
+        sa.PrimaryKeyConstraint("id", name="pk_publication"),
+        sa.UniqueConstraint("doi", name="uq_publication_doi"),
+    )
+    op.create_table(
+        "study_publication",
+        sa.Column("study_id", sa.VARCHAR(), autoincrement=False, nullable=False),
+        sa.Column("publication_id", postgresql.UUID(), autoincrement=False, nullable=False),
+        sa.ForeignKeyConstraint(
+            ["publication_id"],
+            ["publication.id"],
+            name="fk_study_publication_publication_id_publication",
+        ),
+        sa.ForeignKeyConstraint(
+            ["study_id"], ["study.id"], name="fk_study_publication_study_id_study"
+        ),
+        sa.PrimaryKeyConstraint("study_id", "publication_id", name="pk_study_publication"),
+    )
+    op.drop_table("study_doi_association")
+    # ### end Alembic commands ###
diff --git a/nmdc_server/models.py b/nmdc_server/models.py
index aec33920..b3f940d4 100644
--- a/nmdc_server/models.py
+++ b/nmdc_server/models.py
@@ -1,3 +1,4 @@
+import enum
 from datetime import datetime
 from typing import Any, Dict, Iterator, List, Optional, Type, Union
 from uuid import uuid4
@@ -8,6 +9,7 @@
     CheckConstraint,
     Column,
     DateTime,
+    Enum,
     Float,
     ForeignKey,
     Integer,
@@ -178,6 +180,20 @@ class PrincipalInvestigator(Base):
     image = Column(LargeBinary, nullable=True)
 
 
+class DOIType(enum.Enum):
+    AWARD = "award"
+    DATASET = "dataset"
+    PUBLICATION = "publication"
+
+
+study_doi_association = Table(
+    "study_doi_association",
+    Base.metadata,
+    Column("study_id", ForeignKey("study.id"), primary_key=True),
+    Column("doi_id", ForeignKey("doi_info.id"), primary_key=True),
+)
+
+
 # Caches information from doi.org
 class DOIInfo(Base):
     __tablename__ = "doi_info"
@@ -188,6 +204,8 @@ class DOIInfo(Base):
         primary_key=True,
     )
     info = Column(JSONB, nullable=False, default=dict)
+    doi_type = Column(Enum(DOIType))
+    studies = relationship("Study", secondary=study_doi_association, back_populates="dois")
 
 
 class AnnotatedModel:
@@ -207,7 +225,7 @@ class Study(Base, AnnotatedModel):
     gold_name = Column(String, nullable=False, default="")
     gold_description = Column(String, nullable=False, default="")
     scientific_objective = Column(String, nullable=False, default="")
-    doi = Column(String, ForeignKey("doi_info.id"), nullable=True)
+    dois = relationship("DOIInfo", secondary=study_doi_association, back_populates="studies")
     multiomics = Column(Integer, nullable=False, default=0)
 
     # TODO migrate these into relations or something
@@ -244,10 +262,6 @@ def image_url(self):
         return ""
 
     principal_investigator_websites = relationship("StudyWebsite", cascade="all", lazy="joined")
-    publication_dois = relationship("StudyPublication", cascade="all", lazy="joined")
-    doi_object = relationship("DOIInfo", cascade="all", lazy="joined")
-
-    doi_info = association_proxy("doi_object", "info")
 
     @property
     def open_in_gold(self) -> Optional[str]:
@@ -257,14 +271,27 @@ def open_in_gold(self) -> Optional[str]:
             self.gold_study_identifiers,  # type: ignore
         )
 
+    @property
+    def award_dois(self) -> List[DOIInfo]:
+        return [d for d in self.dois if d.doi_type == DOIType.AWARD]  # type: ignore
+
+    @property
+    def publication_dois(self) -> List[DOIInfo]:
+        return [d for d in self.dois if d.doi_type == DOIType.PUBLICATION]  # type: ignore
+
+    @property
+    def dataset_dois(self) -> List[DOIInfo]:
+        return [d for d in self.dois if d.doi_type == DOIType.DATASET]  # type: ignore
+
     @property
     def doi_map(self) -> Dict[str, Any]:
-        doi_info = {
-            d.publication.doi: d.publication.doi_object.info
-            for d in self.publication_dois  # type: ignore
-        }
-        if self.doi:
-            doi_info[self.doi] = self.doi_info
+        doi_info = {}
+        for award_doi in self.award_dois:
+            doi_info[award_doi.id] = award_doi.info
+        for publication_doi in self.publication_dois:
+            doi_info[publication_doi.id] = publication_doi.info
+        for dataset_doi in self.dataset_dois:
+            doi_info[dataset_doi.id] = dataset_doi.info
         return doi_info
 
 
@@ -598,24 +625,6 @@ class StudyWebsite(Base):
     website = relationship(Website, cascade="all")
 
 
-class Publication(Base):
-    __tablename__ = "publication"
-
-    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid4)
-    doi = Column(String, ForeignKey("doi_info.id"), nullable=False, unique=True)
-
-    doi_object = relationship("DOIInfo", cascade="all", lazy="joined")
-
-
-class StudyPublication(Base):
-    __tablename__ = "study_publication"
-
-    study_id = Column(String, ForeignKey("study.id"), primary_key=True)
-    publication_id = Column(UUID(as_uuid=True), ForeignKey("publication.id"), primary_key=True)
-
-    publication = relationship(Publication, cascade="all")
-
-
 # This table contains KO terms detected in metagenome and metaproteomic workflow
 # activities
 class GeneFunction(Base):
diff --git a/nmdc_server/schemas.py b/nmdc_server/schemas.py
index 50e1afeb..9a8cd6c4 100644
--- a/nmdc_server/schemas.py
+++ b/nmdc_server/schemas.py
@@ -213,15 +213,22 @@ class CreditAssociation(BaseModel):
     applies_to_person: OrcidPerson
 
 
+class DOIInfo(BaseModel):
+    id: str
+    info: dict
+    doi_type: models.DOIType
+
+    class Config:
+        orm_mode = True
+
+
 class StudyBase(AnnotatedBase):
     principal_investigator_websites: List[str] = []
-    publication_dois: List[str] = []
     gold_name: str = ""
     gold_description: str = ""
     scientific_objective: str = ""
     add_date: Optional[DateType]
     mod_date: Optional[DateType]
-    doi: Optional[str]
     has_credit_associations: Optional[List[CreditAssociation]]
     relevant_protocols: Optional[List[str]]
     funding_sources: Optional[List[str]]
@@ -235,12 +242,6 @@ def replace_websites(cls, study_website: Union[models.StudyWebsite, str]) -> str
             return study_website
         return study_website.website.url
 
-    @validator("publication_dois", pre=True, each_item=True)
-    def replace_dois(cls, study_publication: Union[models.StudyPublication, str]) -> str:
-        if isinstance(study_publication, str):
-            return study_publication
-        return study_publication.publication.doi
-
 
 class StudyCreate(StudyBase):
     principal_investigator_id: UUID
@@ -268,6 +269,10 @@ class Study(StudyBase):
     doi_map: Dict[str, Any] = {}
     multiomics: int
 
+    award_dois: Optional[List[DOIInfo]]
+    publication_dois: Optional[List[DOIInfo]]
+    dataset_dois: Optional[List[DOIInfo]]
+
     class Config:
         orm_mode = True
 