Skip to content

Commit

Permalink
CU-2e77a98: Model pack django ORM model, admin functionality. A new M…
Browse files Browse the repository at this point in the history
…odelPack unpacks, test loads the CDB, Vocab, MetaTask models and loads their definitions into Django models if they don't already exist
  • Loading branch information
tomolopolis committed Jun 5, 2024
1 parent 6c97269 commit 28e2ffc
Show file tree
Hide file tree
Showing 8 changed files with 227 additions and 5 deletions.
2 changes: 2 additions & 0 deletions webapp/api/api/admin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@
admin.site.register(ExportedProject, ExportedProjectAdmin)
admin.site.register(ProjectMetrics, ProjectMetricsAdmin)
admin.site.register(Dataset, DatasetAdmin)
# Make model packs and MetaCAT models manageable from the Django admin site.
admin.site.register(ModelPack, ModelPackAdmin)
admin.site.register(MetaCATModel, MetaCATModelAdmin)
15 changes: 15 additions & 0 deletions webapp/api/api/admin/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,21 @@ class ConceptDBAdmin(admin.ModelAdmin):
actions = [import_concepts, delete_indexed_concepts, reset_cdb_filters]


class ModelPackAdmin(admin.ModelAdmin):
    """Admin configuration for ``ModelPack``.

    Only ``name`` and ``model_pack`` are editable in the form; the change list
    additionally shows the linked concept DB, vocab and MetaCAT models.
    """
    model = ModelPack
    list_display = ('name', 'model_pack', 'concept_db', 'vocab', 'metacats')
    fields = ['name', 'model_pack']

    def metacats(self, obj):
        """Return the pack's linked MetaCAT models as one comma-separated string."""
        labels = map(str, obj.meta_cats.all())
        return ", ".join(labels)


class MetaCATModelAdmin(admin.ModelAdmin):
    """Admin configuration for ``MetaCATModel``."""
    model = MetaCATModel
    # Columns shown in the change list.
    list_display = ('name', 'meta_cat_dir')
    # Sidebar filter on the associated meta task.
    list_filter = ['meta_task']


class DocumentAdmin(admin.ModelAdmin):
model = Document
actions = [remove_all_documents]
Expand Down
1 change: 1 addition & 0 deletions webapp/api/api/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class ApiConfig(AppConfig):
def ready(self):
from api.views import _submit_document
from api.models import ProjectAnnotateEntities
from . import signals
resubmit_all = os.environ.get('RESUBMIT_ALL_ON_STARTUP', None)
if resubmit_all is not None and resubmit_all.lower() in ('1', 'y', 'true'):
logger.info('Found env var RESUBMIT_ALL_ON_STARTUP is True. '
Expand Down
34 changes: 34 additions & 0 deletions webapp/api/api/migrations/0078_metacatmodel_modelpack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Generated by Django 2.2.28 on 2024-06-05 23:39

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Adds the MetaCATModel and ModelPack models on top of migration 0077.

    dependencies = [
        ('api', '0077_projectgroup_create_associated_projects'),
    ]

    operations = [
        # MetaCATModel is created first because ModelPack.meta_cats refers to it.
        migrations.CreateModel(
            name='MetaCATModel',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100)),
                ('meta_cat_dir', models.FilePathField(allow_folders=True, help_text='The zip or dir for a MetaCAT model')),
                # Deleting the MetaTask nulls this reference instead of deleting the row.
                ('meta_task', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='api.MetaTask')),
            ],
        ),
        migrations.CreateModel(
            name='ModelPack',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.TextField()),
                ('model_pack', models.FileField(help_text='Model pack zip', upload_to='')),
                # CASCADE: deleting the referenced ConceptDB / Vocabulary deletes the ModelPack row too.
                ('concept_db', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='api.ConceptDB')),
                ('meta_cats', models.ManyToManyField(blank=True, default=None, to='api.MetaCATModel')),
                ('vocab', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='api.Vocabulary')),
            ],
        ),
    ]
139 changes: 135 additions & 4 deletions webapp/api/api/models.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
import logging
import os
import shutil
from zipfile import BadZipFile

import pandas as pd
from django.conf import settings
from django.core.exceptions import ValidationError
from django.core.validators import RegexValidator
from django.db import models
from django.db.models import DO_NOTHING, SET_NULL
from django.dispatch import receiver
from django.forms import forms, ModelForm
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from polymorphic.models import PolymorphicModel

from core.settings import MEDIA_ROOT

STATUS_CHOICES = [
(0, 'Not Validated'),
(1, 'Validated'),
Expand All @@ -22,27 +32,93 @@

# Restricts CDB names to ASCII letters, digits, '_' and '-'; the `*` quantifier
# means the empty string also passes.
# NOTE(review): the message misspells "alphanumeric". Validators are normally
# serialised into field migrations, so correcting the text will presumably make
# `makemigrations` emit a new migration - confirm before changing it.
cdb_name_validator = RegexValidator(r'^[0-9a-zA-Z_-]*$', 'Only alpahanumeric characters, -, _ are allowed for CDB names')

logger = logging.getLogger(__name__)


class ModelPack(models.Model):
    """An uploaded MedCAT model pack zip and the component records created from it.

    Saving unpacks the zip next to the uploaded file, test-loads the CDB, Vocab
    and MetaCAT models found inside, and creates ``ConceptDB``, ``Vocabulary``
    and ``MetaCATModel`` rows that point at the unpacked files.

    NOTE(review): every call to ``save`` builds fresh component rows, so
    re-saving an existing pack leaves the previously created rows behind.
    """
    name = models.TextField(help_text='')
    model_pack = models.FileField(help_text='Model pack zip')
    concept_db = models.ForeignKey('ConceptDB', on_delete=models.CASCADE, blank=True, null=True)
    vocab = models.ForeignKey('Vocabulary', on_delete=models.CASCADE, blank=True, null=True)
    meta_cats = models.ManyToManyField('MetaCATModel', blank=True, default=None)

    def save(self, *args, **kwargs):
        """Persist the pack, unpack it and register its components.

        Raises:
            FileNotFoundError: the CDB could not be loaded from the unpacked
                pack, or the pack contains no ``vocab.dat``.
            MedCATLoadException: a MetaCAT model could not be loaded or
                registered.
        """
        # The first save stores the uploaded zip, so that `model_pack.path`
        # refers to a file that can be unpacked below.
        super().save(*args, **kwargs)
        logger.info('Loading model pack: %s', self.model_pack)
        try:
            CAT.attempt_unpack(self.model_pack.path)
        except BadZipFile as exc:
            # potential for CRC-32 errors in Trainer process - ignore and still use
            logger.warning(f'Possibly corrupt cdb.dat decompressing {self.model_pack}\nFull Exception: {exc}')
        unpacked_model_pack_path = self.model_pack.path.replace('.zip', '')

        # attempt to load cdb
        try:
            CAT.load_cdb(unpacked_model_pack_path)
            concept_db = ConceptDB()
            unpacked_file_name = self.model_pack.file.name.replace('.zip', '')
            concept_db.cdb_file.name = os.path.join(unpacked_file_name, 'cdb.dat')
            concept_db.name = f'{self.name} - CDB'
            # The CDB was just loaded above, so skip the second load in ConceptDB.save.
            concept_db.save(skip_load=True)
            self.concept_db = concept_db
        except Exception as exc:
            raise FileNotFoundError(f'Error loading the CDB from this model pack: {self.model_pack.path}') from exc

        # Load Vocab
        vocab_path = os.path.join(unpacked_model_pack_path, "vocab.dat")
        if not os.path.exists(vocab_path):
            raise FileNotFoundError(f'Error loading the Vocab from this model pack: {vocab_path}')
        Vocab.load(vocab_path)
        vocab = Vocabulary()
        vocab.vocab_file.name = vocab_path.replace(f'{MEDIA_ROOT}/', '')
        vocab.save(skip_load=True)
        self.vocab = vocab

        # load MetaCATs
        meta_cat_models = []
        try:
            # should raise an error if there already is a MetaCAT model with this definition
            for meta_cat_dir, meta_cat in CAT.load_meta_cats(unpacked_model_pack_path):
                mc_model = MetaCATModel()
                mc_model.meta_cat_dir = meta_cat_dir.replace(f'{MEDIA_ROOT}/', '')
                mc_model.name = f'{meta_cat.config.general.category_name} - {meta_cat.config.model.model_name}'
                # Resolve the meta task before saving so the foreign key is
                # written together with the row rather than lost afterwards.
                mc_model.get_or_create_meta_tasks_and_values(meta_cat)
                mc_model.save(unpack_load_meta_cat_dir=False)
                meta_cat_models.append(mc_model)
        except Exception as exc:
            raise MedCATLoadException(f'Failure loading MetaCAT models - {unpacked_model_pack_path}') from exc

        # The row already exists after the first save, so a caller-supplied
        # force_insert=True (e.g. from objects.create()) must not be repeated.
        if kwargs.get('force_insert'):
            kwargs['force_insert'] = False
        super().save(*args, **kwargs)
        # Link the unpacked MetaCAT models so the admin listing and the
        # delete clean-up, which both read `meta_cats`, can find them.
        self.meta_cats.set(meta_cat_models)

    def __str__(self):
        return self.name


class ConceptDB(models.Model):
    """A MedCAT concept database file registered with the trainer.

    The file path of an existing record is treated as immutable: the value
    read from the database is remembered and ``save`` refuses to change it.
    """
    name = models.CharField(max_length=100, default='', blank=True, validators=[cdb_name_validator])
    cdb_file = models.FileField()
    use_for_training = models.BooleanField(default=True)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Stays None for instances that were not loaded from the database.
        self.__cdb_field_name = None

    @classmethod
    def from_db(cls, db, field_names, values):
        """Build an instance from a DB row, remembering the stored ``cdb_file`` value."""
        inst = super().from_db(db, field_names, values)
        # `cdb_file` is absent from field_names when the field is deferred;
        # fall back to None (no path check on save) instead of raising IndexError.
        inst.__cdb_field_name = next(
            (value for field, value in zip(field_names, values) if field == 'cdb_file'),
            None,
        )
        return inst

    def save(self, *args, skip_load=False, **kwargs):
        """Validate and persist the concept DB record.

        Args:
            skip_load: when True, do not test-load the CDB file before saving.

        Raises:
            MedCATLoadException: the CDB file failed to load.
            ValidationError: the file path of an existing record was changed.
        """
        # load the CDB, and raise if this fails.
        if not skip_load:
            try:
                # NOTE(review): this passes the FieldFile itself rather than
                # `self.cdb_file.path` - confirm CDB.load accepts that.
                CDB.load(self.cdb_file)
            except Exception as exc:
                raise MedCATLoadException(f'Failed to load Concept DB from {self.cdb_file}, '
                                          f'check if this CDB file successfully loads elsewhere') from exc
        if self.__cdb_field_name is not None and self.__cdb_field_name != self.cdb_file.name:
            raise ValidationError('Cannot change file path of existing CDB.')
        super().save(*args, **kwargs)

    def __str__(self):
        return self.name
Expand All @@ -51,10 +127,60 @@ def __str__(self):
class Vocabulary(models.Model):
    """A MedCAT vocabulary file registered with the trainer."""
    vocab_file = models.FileField()

    def save(self, *args, skip_load=False, **kwargs):
        """Persist the record, test-loading the vocab file first unless ``skip_load`` is set.

        Raises:
            MedCATLoadException: the vocab file failed to load.
        """
        must_validate = not skip_load
        if must_validate:
            # Loading is only a check; the loaded Vocab object is discarded.
            try:
                Vocab.load(self.vocab_file)
            except Exception as load_error:
                raise MedCATLoadException(f'Failed to load Vocab from {self.vocab_file}, '
                                          f'check if this Vocab file successfully loads elsewhere') from load_error
        super().save(*args, **kwargs)

    def __str__(self):
        file_name = self.vocab_file.name
        return str(file_name)


class MetaCATModel(models.Model):
    """A MetaCAT model directory (or zip) and the meta task it predicts."""
    name = models.CharField(max_length=100)
    meta_cat_dir = models.FilePathField(help_text='The zip or dir for a MetaCAT model', allow_folders=True)
    meta_task = models.ForeignKey('MetaTask', on_delete=SET_NULL, blank=True, null=True)

    def get_or_create_meta_tasks_and_values(self, meta_cat: MetaCAT):
        """Link this model to the MetaTask and MetaTaskValues described by ``meta_cat``.

        The task is looked up by ``config.general.category_name`` and each value
        by the keys of ``config.general.category_value2id``; missing rows are
        created. The task's values are replaced with exactly that set.

        Args:
            meta_cat: a loaded MetaCAT whose config names the task and values.
        """
        task = meta_cat.config.general.category_name
        mt = MetaTask.objects.filter(name=task).first()
        if mt is None:
            mt = MetaTask()
            mt.name = task
            mt.save()
        self.meta_task = mt

        mt_vs = []
        for meta_task_value in meta_cat.config.general.category_value2id:
            mt_v = MetaTaskValue.objects.filter(name=meta_task_value).first()
            if mt_v is None:
                mt_v = MetaTaskValue()
                mt_v.name = meta_task_value
                mt_v.save()
            mt_vs.append(mt_v)
        mt.values.set(mt_vs)

        # Persist the new task link when this row already exists; an unsaved
        # instance is left for the caller to save.
        if self.pk is not None:
            self.save(update_fields=['meta_task'])

    def save(self, *args, unpack_load_meta_cat_dir=False, **kwargs):
        """Persist the record, optionally unpacking and test-loading the model first.

        Args:
            unpack_load_meta_cat_dir: when True, unpack ``meta_cat_dir`` as an
                archive into ``MEDIA_ROOT/<meta_cat_dir>`` and load it with
                ``MetaCAT.load``. Defaults to False, so a plain ``save()``
                performs no load check.

        Raises:
            MedCATLoadException: unpacking or loading the MetaCAT model failed.
        """
        if unpack_load_meta_cat_dir:
            try:
                # load the meta cat model, raise if issues
                model_files = os.path.join(MEDIA_ROOT, self.meta_cat_dir)
                shutil.unpack_archive(self.meta_cat_dir, extract_dir=model_files)
                MetaCAT.load(save_dir_path=model_files)
            except Exception as exc:
                raise MedCATLoadException(f'Failed to load MetaCAT from {self.meta_cat_dir}, '
                                          f'check if this MetaCAT dir successfully loads elsewhere') from exc
        super().save(*args, **kwargs)

    def __str__(self):
        return f'{self.name} - {str(self.meta_cat_dir)}'


class Dataset(models.Model):
name = models.CharField(max_length=150)
original_file = models.FileField()
Expand Down Expand Up @@ -355,3 +481,8 @@ def _remove_file(instance, prop):
if getattr(instance, prop):
if os.path.isfile(getattr(instance, prop).path):
os.remove(getattr(instance, prop).path)


class MedCATLoadException(Exception):
    """Raised when a MedCAT artefact (CDB, Vocab or MetaCAT model) fails to load."""
12 changes: 12 additions & 0 deletions webapp/api/api/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ class Meta:
fields = '__all__'


class ModelPackSerializer(serializers.ModelSerializer):
    """Serializer exposing every field of ``ModelPack``."""
    class Meta:
        model = ModelPack
        fields = '__all__'


class MetaCATModelSerializer(serializers.ModelSerializer):
    """Serializer exposing every field of ``MetaCATModel``."""
    class Meta:
        model = MetaCATModel
        fields = '__all__'


class ConceptDBSerializer(serializers.ModelSerializer):
class Meta:
model = ConceptDB
Expand Down
25 changes: 24 additions & 1 deletion webapp/api/api/forms.py → webapp/api/api/signals.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import json
import logging
import os
import shutil

from django.db.models.fields.files import FileField
from django.db.models.signals import post_save, post_delete, pre_save
from django.dispatch import receiver

from api.data_utils import dataset_from_file, delete_orphan_docs, upload_projects_export
from api.models import Dataset, ExportedProject
from api.models import Dataset, ExportedProject, ModelPack
from core.settings import MEDIA_ROOT


logger = logging.getLogger(__name__)


@receiver(post_save, sender=Dataset)
Expand Down Expand Up @@ -36,3 +42,20 @@ def save_exported_projects(sender, instance, **kwargs):
if not instance.trainer_export_file.path.endswith('.json'):
raise Exception("Please make sure the file is a .json file")
upload_projects_export(json.load(open(instance.trainer_export_file.path)))


@receiver(post_delete, sender=ModelPack)
def remove_model_pack_assets(sender, instance, **kwargs):
    """Clean up the records and files that belong to a deleted model pack.

    Deletes the linked concept DB, vocab and MetaCAT model rows, then removes
    the unpacked model pack directory and the model pack zip from disk.

    Args:
        sender: the model class that sent the signal (``ModelPack``).
        instance: the ``ModelPack`` that was deleted.
    """
    if instance.concept_db:
        instance.concept_db.delete(using=None, keep_parents=False)
    if instance.vocab:
        instance.vocab.delete(using=None, keep_parents=False)
    # NOTE(review): this runs after the pack row is deleted; the many-to-many
    # link rows may already be gone by then, leaving this queryset empty -
    # confirm the MetaCAT models really are removed.
    for m_c in instance.meta_cats.all():
        m_c.delete(using=None, keep_parents=False)

    # rm the model pack unzipped dir & model pack zip. Each removal is
    # attempted on its own so a missing directory does not leave the zip behind.
    zip_path = instance.model_pack.path
    unpacked_dir = zip_path.replace(".zip", "")
    for remove, path in ((shutil.rmtree, unpacked_dir), (os.remove, zip_path)):
        try:
            remove(path)
        except FileNotFoundError:
            logger.warning("Failure removing Model pack dir or zip. Not found. Likely already deleted")
4 changes: 4 additions & 0 deletions webapp/api/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ def get_medcat(CDB_MAP, VOCAB_MAP, CAT_MAP, project):
vocab = Vocab.load(vocab_path)
VOCAB_MAP[vocab_id] = vocab

# integrated model-pack spacy model not used.
# This assumes specified spacy model is installed...
# Next change will create conditional params to load CDB / Vocab, or
# model-packs directly for a project.
cat = CAT(cdb=cdb, config=cdb.config, vocab=vocab)
CAT_MAP[cat_id] = cat
return cat
Expand Down

0 comments on commit 28e2ffc

Please sign in to comment.