diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py
new file mode 100644
index 0000000000..e988665d7e
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/defects.py
@@ -0,0 +1,910 @@
+from datetime import datetime
+from itertools import chain, groupby, combinations
+from re import A
+from tkinter import W
+from typing import Dict, Iterator, List, Optional
+from copy import deepcopy
+from math import ceil
+from monty.json import MontyDecoder
+
+from maggma.builders import Builder
+from maggma.stores import Store
+from maggma.utils import grouper
+
+from pymatgen.core import Structure
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
+from atomate.utils.utils import load_class
+
+from emmet.core.settings import EmmetSettings
+from emmet.core.thermo import ThermoDoc
+from emmet.core.material import MaterialsDoc
+
+from emmet.core.utils import jsanitize, get_sg
+from emmet.core.defect import DefectDoc, DefectDoc2d, DefectThermoDoc
+from emmet.core.cp2k.calc_types import TaskType
+from emmet.core.cp2k.calc_types.utils import run_type
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.builders.cp2k.utils import synchronous_query
+
+from emmet.core.electronic_structure import ElectronicStructureDoc
+
+SETTINGS = EmmetSettings()
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+__maintainer__ = "Jason Munro"
+
+
+# TODO this builder is very close to being code agnostic. We need only resolve the standard key names and
+# how they are fed to the DefectDoc class. e.g. VASP calcs store "locpot", but CP2K store "v_hartree"
+class DefectBuilder(Builder):
+    """
+    The DefectBuilder collects task documents performed on structures containing a single point defect.
+    The builder is intended to group tasks corresponding to the same defect (species including charge state),
+    find the best ones, and perform finite-size defect corrections to create a defect document. These
+    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.
+
+    In order to make the build process easier, an entry must exist inside of the task doc that identifies it
+    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,
+    this may be changed to having a defect transformation in the transformation history.
+
+    The process is as follows:
+
+        1.) Find all documents containing the defect query.
+        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already
+            calculated. These are the candidate bulk tasks.
+        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites
+            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding
+            bulk calculation.
+        4.) Convert (defect, bulk task) doc pairs to DefectDocs
+        5.) Post-process and validate defect document
+        6.) Update the defect store
+    """
+
+    #TODO how to incorporate into settings?
+    DEFAULT_ALLOWED_TASKS = [
+            TaskType.Structure_Optimization.value, 
+            TaskType.Static.value
+            ]
+
+    # TODO: should electrostatic_potential store be optional in case people
+    # unify their LOCPOT data?
+    # TODO: should dielectric/electronic_structure be optional or required?
+    def __init__(
+        self,
+        tasks: Store,
+        defects: Store,
+        dielectric: Store,
+        electronic_structure: Store,
+        materials: Store,
+        electrostatic_potentials: Store,
+        task_validation: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        bulk_query: Optional[Dict] = None,
+        allowed_task_types: Optional[List[str]] = DEFAULT_ALLOWED_TASKS,
+        settings: Optional[EmmetBuildSettings] = None,
+        defects_2d: Optional[bool] = False,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            defects: Store of defect documents to generate
+            dielectric: Store of dielectric data
+            electronic_structure: Store of electronic structure data
+            materials: Store of materials documents
+            electrostatic_potentials: Store of electrostatic potential data. These
+                are generally stored in seperately from the tasks on GridFS due to their size.
+            task_validation: Store of task validation documents.
+            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
+            allowed_task_types: list of task_types that can be processed
+            settings: EmmetBuildSettings object
+        """
+
+        self.tasks = tasks
+        self.defects = defects
+        self.materials = materials
+        self.dielectric = dielectric
+        self.electronic_structure = electronic_structure
+        self.electrostatic_potentials = electrostatic_potentials
+        self.task_validation = task_validation
+        self.allowed_task_types = allowed_task_types #TODO How to incorporate into getitems?
+
+        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
+        self.settings = EmmetBuildSettings.autoload(settings)
+        self.query = query if query else {}
+        self.bulk_query = bulk_query if bulk_query else {}
+        self.timestamp = None
+        self._mpid_map = {}
+        self.defects_2d = defects_2d
+        self.kwargs = kwargs
+
+        self._defect_query = 'transformations.history.0.defect'
+
+        self._required_defect_properties = [
+            self._defect_query,
+            'output.energy',
+            'output.structure',
+            'input',
+            'transformations',
+            'task_id',
+            'nsites'
+        ]
+
+        self._required_bulk_properties = [
+            'output.energy',
+            'output.structure',
+            'output.vbm',
+            'output.cbm',
+            'input',
+            'task_id',
+        ]
+
+        self._optional_defect_properties = []
+        self._optional_bulk_properties = []
+
+        # TODO This seems pretty inelegant
+        if defects_2d:
+            self._required_defect_properties.append('v_hartree')
+            self._required_bulk_properties.append('v_hartree')
+        else:
+            self._required_defect_properties.append('output.v_hartree_planar')
+            self._required_defect_properties.append('output.v_hartree_grid')
+            self._optional_defect_properties.append('output.v_hartree_sites')
+            self._required_bulk_properties.append('output.v_hartree_planar')
+            self._required_bulk_properties.append('output.v_hartree_grid')
+            self._optional_bulk_properties.append('output.v_hartree_sites')
+
+        sources = [tasks, dielectric, electronic_structure, materials, electrostatic_potentials]
+        if self.task_validation:
+            sources.append(self.task_validation)
+        super().__init__(sources=sources, targets=[defects], **kwargs)
+
+    @property
+    def defect_query(self) -> str:
+        """
+        The standard query for defect tasks.
+        """
+        return self._defect_query
+
+    #TODO Hartree pot should be required but only for charged defects
+    @property
+    def required_defect_properties(self) -> List:
+        """
+        Properties essential to processing a defect task.
+        """
+        return self._required_defect_properties
+
+    @property
+    def required_bulk_properties(self) -> List:
+        """
+        Properties essential to processing a bulk task.
+        """
+        return self._required_bulk_properties
+
+    @property
+    def optional_defect_properties(self) -> List:
+        """
+        Properties that are optional for processing a defect task.
+        """
+        return self._optional_defect_properties
+
+    @property
+    def optional_bulk_properties(self) -> List:
+        """
+        Properties that are optional for bulk tasks.
+        """
+        return self._optional_bulk_properties
+
+    @property
+    def mpid_map(self) -> Dict:
+        return self._mpid_map
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index("task_id")
+        self.tasks.ensure_index("last_updated")
+        self.tasks.ensure_index("state")
+        self.tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("task_ids")
+
+        # Search index for defects
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("last_updated")
+        self.defects.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+
+        tag_query = {} 
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            tag_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            tag_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        # Get defect tasks
+        temp_query = self.query.copy()
+        temp_query.update(tag_query)
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
+        temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # Get bulk tasks
+        temp_query = self.bulk_query.copy()
+        temp_query.update(tag_query)
+        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
+        temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
+        bulk_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        N = ceil(len(defect_tasks) / number_splits)
+        for task_chunk in grouper(defect_tasks, N):
+            yield {"query": {"task_id": {"$in": task_chunk + list(bulk_tasks)}}}
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into defect documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Defect Collection.
+
+        The procedure is as follows:
+
+            1. Get all tasks with standard "defect" query tag
+            2. Filter all tasks by skipping tasks which are already in the Defect Store
+            3. Get all tasks that could be used as bulk
+            4. Filter all bulks which do not have corresponding Dielectric and 
+               ElectronicStructure data (if a band gap exists for that task).
+            5. Group defect tasks by defect matching 
+            6. Given defect object in a group, bundle them with bulk tasks
+               identified with structure matching
+            7. Yield the item bundles
+
+        Returns:
+            Iterator of (defect documents, task bundles)
+
+                The defect document is an existing defect doc to be updated with new data, or None
+            
+                task bundles bundle are all the tasks that correspond to the same defect and all possible
+                bulk tasks that could be matched to them.
+d        """
+
+        self.logger.info("Defect builder started")
+        self.logger.info(
+            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+        )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        self.logger.info("Finding tasks to process")
+
+        # Get defect tasks
+        temp_query = self.query.copy()
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
+        temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # Get bulk tasks
+        temp_query = self.bulk_query.copy()
+        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
+        temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
+        bulk_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # TODO Not the same validation behavior as material builders?
+        # If validation store exists, find tasks that are invalid and remove them
+        if self.task_validation:
+            validated = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {}, [self.task_validation.key]        
+                )
+            }
+
+            defect_tasks = defect_tasks.intersection(validated)
+            bulk_tasks = bulk_tasks.intersection(validated)
+
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+            self.logger.info("Removing {} invalid tasks".format(len(invalid_ids)))
+            defect_tasks = defect_tasks - invalid_ids
+            bulk_tasks = bulk_tasks - invalid_ids
+
+        processed_defect_tasks = {
+            t_id
+            for d in self.defects.query({}, ["task_ids"])
+            for t_id in d.get("task_ids", [])
+        }
+
+        all_tasks = defect_tasks | bulk_tasks
+
+        self.logger.debug("All tasks: {}".format(len(all_tasks)))
+        self.logger.debug("Bulk tasks before filter: {}".format(len(bulk_tasks)))
+        bulk_tasks = set(filter(self.__preprocess_bulk, bulk_tasks))
+        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))
+        self.logger.debug("All defect tasks: {}".format(len(defect_tasks)))
+        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
+
+        if not unprocessed_defect_tasks:
+            self.logger.info("No unprocessed defect tasks. Exiting")
+            return
+        elif not bulk_tasks:
+            self.logger.info("No compatible bulk calculations. Exiting.")
+            return
+
+        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
+        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
+
+        # Set total for builder bars to have a total
+        self.total = len(unprocessed_defect_tasks)
+
+        # yield list of defects that are of the same type, matched to an appropriate bulk calc
+        self.logger.info(f"Starting defect matching.")
+
+        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
+            task_ids = self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+            doc = self.__get_defect_doc(defect)
+            item_bundle = self.__get_item_bundle(task_ids)
+            material_id = self.mpid_map[item_bundle[0][1]['task_id']]
+            yield doc, item_bundle, material_id
+
+    def process_item(self, items):
+        """
+        Process a group of defect tasks that correspond to the same defect into a single defect
+        document. If the DefectDoc already exists, then update it and return it. If it does not,
+        create a new DefectDoc
+
+        Args:
+            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ]
+
+        returns: the defect document as a dictionary
+        """
+        defect_doc, item_bundle, material_id = items
+        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
+        if item_bundle:
+            tags = item_bundle[0][0].get('tags', [])
+            # TODO: This is a hack to get the 2d doc
+            DEFECT_DOC = DefectDoc2d if '2d' in tags else DefectDoc
+            if defect_doc:
+                defect_doc.update_all(item_bundle, query=self.defect_query)
+            else:
+                defect_doc = DEFECT_DOC.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id)
+            return defect_doc.dict()
+        return {}
+
+    def update_targets(self, items):
+        """
+        Inserts the new task_types into the task_types collection
+        """
+
+        items = [item for item in items if item]
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defects")
+            for item in items:
+                item.update({"_bt": self.timestamp})
+                self.defects.remove_docs(
+                    {
+                       "task_ids": item['task_ids'],
+                    }
+                )
+            self.defects.update(
+                docs=jsanitize(items, allow_bson=True),
+                key='task_ids',
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def __filter_and_group_tasks(self, tasks):
+        """
+        Groups defect tasks. Tasks are grouped according to the reduced representation
+        of the defect, and so tasks with different settings (e.g. supercell size, functional)
+        will be grouped together.
+
+        Args:
+            tasks: task_ids for unprocessed defects
+
+        returns:
+            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect
+        """
+
+        # TODO self.defect_query does not work here because mongo does not allow projection 
+        # that retrieves a list index. So, we have to retrieve the whole transformations
+        props = [
+            #self.defect_query,
+            'transformations',
+            'task_id',
+            'output.structure'
+        ]
+
+        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")
+
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
+        sm = StructureMatcher(
+            ltol=self.settings.LTOL, stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL, allow_subset=False
+        )
+        defects = [
+            {
+                'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t),
+                'structure': Structure.from_dict(t['output']['structure'])
+            }
+            for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props)
+        ]
+        for d in defects:
+            # TODO remove oxidation state because spins/oxidation cause errors in comparison.
+            #  but they shouldnt if those props are close in value
+            d['structure'].remove_oxidation_states()
+
+        def key(x):
+            s = x.get('defect').bulk_structure
+            return get_sg(s), s.composition.reduced_composition
+
+        def are_equal(x, y):
+            """
+            To decide if defects are equal. Either the defect objects are
+            equal, OR two different defect objects relaxed to the same final structure
+            (common with interstitials).
+
+            TODO Need a way to do the output structure comparison for a X atom defect cell
+            TODO which can be embedded in a Y atom defect cell up to tolerance.
+            """
+            if pdc.are_equal(x['defect'], y['defect']):
+                return True
+
+            # TODO This is needed for ghost vacancy unfortunately, since sm.fit can't distinguish ghosts
+            if x['defect'].defect_composition == y['defect'].defect_composition and \
+                    x['defect'].charge == y['defect'].charge and \
+                    sm.fit(x['structure'], y['structure']):
+                return True
+            return False
+
+        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
+        all_groups = []
+
+        # For each pre-grouped list of structures, perform actual matching.
+        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):
+            unmatched = list(g)
+            while len(unmatched) > 0:
+                i, refs = unmatched.pop(0)
+                matches = [i]
+                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))
+                matches.extend([unmatched[i][0] for i in inds])
+                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
+                all_groups.append(
+                    (defects[i]['defect'], [defects[i]['task_id'] for i in matches])
+                )
+
+        self.logger.debug(f"All groups {all_groups}")
+        return all_groups
+
+    def __get_defect_from_task(self, task):
+        """
+        Using the defect_query property, retrieve a pymatgen defect object from the task document
+        """
+        defect = unpack(self.defect_query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+    def __get_defect_doc(self, defect):
+        """
+        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists
+
+        returns: DefectDoc or None
+        """
+        material_id = self._get_mpid(defect.bulk_structure)
+        docs = [
+            DefectDoc(**doc)
+            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)
+        ]
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True)
+        for doc in docs:
+            if pdc.are_equal(defect, doc.defect):
+                return doc
+        return None
+
+    # TODO should move to returning dielectric doc or continue returning the total diel tensor?
+    def __get_dielectric(self, key):
+        """
+        Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store
+        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
+        be the case for metallic systems, return None.
+        """
+        for diel in self.dielectric.query(criteria={"material_id": key}, properties=['total']):
+            return diel['total']
+        return None
+
+    #TODO retrieving the electrostatic potential is by far the most expesive part of the builder. Any way to reduce?
+    def __get_item_bundle(self, task_ids):
+        """
+        Gets a group of items that can be processed together into a defect document.
+
+        Args:
+            bulk_tasks: possible bulk tasks to match to defects
+            defect_task_group: group of equivalent defects (defined by PointDefectComparator)
+
+        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]
+        """
+        return [
+            (
+                next(
+                    synchronous_query(
+                        self.tasks, 
+                        self.electrostatic_potentials, 
+                        query={'task_id': defect_tasks_id}, properties=None) #self.required_defect_properties) #TODO
+                        ),
+                next(
+                    synchronous_query(
+                        self.tasks, 
+                        self.electrostatic_potentials, 
+                        query={'task_id': bulk_tasks_id}, properties=self.required_bulk_properties)
+                        ),
+                self.__get_dielectric(self._mpid_map[bulk_tasks_id]),
+            )
+            for defect_tasks_id, bulk_tasks_id in task_ids
+        ]
+
+    def _get_mpid(self, structure):
+        """
+        Given a structure, determine if an equivalent structure exists, with a material_id,
+        in the materials store.
+
+        Args:
+            structure: Candidate structure
+
+        returns: material_id, if one exists, else None
+        """
+        sga = SpacegroupAnalyzer(structure, symprec=SETTINGS.SYMPREC, angle_tolerance=SETTINGS.ANGLE_TOL)
+        mats = self.materials.query(
+            criteria={
+                'chemsys': structure.composition.chemical_system,
+            }, properties=['structure', 'material_id']
+        )
+        # TODO coudl more than one material match true?
+        sm = StructureMatcher(ltol=SETTINGS.LTOL, stol=SETTINGS.STOL, angle_tol=SETTINGS.ANGLE_TOL)
+        for m in mats:
+            if sm.fit(structure, Structure.from_dict(m['structure'])):
+                return m['material_id']
+        return None
+
+    def __match_defects_to_bulks(self, bulk_ids, defect_ids):
+        """
+        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
+        commensurate:
+
+            - Composition
+            - Number of sites
+            - Symmetry
+
+        """
+
+        self.logger.debug(f"Finding bulk/defect task combinations.")
+        self.logger.debug(f"Bulk tasks: {bulk_ids}")
+        self.logger.debug(f"Defect tasks: {defect_ids}")
+
+        # TODO mongo projection on array doesn't work (see above)
+        props = [
+            'task_id',
+            'input',
+            'nsites',
+            'output.structure',
+            'transformations',
+            'defect',
+            'scale',
+        ]
+        defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props))
+        ps = DefectBuilder.__get_pristine_supercell(defects[0])
+        bulks = list(
+            self.tasks.query(
+                criteria={
+                    'task_id': {'$in': list(bulk_ids)},
+                    'composition_reduced': ps.composition.reduced_composition.as_dict(),
+                },
+                properties=props
+            )
+        )
+
+        sm = StructureMatcher(
+            ltol=SETTINGS.LTOL,
+            stol=SETTINGS.STOL,
+            angle_tol=SETTINGS.ANGLE_TOL,
+            primitive_cell=False,
+            scale=True,
+            attempt_supercell=False,
+            allow_subset=False,
+            comparator=ElementComparator(),
+        )
+
+        def _compare(b, d):
+            if run_type(b.get('input').get('dft')).value.split('+U')[0] == \
+                run_type(d.get('input').get('dft')).value.split('+U')[0] and \
+                   sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)):
+                       return True
+            return False
+
+        pairs = [
+            (defect['task_id'], bulk['task_id'])
+            for bulk in bulks
+            for defect in defects
+            if _compare(bulk, defect)
+        ]
+
+        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
+        return pairs
+
+    def __preprocess_bulk(self, task):
+        """
+        Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk
+        tasks must have:
+
+            (1) Correspond to an existing material_id in the materials store
+            (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store
+            (3) If bulk is not a metal, electronic structure document must exist in the store
+
+        """
+        self.logger.debug("Preprocessing bulk task {}".format(task))
+        t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid']))
+
+        struc = Structure.from_dict(t.get('output').get('structure'))
+        mpid = self._get_mpid(struc)
+        if not mpid:
+            self.logger.debug(f"No material id found for bulk task {task}")
+            return False
+        self._mpid_map[task] = mpid
+        self.logger.debug(f"Material ID: {mpid}")
+
+        elec = self.electronic_structure.query_one(
+            properties=['band_gap'], criteria={self.electronic_structure.key: mpid}
+            )
+        if not elec:
+            self.logger.debug(f"Electronic structure data not found for {mpid}")
+            return False
+
+        # TODO right now pulling dos from electronic structure, should just pull summary document
+        if elec['band_gap'] > 0:
+            diel = self.__get_dielectric(mpid)
+            if not diel:
+                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "
+                                 f"dielectric properties, but none found in dielectric store")
+                return False
+
+        return True
+
+    @staticmethod
+    def __get_pristine_supercell(task):
+        """
+        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
+            - If defect transform exists, the following transform's input will be returned
+            - If no follow up transform exists, the calculation input will be returned
+
+        If defect cannot be found in task, return the input structure.
+        """
+        if task.get('transformations'):
+            for i, transformation in enumerate(task['transformations']['history']):
+                    if transformation['@class'] == 'DefectTransformation':
+                        tmp = Structure.from_dict(transformation['input_structure'])
+                        tmp.make_supercell(transformation['scaling_matrix'])
+                        return tmp
+        return Structure.from_dict(task['input']['structure'])
+
+
+#TODO Major problem with this builder. materials store is used to sync the diel, elec, and pd with a single material id
+#TODO This is a problem because the material id in vasp store is not synced to cp2k store
+#TODO Also the chempots needed to adjust entries must come from cp2k, but you need to give vasp to sync the others
+class DefectThermoBuilder(Builder):
+
+    """
+    This builder creates collections of the DefectThermoDoc object.
+
+        (1) Find all DefectDocs that correspond to the same bulk material
+            given by material_id
+        (2) Create a new DefectThermoDoc for all of those documents
+        (3) Insert/Update the defect_thermos store with the new documents
+    """
+
+    def __init__(
+            self,
+            defects: Store,
+            defect_thermos: Store,
+            materials: Store,
+            thermo: Store,
+            electronic_structures: Store,
+            dos: Store,
+            query: Optional[Dict] = None,
+            **kwargs,
+    ):
+        """
+        Args:
+            defects: Store of defect documents (generated by DefectBuilder)
+            defect_thermos: Store of DefectThermoDocs to generate.
+            materials: Store of MaterialDocs to construct phase diagram
+            electronic_structures: Store of DOS objects
+            query: dictionary to limit tasks to be analyzed
+        """
+
+        self.defects = defects
+        self.defect_thermos = defect_thermos
+        self.materials = materials
+        self.thermo = thermo
+        self.dos = dos
+        self.electronic_structures = electronic_structures
+
+        self.query = query if query else {}
+        self.timestamp = None
+        self.kwargs = kwargs
+
+        super().__init__(sources=[defects, materials, thermo, electronic_structures, dos], targets=[defect_thermos], **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the collections
+        """
+
+        # Basic search index for tasks
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("defect_id")
+
+        # Search index for materials
+        self.defect_thermos.ensure_index("material_id")
+
+    # TODO need to only process new tasks. Fast builder so currently is OK for small collections
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets items to process into DefectThermoDocs.
+
+        returns:
+            iterator yielding tuples containing:
+                - group of DefectDocs belonging to the same bulk material as indexed by material_id,
+                - materials in the chemsys of the bulk material for constructing phase diagram
+                - Dos of the bulk material for constructing phase diagrams/getting doping
+
+        """
+
+        self.logger.info("Defect thermo builder started")
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark build time for defect thermo documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all tasks
+        self.logger.info("Finding tasks to process")
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks
+
+        all_docs = [doc for doc in self.defects.query(self.query)]
+
+        self.logger.debug(f"Found {len(all_docs)} defect docs to process")
+
+        def filterfunc(x):
+            # material for defect x exists
+            if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)):
+                self.logger.debug(f"No material with MPID={x['material_id']} in the material store")
+                return False
+
+            for el in load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition:
+                if not list(self.thermo.query(criteria={'chemsys': str(el)}, properties=None)):
+                    self.logger.debug(f"No entry for {el} in Thermo Store")
+                    return False
+
+            return True
+
+        for key, group in groupby(
+                filter(
+                    filterfunc,
+                    sorted(all_docs, key=lambda x: x['material_id'])
+                ), key=lambda x: x['material_id']
+        ):
+            group = [g for g in group]
+            try:
+                mat = self.__get_materials(key)
+                thermo = self.__get_thermos(mat.composition)
+                elec = self.__get_electronic_structure(group[0]['material_id'])
+                yield (group, mat, thermo, elec)
+            except LookupError as exception:
+                raise exception
+
+    def process_item(self, docs):
+        """
+        Process a group of defects belonging to the same material into a defect thermo doc
+        """
+        self.logger.info(f"Processing defects")
+        defects, material, thermos, elec_struc = docs
+        defects = [DefectDoc(**d) for d in defects]
+        thermos = [ThermoDoc(**t) for t in thermos]
+        defect_thermo_doc = DefectThermoDoc.from_docs(defects, thermos=thermos, electronic_structure=elec_struc)
+        return defect_thermo_doc.dict()
+
+    def update_targets(self, items):
+        """
+        Inserts the new DefectThermoDocs into the defect_thermos store
+        """
+        items = [item for item in items if item]
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defect thermo docs")
+            self.defect_thermos.update(
+                docs=jsanitize(items, allow_bson=True),
+                key=self.defect_thermos.key,
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def __get_electronic_structure(self, material_id):
+        """
+        Gets the electronic structure of the bulk material
+        """
+        self.logger.info(f"Getting electronic structure for {material_id}")
+
+        # TODO This is updated to return the whole query because a.t.m. the
+        # DOS part of the electronic builder isn't working, so I'm using
+        # this to pull direct from the store of dos objects with no processing.
+        dosdoc = self.electronic_structures.query_one(
+            criteria={self.electronic_structures.key: material_id},
+            properties=None,
+        )
+        t_id = ElectronicStructureDoc(**dosdoc).dos.total['1'].task_id
+        dos = self.dos.query_one(criteria={'task_id': int(t_id)}, properties=None) #TODO MPID str/int issues
+        return dos
+
+    def __get_materials(self, key) -> List:
+        """
+        Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
+        materials store.
+        """
+        bulk = self.materials.query_one(criteria={'material_id': key}, properties=None)
+        if not bulk:
+            raise LookupError(
+                f"The bulk material ({key}) for these defects cannot be found in the materials store"
+            )
+        return MaterialsDoc(**bulk)
+
+    def __get_thermos(self, composition) -> List:
+        return list(self.thermo.query(criteria={'elements': {"$size": 1}}, properties=None))
+
+
+def unpack(query, d):
+    """
+    Unpack a mongo-style query into dictionary retrieval
+    """
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py
new file mode 100644
index 0000000000..1846bcc57d
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/materials.py
@@ -0,0 +1,275 @@
+from datetime import datetime
+from itertools import chain
+from typing import Dict, Iterator, List, Optional
+
+from maggma.builders import Builder
+from maggma.stores import Store
+
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.utils import group_structures, jsanitize
+from emmet.core.cp2k.material import MaterialsDoc
+from emmet.core.cp2k.task import TaskDocument
+
+SETTINGS = EmmetBuildSettings()
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+__maintainer__ = "Shyam Dwaraknath <shyamd@lbl.gov>"
+
+
+class MaterialsBuilder(Builder):
+    """
+    The Materials Builder matches VASP task documents by structure similarity into materials
+    document. The purpose of this builder is group calculations and determine the best structure.
+    All other properties are derived from other builders.
+
+    The process is as follows:
+
+        1.) Find all documents with the same formula
+        2.) Select only task documents for the task_types we can select properties from
+        3.) Aggregate task documents based on structure similarity
+        4.) Convert task docs to property docs with metadata for selection and aggregation
+        5.) Select the best property doc for each property
+        6.) Build material document from best property docs
+        7.) Post-process material document
+        8.) Validate material document
+
+    """
+
+    def __init__(
+        self,
+        tasks: Store,
+        materials: Store,
+        task_validation: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        settings: Optional[EmmetBuildSettings] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            materials: Store of materials documents to generate
+            query: dictionary to limit tasks to be analyzed
+            allowed_task_types: list of task_types that can be processed
+            symprec: tolerance for SPGLib spacegroup finding
+            ltol: StructureMatcher tuning parameter for matching tasks to materials
+            stol: StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol: StructureMatcher tuning parameter for matching tasks to materials
+        """
+
+        self.tasks = tasks
+        self.materials = materials
+        self.task_validation = task_validation
+        self.query = query if query else {}
+        self.settings = EmmetBuildSettings.autoload(settings)
+        self.kwargs = kwargs
+
+        sources = [tasks]
+        if self.task_validation:
+            sources.append(self.task_validation)
+        super().__init__(sources=sources, targets=[materials], **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index("task_id")
+        self.tasks.ensure_index("last_updated")
+        self.tasks.ensure_index("state")
+        self.tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("sandboxes")
+        self.materials.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into materials documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Materials Collection
+
+        Returns:
+            generator or list relevant tasks and materials to process into materials documents
+        """
+
+        self.logger.info("Materials builder started")
+        # TODO make a cp2k allowed type setting
+        self.logger.info(
+            f"Allowed task types: {[task_type.value for task_type in self.settings.CP2K_ALLOWED_TASK_TYPES]}"
+        )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all processed tasks:
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+
+        self.logger.info("Finding tasks to process")
+        all_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(temp_query, [self.tasks.key])
+        }
+        processed_tasks = {
+            t_id
+            for d in self.materials.query({}, ["task_ids"])
+            for t_id in d.get("task_ids", [])
+        }
+        to_process_tasks = all_tasks - processed_tasks
+        to_process_forms = self.tasks.distinct(
+            "formula_pretty", {self.tasks.key: {"$in": list(to_process_tasks)}}
+        )
+        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
+
+        # Set total for builder bars to have a total
+        self.total = len(to_process_forms)
+
+        if self.task_validation:
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+        else:
+            invalid_ids = set()
+
+        projected_fields = [
+            "last_updated",
+            "completed_at",
+            "task_id",
+            "formula_pretty",
+            "output.energy_per_atom",
+            "output.structure",
+            "input",
+            "orig_inputs",
+            "input.structure",
+            "tags",
+        ]
+
+        for formula in to_process_forms:
+            tasks_query = dict(temp_query)
+            tasks_query["formula_pretty"] = formula
+            tasks = list(
+                self.tasks.query(criteria=tasks_query, properties=None)
+            )
+            for t in tasks:
+                if t[self.tasks.key] in invalid_ids:
+                    t["is_valid"] = False
+                else:
+                    t["is_valid"] = True
+
+            yield tasks
+
+    def process_item(self, tasks: List[Dict]) -> List[Dict]:
+        """
+        Process the tasks into a list of materials
+
+        Args:
+            tasks [dict] : a list of task docs
+
+        Returns:
+            ([dict],list) : a list of new materials docs and a list of task_ids that were processsed
+        """
+
+        tasks = [TaskDocument(**task) for task in tasks]
+        formula = tasks[0].formula_pretty
+        task_ids = [task.task_id for task in tasks]
+        self.logger.debug(f"Processing {formula} : {task_ids}")
+
+        grouped_tasks = self.filter_and_group_tasks(tasks)
+        materials = []
+        for group in grouped_tasks:
+            try:
+                materials.append(
+                    MaterialsDoc.from_tasks(
+                        group,
+                        quality_scores=self.settings.CP2K_QUALITY_SCORES,
+                    )
+                )
+            except Exception as e:
+                # TODO construct deprecated
+
+                failed_ids = list({t_.task_id for t_ in group})
+                doc = MaterialsDoc.construct_deprecated_material(tasks)
+                doc.warnings.append(str(e))
+                materials.append(doc)
+                self.logger.warn(
+                    f"Failed making material for {failed_ids}."
+                    f" Inserted as deprecated Material: {doc.material_id}"
+                )
+
+        self.logger.debug(f"Produced {len(materials)} materials for {formula}")
+
+        return jsanitize([mat.dict() for mat in materials], allow_bson=True)
+
+    def update_targets(self, items: List[List[Dict]]):
+        """
+        Inserts the new task_types into the task_types collection
+
+        Args:
+            items ([([dict],[int])]): A list of tuples of materials to update and the corresponding
+                processed task_ids
+        """
+
+        items = list(filter(None, chain.from_iterable(items)))
+
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        material_ids = list({item["material_id"] for item in items})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} materials")
+            self.materials.remove_docs({self.materials.key: {"$in": material_ids}})
+            self.materials.update(
+                docs=jsanitize(items, allow_bson=True),
+                key=["material_id"],
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def filter_and_group_tasks(self, tasks: List[TaskDocument]) -> Iterator[List[Dict]]:
+        """
+        Groups tasks by structure matching
+        """
+
+        # TODO why did the way vasp builder did it not work here?
+        filtered_tasks = []
+        for task in tasks:
+            for allowed_type in self.settings.CP2K_ALLOWED_TASK_TYPES:
+                if task.task_type is allowed_type:
+                    filtered_tasks.append(task)
+                    continue
+
+        structures = []
+
+        for idx, task in enumerate(filtered_tasks):
+            s = task.output.structure
+            s.index: int = idx  # type: ignore
+            s.remove_oxidation_states()
+            s.remove_spin()
+            structures.append(s)
+
+        grouped_structures = group_structures(
+            structures,
+            ltol=self.settings.LTOL,
+            stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL,
+            symprec=self.settings.SYMPREC,
+        )
+
+        for group in grouped_structures:
+            grouped_tasks = [filtered_tasks[struc.index] for struc in group]  # type: ignore
+            yield grouped_tasks
diff --git a/emmet-builders/emmet/builders/cp2k/task_validator.py b/emmet-builders/emmet/builders/cp2k/task_validator.py
new file mode 100644
index 0000000000..6ae2e4b684
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/task_validator.py
@@ -0,0 +1,63 @@
+from typing import Dict, Optional
+
+from maggma.builders import MapBuilder
+from maggma.core import Store
+
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.cp2k.task import TaskDocument
+from emmet.core.cp2k.validation import DeprecationMessage, ValidationDoc
+
+
+class TaskValidator(MapBuilder):
+    def __init__(
+        self,
+        tasks: Store,
+        task_validation: Store,
+        settings: Optional[EmmetBuildSettings] = None,
+        query: Optional[Dict] = None,
+        **kwargs,
+    ):
+        """
+        Creates task_types from tasks and type definitions
+
+        Args:
+            tasks: Store of task documents
+            task_validation: Store of task_types for tasks
+        """
+        self.tasks = tasks
+        self.task_validation = task_validation
+        self.settings = EmmetBuildSettings.autoload(settings)
+        self.query = query
+        self.kwargs = kwargs
+
+        super().__init__(
+            source=tasks,
+            target=task_validation,
+            projection=[
+                "input",
+                "output.forces",
+                "tags",
+            ],
+            query=query,
+            **kwargs,
+        )
+
+    def unary_function(self, item):
+        """
+        Find the task_type for the item
+
+        Args:
+            item (dict): a (projection of a) task doc
+        """
+        task_doc = TaskDocument(**item)
+        validation_doc = ValidationDoc.from_task_doc(
+            task_doc=task_doc,
+        )
+
+        bad_tags = list(set(task_doc.tags).intersection(self.settings.DEPRECATED_TAGS))
+        if len(bad_tags) > 0:
+            validation_doc.warnings.append(f"Manual Deprecation by tags: {bad_tags}")
+            validation_doc.valid = False
+            validation_doc.reasons.append(DeprecationMessage.MANUAL)
+
+        return validation_doc
diff --git a/emmet-builders/emmet/builders/cp2k/thermo.py b/emmet-builders/emmet/builders/cp2k/thermo.py
new file mode 100644
index 0000000000..9c8345a018
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/thermo.py
@@ -0,0 +1,368 @@
+import warnings
+from collections import defaultdict
+from itertools import chain
+from typing import Dict, Iterable, Iterator, List, Optional, Set
+from math import ceil
+
+from maggma.core import Builder, Store
+from maggma.utils import grouper
+from monty.json import MontyDecoder
+from pymatgen.analysis.phase_diagram import PhaseDiagramError
+from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+
+from emmet.builders.utils import chemsys_permutations
+from emmet.core.thermo import ThermoDoc, PhaseDiagramDoc
+from emmet.core.utils import jsanitize
+
+
+class ThermoBuilder(Builder):
+    def __init__(
+        self,
+        materials: Store,
+        thermo: Store,
+        phase_diagram: Optional[Store] = None,
+        oxidation_states: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        compatibility=None,
+        num_phase_diagram_eles: Optional[int] = None,
+        **kwargs,
+    ):
+        """
+        Calculates thermodynamic quantities for materials from phase
+        diagram constructions
+
+        Args:
+            materials (Store): Store of materials documents
+            thermo (Store): Store of thermodynamic data such as formation
+                energy and decomposition pathway
+            phase_diagram (Store): Store of phase diagram data for each unique chemical system
+            oxidation_states (Store): Store of oxidation state data to use in correction scheme application
+            query (dict): dictionary to limit materials to be analyzed
+            compatibility (PymatgenCompatability): Compatability module
+                to ensure energies are compatible
+            num_phase_diagram_eles (int): Maximum number of elements to use in phase diagram construction
+                for data within the separate phase_diagram collection
+        """
+
+        self.materials = materials
+        self.thermo = thermo
+        self.query = query if query else {}
+        self.compatibility = compatibility
+        self.oxidation_states = oxidation_states
+        self.phase_diagram = phase_diagram
+        self.num_phase_diagram_eles = num_phase_diagram_eles
+        self._completed_tasks: Set[str] = set()
+        self._entries_cache: Dict[str, List[ComputedStructureEntry]] = defaultdict(list)
+
+        sources = [materials]
+        if oxidation_states is not None:
+            sources.append(oxidation_states)
+
+        targets = [thermo]
+        if phase_diagram is not None:
+            targets.append(phase_diagram)
+
+        super().__init__(sources=sources, targets=targets, **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("chemsys")
+        self.materials.ensure_index("last_updated")
+
+        # Search index for thermo
+        self.thermo.ensure_index("material_id")
+        self.thermo.ensure_index("last_updated")
+
+        # Search index for phase_diagram
+        if self.phase_diagram:
+            self.phase_diagram.ensure_index("chemsys")
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
+        updated_chemsys = self.get_updated_chemsys()
+        new_chemsys = self.get_new_chemsys()
+
+        affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys)
+
+        # Remove overlapping chemical systems
+        to_process_chemsys = set()
+        for chemsys in updated_chemsys | new_chemsys | affected_chemsys:
+            if chemsys not in to_process_chemsys:
+                to_process_chemsys |= chemsys_permutations(chemsys)
+
+        N = ceil(len(to_process_chemsys) / number_splits)
+
+        for chemsys_chunk in grouper(to_process_chemsys, N):
+
+            yield {"query": {"chemsys": {"$in": list(chemsys_chunk)}}}
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets whole chemical systems of entries to process
+        """
+
+        self.logger.info("Thermo Builder Started")
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        updated_chemsys = self.get_updated_chemsys()
+        new_chemsys = self.get_new_chemsys()
+
+        affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys)
+
+        # Remove overlapping chemical systems
+        processed = set()
+        to_process_chemsys = []
+        for chemsys in sorted(
+            updated_chemsys | new_chemsys | affected_chemsys,
+            key=lambda x: len(x),
+            reverse=True,
+        ):
+            if chemsys not in processed:
+                processed |= chemsys_permutations(chemsys)
+                to_process_chemsys.append(chemsys)
+
+        self.logger.info(
+            f"Found {len(to_process_chemsys)} chemical systems with new/updated materials to process"
+        )
+        self.total = len(to_process_chemsys)
+
+        # Yield the chemical systems in order of increasing size
+        # Will build them in a similar manner to fast Pourbaix
+        for chemsys in sorted(
+            to_process_chemsys, key=lambda x: len(x.split("-")), reverse=False
+        ):
+            entries = self.get_entries(chemsys)
+            yield entries
+
+    def process_item(self, item: List[Dict]):
+
+        if len(item) == 0:
+            return []
+
+        entries = [ComputedStructureEntry.from_dict(entry) for entry in item]
+        # determine chemsys
+        elements = sorted(
+            set([el.symbol for e in entries for el in e.composition.elements])
+        )
+        chemsys = "-".join(elements)
+
+        self.logger.debug(f"Processing {len(entries)} entries for {chemsys}")
+
+        material_entries: Dict[str, Dict[str, ComputedStructureEntry]] = defaultdict(
+            dict
+        )
+        pd_entries = []
+        for entry in entries:
+            material_entries[entry.entry_id][entry.data["run_type"]] = entry
+
+        if self.compatibility:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore", message="Failed to guess oxidation states.*"
+                )
+                pd_entries = self.compatibility.process_entries(entries)
+        else:
+            pd_entries = entries
+        self.logger.debug(f"{len(pd_entries)} remain in {chemsys} after filtering")
+
+        try:
+            docs, pd = ThermoDoc.from_entries(pd_entries, deprecated=False)
+            for doc in docs:
+                doc.entries = material_entries[doc.material_id]
+                doc.entry_types = list(material_entries[doc.material_id].keys())
+
+            pd_data = None
+
+            if self.phase_diagram:
+                if (
+                    self.num_phase_diagram_eles is None
+                    or len(elements) <= self.num_phase_diagram_eles
+                ):
+                    pd_doc = PhaseDiagramDoc(chemsys=chemsys, phase_diagram=pd)
+                    pd_data = jsanitize(pd_doc.dict(), allow_bson=True)
+
+            docs_pd_pair = (
+                jsanitize([d.dict() for d in docs], allow_bson=True),
+                [pd_data],
+            )
+
+        except PhaseDiagramError as p:
+            elsyms = []
+            for e in entries:
+                elsyms.extend([el.symbol for el in e.composition.elements])
+
+            self.logger.warning(
+                f"Phase diagram error in chemsys {'-'.join(sorted(set(elsyms)))}: {p}"
+            )
+            return []
+        except Exception as e:
+            self.logger.error(
+                f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}"
+            )
+            return []
+
+        return docs_pd_pair
+
+    def update_targets(self, items):
+        """
+        Inserts the thermo and phase diagram docs into the thermo collection
+        Args:
+            items ([[tuple(List[dict],List[dict])]]): a list of list of thermo dictionaries to update
+        """
+
+        # print(len(items))
+        thermo_docs = [item[0] for item in items if item]
+        phase_diagram_docs = [item[1] for item in items if item]
+
+        # flatten out lists
+        thermo_docs = list(filter(None, chain.from_iterable(thermo_docs)))
+        phase_diagram_docs = list(filter(None, chain.from_iterable(phase_diagram_docs)))
+
+        # Check if already updated this run
+        thermo_docs = [
+            i for i in thermo_docs if i["material_id"] not in self._completed_tasks
+        ]
+
+        self._completed_tasks |= {i["material_id"] for i in thermo_docs}
+
+        for item in thermo_docs:
+            if isinstance(item["last_updated"], dict):
+                item["last_updated"] = MontyDecoder().process_decoded(
+                    item["last_updated"]
+                )
+
+        if self.phase_diagram:
+            self.phase_diagram.update(phase_diagram_docs)
+
+        if len(thermo_docs) > 0:
+            self.logger.info(f"Updating {len(thermo_docs)} thermo documents")
+            self.thermo.update(docs=thermo_docs, key=["material_id"])
+        else:
+            self.logger.info("No thermo items to update")
+
+    def get_entries(self, chemsys: str) -> List[Dict]:
+        """
+        Gets a entries from the tasks collection for the corresponding chemical systems
+        Args:
+            chemsys(str): a chemical system represented by string elements seperated by a dash (-)
+        Returns:
+            set(ComputedEntry): a set of entries for this system
+        """
+
+        self.logger.info(f"Getting entries for: {chemsys}")
+        # First check the cache
+        all_chemsys = chemsys_permutations(chemsys)
+        cached_chemsys = all_chemsys & set(self._entries_cache.keys())
+        query_chemsys = all_chemsys - cached_chemsys
+        all_entries = list(
+            chain.from_iterable(self._entries_cache[c] for c in cached_chemsys)
+        )
+
+        self.logger.debug(
+            f"Getting {len(cached_chemsys)} sub-chemsys from cache for {chemsys}"
+        )
+        self.logger.debug(
+            f"Getting {len(query_chemsys)} sub-chemsys from DB for {chemsys}"
+        )
+
+        # Second grab the materials docs
+        new_q = dict(self.query)
+        new_q["chemsys"] = {"$in": list(query_chemsys)}
+        new_q["deprecated"] = False
+        materials_docs = list(
+            self.materials.query(
+                criteria=new_q, properties=["material_id", "entries", "deprecated"]
+            )
+        )
+
+        # Get Oxidation state data for each material
+        oxi_states_data = {}
+        if self.oxidation_states:
+            material_ids = [t["material_id"] for t in materials_docs]
+            oxi_states_data = {
+                d["material_id"]: d.get("average_oxidation_states", {})
+                for d in self.oxidation_states.query(
+                    properties=["material_id", "average_oxidation_states"],
+                    criteria={
+                        "material_id": {"$in": material_ids},
+                        "state": "successful",
+                    },
+                )
+            }
+
+        self.logger.debug(
+            f"Got {len(materials_docs)} entries from DB for {len(query_chemsys)} sub-chemsys for {chemsys}"
+        )
+
+        # Convert the entries into ComputedEntries and store
+        for doc in materials_docs:
+            for r_type, entry_dict in doc.get("entries", {}).items():
+                entry_dict["data"]["oxidation_states"] = oxi_states_data.get(
+                    entry_dict["entry_id"], {}
+                )
+                entry_dict["data"]["run_type"] = r_type
+                elsyms = sorted(set([el for el in entry_dict["composition"]]))
+                self._entries_cache["-".join(elsyms)].append(entry_dict)
+                all_entries.append(entry_dict)
+
+        self.logger.info(f"Total entries in {chemsys} : {len(all_entries)}")
+
+        return all_entries
+
+    def get_updated_chemsys(self,) -> Set:
+        """Gets updated chemical system as defined by the updating of an existing material"""
+
+        updated_mats = self.thermo.newer_in(self.materials, criteria=self.query)
+        updated_chemsys = set(
+            self.materials.distinct(
+                "chemsys", {"material_id": {"$in": list(updated_mats)}, **self.query}
+            )
+        )
+        self.logger.debug(f"Found {len(updated_chemsys)} updated chemical systems")
+
+        return updated_chemsys
+
+    def get_new_chemsys(self) -> Set:
+        """Gets newer chemical system as defined by introduction of a new material"""
+
+        # All materials that are not present in the thermo collection
+        thermo_mat_ids = self.thermo.distinct("material_id")
+        mat_ids = self.materials.distinct("material_id", self.query)
+        dif_task_ids = list(set(mat_ids) - set(thermo_mat_ids))
+        q = {"material_id": {"$in": dif_task_ids}}
+        new_mat_chemsys = set(self.materials.distinct("chemsys", q))
+        self.logger.debug(f"Found {len(new_mat_chemsys)} new chemical systems")
+
+        return new_mat_chemsys
+
+    def get_affected_chemsys(self, chemical_systems: Set) -> Set:
+        """Gets chemical systems affected by changes in the supplied chemical systems"""
+        # First get all chemsys with any of the elements we've marked
+        affected_chemsys = set()
+        affected_els = list({el for c in chemical_systems for el in c.split("-")})
+        possible_affected_chemsys = self.materials.distinct(
+            "chemsys", {"elements": {"$in": affected_els}}
+        )
+
+        sub_chemsys = defaultdict(list)
+        # Build a dictionary mapping sub_chemsys to all super_chemsys
+        for chemsys in possible_affected_chemsys:
+            for permutation in chemsys_permutations(chemsys):
+                sub_chemsys[permutation].append(chemsys)
+
+        # Select and merge distinct super chemsys from sub_chemsys
+        for chemsys in chemical_systems:
+            affected_chemsys |= set(sub_chemsys[chemsys])
+
+        self.logger.debug(
+            f"Found {len(affected_chemsys)} chemical systems affected by this build"
+        )
+
+        return affected_chemsys
diff --git a/emmet-builders/emmet/builders/cp2k/utils.py b/emmet-builders/emmet/builders/cp2k/utils.py
new file mode 100644
index 0000000000..e3bd98c8a3
--- /dev/null
+++ b/emmet-builders/emmet/builders/cp2k/utils.py
@@ -0,0 +1,113 @@
+from typing import List
+from copy import deepcopy
+import numpy as np
+from pymatgen.ext.matproj import MPRester
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from pymatgen.analysis.structure_matcher import StructureMatcher
+
+
+def unpack(query, d):
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
+
+
+# TODO Move polaron compare to a different function to make validation easier
+# TODO Matching should have have an easier way of setting distance tolerance
+    # can't this all be done inside of structure matcher to find mapping or something?
+def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=None):
+    matching_indices = []
+    dis = []
+
+    for j in range(len(defect_struc)):
+        for i in range(len(bulk_struc)):
+            if (
+                    bulk_struc[i].species == defect_struc[j].species
+                    and bulk_struc[i].distance(defect_struc[j]) < .02
+                    and bulk_struc[i].properties.get('ghost', False) == defect_struc[j].properties.get('ghost', False)
+            ):
+                matching_indices.append((i, j))
+                dis.append(j)
+                break
+
+    def_index = list(set(range(len(defect_struc))).difference(set(dis)))
+
+    # Consider a possible polaron
+    if len(def_index) == 0:
+        matching_indices = []
+
+        for j in range(len(defect_struc)):
+            for i in range(len(bulk_struc)):
+                if (
+                        bulk_struc[i].specie.as_dict().get('element') == defect_struc[j].specie.as_dict().get('element')
+                        and bulk_struc[i].distance(defect_struc[j]) < .02
+                        and bulk_struc[i].properties.get('ghost', False) == defect_struc[j].properties.get('ghost',
+                                                                                                           False)):
+                    matching_indices.append((i, j))
+                    break
+
+        oxi_diff = [
+            abs(
+                final_defect_struc.site_properties['oxidation_state'][d] -
+                final_bulk_struc.site_properties['oxidation_state'][b]
+            )
+            for b, d in matching_indices
+        ]
+        def_index = np.argmax(oxi_diff)
+        matching_indices.pop(def_index)
+        return def_index, matching_indices
+
+    elif len(def_index) > 1:
+        print(def_index)
+        raise ValueError("The provided defect structure and bulk structure "
+                         "have more than one potential defect site")
+
+    return def_index[0], matching_indices
+
+
+def get_dielectric(mpid):
+    with MPRester() as mp:
+        dat = mp.get_data(mpid, prop='diel')
+        band_gap = mp.get_data(mpid, prop='band_gap')[0]['band_gap']
+    if band_gap == 0.0:
+        return np.inf
+    try:
+        return dat[0]['diel']['e_total']
+    except:
+        return None
+
+def get_mpid(s):
+    struc = deepcopy(s)
+    struc.remove_oxidation_states()
+    struc.remove_spin()
+    for p in struc.site_properties:
+        struc.remove_site_property(p)
+    sga = SpacegroupAnalyzer(struc)
+    sm = StructureMatcher()
+    with MPRester() as mp:
+        dat = mp.query(
+            criteria={
+                'chemsys': struc.composition.chemical_system,
+                'spacegroup.symbol': sga.get_space_group_symbol()
+            },
+            properties=['material_id', 'structure']
+        )
+    dat = list(filter(lambda x: sm.fit(s, x['structure']), dat))
+    if len(dat) == 1:
+        return dat[0]['material_id']
+    return None
+
+
+def synchronous_query(store1, store2, query, properties=None):
+    for t in store1.query(query, properties):
+        for key in t.keys():
+            if isinstance(t[key], dict) and t[key].get('gfs_id'):
+                t.update(
+                    {
+                        k: v for k,v in next(store2.query({"_id": t[key].get('gfs_id')}, properties)).items()
+                        if properties is None or k in properties
+                    }
+                    )
+        yield t
diff --git a/emmet-builders/emmet/builders/materials/dielectric.py b/emmet-builders/emmet/builders/materials/dielectric.py
index 68d9a8aa86..262dbcf5ae 100644
--- a/emmet-builders/emmet/builders/materials/dielectric.py
+++ b/emmet-builders/emmet/builders/materials/dielectric.py
@@ -74,7 +74,6 @@ def get_items(self):
 
         for mat in mats:
             doc = self._get_processed_doc(mat)
-
             if doc is not None:
                 yield doc
             else:
@@ -151,7 +150,7 @@ def _get_processed_doc(self, mat):
                     "output.epsilon_ionic",
                     "output.bandgap",
                 ],
-                criteria={self.tasks.key: str(task_id)},
+                criteria={self.tasks.key: int(task_id)},
             )
 
             if task_query["output"]["bandgap"] > 0:
diff --git a/emmet-builders/emmet/builders/materials/electronic_structure.py b/emmet-builders/emmet/builders/materials/electronic_structure.py
index e84d6aa1e0..8bc99ff18b 100644
--- a/emmet-builders/emmet/builders/materials/electronic_structure.py
+++ b/emmet-builders/emmet/builders/materials/electronic_structure.py
@@ -17,7 +17,7 @@
 from emmet.core.settings import EmmetSettings
 from emmet.core.electronic_structure import ElectronicStructureDoc
 from emmet.core.utils import jsanitize
-
+import bson
 SETTINGS = EmmetSettings()
 
 
@@ -367,7 +367,6 @@ def _update_materials_doc(self, mat_id):
             if "NSCF Line" in mat["task_types"][task_id]:
 
                 bs_type = None
-
                 task_query = self.tasks.query_one(
                     properties=[
                         "calcs_reversed",
@@ -378,7 +377,7 @@ def _update_materials_doc(self, mat_id):
                         "input.parameters",
                         "output.structure",
                     ],
-                    criteria={"task_id": str(task_id)},
+                    criteria={"task_id": int(task_id)},
                 )
 
                 fs_id = str(task_query["calcs_reversed"][0]["bandstructure_fs_id"])
@@ -441,7 +440,7 @@ def _update_materials_doc(self, mat_id):
                         "input.parameters",
                         "output.structure",
                     ],
-                    criteria={"task_id": str(task_id)},
+                    criteria={"task_id": int(task_id)},
                 )
 
                 fs_id = str(task_query["calcs_reversed"][0]["dos_fs_id"])
@@ -493,7 +492,7 @@ def _update_materials_doc(self, mat_id):
                         "calcs_reversed",
                         "output.structure",
                     ],
-                    criteria={"task_id": str(task_id)},
+                    criteria={"task_id": int(task_id)},
                 )
 
                 structure = Structure.from_dict(task_query["output"]["structure"])
@@ -566,11 +565,11 @@ def _obtain_blessed_calculations(
                 ]
 
                 bs_obj = self.bandstructure_fs.query_one(
-                    criteria={"fs_id": sorted_bs_data[0]["fs_id"]}
+                    criteria={"_id": bson.ObjectId(sorted_bs_data[0]["fs_id"])}
                 )
 
                 materials_doc["bandstructure"][bs_type]["object"] = (
-                    bs_obj["data"] if bs_obj is not None else None
+                    bs_obj if bs_obj is not None else None
                 )
 
                 materials_doc["bandstructure"][bs_type][
@@ -593,12 +592,11 @@ def _obtain_blessed_calculations(
             materials_doc["dos"]["task_id"] = sorted_dos_data[0]["task_id"]
 
             materials_doc["dos"]["lmaxmix"] = sorted_dos_data[0]["lmaxmix"]
-
             dos_obj = self.dos_fs.query_one(
-                criteria={"fs_id": sorted_dos_data[0]["fs_id"]}
+                criteria={"_id": bson.ObjectId(sorted_dos_data[0]["fs_id"])}
             )
             materials_doc["dos"]["object"] = (
-                dos_obj["data"] if dos_obj is not None else None
+                dos_obj if dos_obj is not None else None
             )
 
             materials_doc["dos"]["output_structure"] = sorted_dos_data[0][
diff --git a/emmet-builders/emmet/builders/materials/materials.py b/emmet-builders/emmet/builders/materials/materials.py
new file mode 100644
index 0000000000..322bdc1017
--- /dev/null
+++ b/emmet-builders/emmet/builders/materials/materials.py
@@ -0,0 +1,336 @@
+"""
+Unified materials builder for tasks docs from different calc codes.
+"""
+
+from datetime import datetime
+from itertools import chain, groupby
+import itertools
+from math import ceil
+from typing import Dict, Iterable, Iterator, List, Optional, Union
+
+from maggma.builders import Builder
+from maggma.stores import Store
+from maggma.utils import grouper
+
+from emmet.builders.settings import EmmetBuildSettings
+from emmet.core.utils import group_structures, jsanitize
+from pymatgen.core import Structure
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+
+SETTINGS = EmmetBuildSettings()
+
+class MaterialsBuilder(Builder):
+    """
+    The Materials Builder matches VASP task documents by structure similarity into materials
+    document. The purpose of this builder is group calculations and determine the best structure.
+    All other properties are derived from other builders.
+
+    The process is as follows:
+
+        1.) Find all documents with the same formula
+        2.) Select only task documents for the task_types we can select properties from
+        3.) Aggregate task documents based on strucutre similarity
+        4.) Create a MaterialDoc from the group of task documents
+        5.) Validate material document
+
+    """
+
+    def __init__(
+        self,
+        tasks: Union[Store, Iterable[Store]],
+        materials: Store,
+        task_validation: Store,
+        query: Optional[Dict] = None,
+        settings: Optional[EmmetBuildSettings] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            materials: Store of materials documents to generate
+            task_validation: Store for storing task validation results
+            query: dictionary to limit tasks to be analyzed
+            settings: EmmetSettings to use in the build process
+        """
+
+        self.tasks = tasks if isinstance(tasks, Iterable) else [tasks]
+        self.materials = materials
+        self.task_validation = task_validation
+        self.query = query if query else {}
+        self.settings = EmmetBuildSettings.autoload(settings)
+        
+        if len(set(t.key for t in self.tasks)) != 1:
+            raise ValueError("All task stores must have the same key")
+
+        self._tasks_key = self.tasks[0].key
+        self._TASK_DOCS = dict()
+        self._MAT_DOCS = dict()
+        for calc_code in self.settings.ALLOWED_CALC_CODES:
+            mod = __import__(f"emmet.core.{calc_code.lower()}.task", globals(), locals(), ["TaskDocument"], 0)
+            self._TASK_DOCS[calc_code] = getattr(mod, "TaskDocument") 
+            mod = __import__(f"emmet.core.{calc_code.lower()}.material", globals(), locals(), ["MaterialsDoc"], 0)
+            self._MAT_DOCS[calc_code] = getattr(mod, "MaterialsDoc")
+
+        self.kwargs = kwargs
+
+        sources = self.tasks + [self.task_validation]
+        super().__init__(sources=sources, targets=[materials], **kwargs)
+
+    def _get_allowed_task_types(self, calc_code):
+        if calc_code == 'vasp':
+            return self.settings.VASP_ALLOWED_VASP_TYPES
+        if calc_code == 'cp2k':
+            return self.settings.CP2K_ALLOWED_CP2K_TYPES
+
+    @property
+    def tasks_key(self):
+        return self._tasks_key
+
+    @property
+    def task_docs(self):
+        """
+        Returns:
+            dict: A dictionary of task document classes
+        """
+        return self._TASK_DOCS
+    
+    @property
+    def mat_docs(self):
+        """
+        Returns:
+            dict: A dictionary of material document classes
+        """
+        return self._MAT_DOCS
+
+    def ensure_indexes(self):
+        """
+        Ensures indicies on the tasks and materials collections
+        """
+
+        for tasks in self.tasks:
+            # Basic search index for tasks
+            tasks.ensure_index("task_id")
+            tasks.ensure_index("last_updated")
+            tasks.ensure_index("state")
+            tasks.ensure_index("formula_pretty")
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:  # pragma: no cover
+        """Prechunk the materials builder for distributed computation"""
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            temp_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            temp_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        self.logger.info("Finding tasks to process")
+        all_tasks = list(
+            self.tasks.query(temp_query, [self.tasks.key, "formula_pretty"])
+        )
+
+        processed_tasks = set(self.materials.distinct("task_ids"))
+        to_process_tasks = {d[self.tasks.key] for d in all_tasks} - processed_tasks
+        to_process_forms = {
+            d["formula_pretty"]
+            for d in all_tasks
+            if d[self.tasks.key] in to_process_tasks
+        }
+
+        N = ceil(len(to_process_forms) / number_splits)
+
+        for formula_chunk in grouper(to_process_forms, N):
+
+            yield {"query": {"formula_pretty": {"$in": list(formula_chunk)}}}
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into materials documents.
+        This does no datetime checking; relying on on whether
+        task_ids are included in the Materials Colection
+
+        Returns:
+            generator or list relevant tasks and materials to process into materials documents
+        """
+
+        self.logger.info("Materials builder started")
+        for t in self.settings.ALLOWED_CALC_CODES:
+            tt = getattr(self.settings, f"{t.upper()}_ALLOWED_TASK_TYPES") 
+            self.logger.info(
+                f"Allowed {t} task types: {[task_type.value for task_type in tt]}"
+            )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for material documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all processed tasks:
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            temp_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            temp_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        self.logger.info("Finding tasks to process")
+        self.logger.info(self.tasks)
+        all_tasks = []
+        for tasks in self.tasks:
+            all_tasks.extend(
+               [t for t in tasks.query(temp_query, [self.tasks_key, "formula_pretty"])] 
+            )
+        processed_tasks = set(self.materials.distinct("task_ids"))
+        to_process_tasks = {d[self.tasks_key] for d in all_tasks} - processed_tasks
+        to_process_forms = {
+            d["formula_pretty"]
+            for d in all_tasks
+            if d[self.tasks_key] in to_process_tasks
+        }
+
+        self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks")
+        self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas")
+
+        # Set total for builder bars to have a total
+        self.total = len(to_process_forms)
+
+        validation = {
+            doc[self.tasks_key]: {'valid': doc["valid"], 'calc_code': doc['calc_code']}
+            for doc in self.task_validation.query(
+                criteria={"calc_code": {"$exists": True}, "valid": {"$exists": True}},
+                properties=[self.task_validation.key, "valid", "calc_code"],
+            )
+        }
+
+        for formula in to_process_forms:
+            tasks_query = dict(temp_query)
+            tasks_query["formula_pretty"] = formula
+            tasks = []
+            for t in self.tasks:
+                tasks.extend(
+                    [tsk for tsk in t.query(criteria=tasks_query, properties=None)]
+                )
+
+            for t in tasks:
+                t["is_valid"] = validation[t[self.tasks_key]]['valid']
+                t["calc_code"] = validation[t[self.tasks_key]]['calc_code']
+
+            yield tasks
+
+    def process_item(self, items: List[Dict]) -> List[Dict]:
+        """
+        Process the tasks into a list of materials
+
+        Args:
+            tasks [dict] : a list of task docs
+
+        Returns:
+            ([dict],list) : a list of new materials docs and a list of task_ids that were processsed
+        """
+
+        materials = []
+        for structure_group in self.group_tasks(items):
+            materials.append([])
+            sorted_structures = sorted(structure_group, key=lambda x: x['calc_code'])
+            self.logger.debug(f"Processing {len(sorted_structures)} tasks for a structure")
+
+            for key, task_group in groupby(sorted_structures, key=lambda x: x["calc_code"]):
+                self.logger.debug(f"Processing tasks for the calc_code {key}")
+
+                tasks = [self.task_docs[key.upper()](**task) for task in task_group]
+                formula = tasks[0].formula_pretty
+                task_ids = [task.task_id for task in tasks]
+                self.logger.debug(f"Processing {formula} : {task_ids}")
+
+                try:
+                    doc = self.mat_docs[key.upper()].from_tasks(
+                            tasks,
+                            quality_scores=self.settings.VASP_QUALITY_SCORES,
+                            use_statics=self.settings.VASP_USE_STATICS,
+                        )
+                    materials[-1].append(doc)
+                    
+                except Exception as e:
+                    failed_ids = list({t_.task_id for t_ in tasks})
+                    doc = self.mat_docs[key.upper()].construct_deprecated_material(tasks)
+                    doc.warnings.append(str(e))
+                    materials[-1].append(doc)
+                    self.logger.warn(
+                        f"Failed making material for {failed_ids}."
+                        f" Inserted as deprecated Material: {doc.material_id}"
+                    )
+
+                if key == 'vasp':
+                    material_id = doc.material_id
+            
+            for m in materials[-1]:
+                m.material_id = material_id
+        
+        self.logger.debug(f"Produced {len(materials)} materials for {formula}")
+        mats = list(itertools.chain.from_iterable(materials))
+        return jsanitize([m.dict() for m in mats], allow_bson=True)
+
+    def update_targets(self, items: List[List[Dict]]):
+        """
+        Inserts the new task_types into the task_types collection
+
+        Args:
+            items ([([dict],[int])]): A list of tuples of materials to update and the corresponding
+                processed task_ids
+        """
+
+        docs = list(chain.from_iterable(items))  # type: ignore
+
+        for item in docs:
+            item.update({"_bt": self.timestamp})
+
+        material_ids = list({item["material_id"] for item in docs})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(docs)} materials")
+            self.materials.remove_docs({self.materials.key: {"$in": material_ids}})
+            self.materials.update(
+                docs=docs, key=["material_id"],
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def group_tasks(
+        self, tasks: List
+    ) -> Iterator[List]:
+        """
+        Groups tasks by structure matching
+        """
+        structures = []
+
+        for idx, task in enumerate(tasks):
+            s = Structure.from_dict(task['output']['structure'])
+            s.index: int = idx  # type: ignore
+            structures.append(s)
+
+        grouped_structures = group_structures(
+            structures,
+            ltol=self.settings.LTOL,
+            stol=self.settings.STOL,
+            angle_tol=self.settings.ANGLE_TOL,
+            symprec=self.settings.SYMPREC,
+        )
+        for group in grouped_structures:
+            grouped_tasks = [tasks[struc.index] for struc in group]  # type: ignore
+            yield grouped_tasks
diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index 83f48c4876..9cfadbad50 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -9,6 +9,7 @@
 from emmet.core.settings import EmmetSettings
 from emmet.core.vasp.calc_types import TaskType as VaspTaskType
 from emmet.core.qchem.calc_types import TaskType as QChemTaskType
+from emmet.core.cp2k.calc_types import TaskType as Cp2kTaskType
 
 
 class EmmetBuildSettings(EmmetSettings):
@@ -30,7 +31,12 @@ class EmmetBuildSettings(EmmetSettings):
         [], description="Tags for calculations to deprecate"
     )
 
-    VASP_ALLOWED_VASP_TYPES: List[VaspTaskType] = Field(
+    ALLOWED_CALC_CODES: List[str] = Field(
+        ['VASP', 'CP2K'],
+        description="Calculation codes to allow in the build pipeline"
+    )
+
+    VASP_ALLOWED_TASK_TYPES: List[VaspTaskType] = Field(
         [t.value for t in VaspTaskType],
         description="Allowed task_types to build materials from",
     )
@@ -40,6 +46,11 @@ class EmmetBuildSettings(EmmetSettings):
         description="Allowed task_types to build molecules from",
     )
 
+    CP2K_ALLOWED_TASK_TYPES: List[Cp2kTaskType] = Field(
+        [t.value for t in Cp2kTaskType],
+        description="Allowed CP2K task_types to build materials from",
+    )
+
     DEFAULT_REFERENCE: str = Field(
         "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
         "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
diff --git a/emmet-builders/emmet/builders/vasp/defects.py b/emmet-builders/emmet/builders/vasp/defects.py
new file mode 100644
index 0000000000..09424127e2
--- /dev/null
+++ b/emmet-builders/emmet/builders/vasp/defects.py
@@ -0,0 +1,936 @@
+"""
+This module contains 2 builders for defect properties in non-metals.
+
+1.  The DefectBuilder builds individual Defect documents
+    with bulk and dielectric properties included, along
+    with properties neccessary for accessing delocalization
+    and parsing further defect thermodynamics.
+2.  The DefectThermoBuilder builds DefectPhaseDiagram
+    documents from defect objects with identical bulk structures
+    and calculation metadata.
+"""
+
+from datetime import datetime
+import numpy as np
+
+from monty.json import MontyDecoder, jsanitize
+
+from pymatgen import Structure, MPRester, PeriodicSite
+from pymatgen.analysis.structure_matcher import StructureMatcher, PointDefectComparator
+from pymatgen.electronic_structure.bandstructure import BandStructure
+from pymatgen.analysis.defects.core import Vacancy, Substitution, Interstitial, DefectEntry
+from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram
+from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+
+from maggma.builder import Builder
+
+
+__author__ = "Danny Broberg, Shyam Dwaraknath"
+
+
+class DefectBuilder(Builder):
+    def __init__(self,
+                 tasks,
+                 defects,
+                 query=None,
+                 compatibility=DefectCompatibility(),
+                 ltol=0.2,
+                 stol=0.3,
+                 angle_tol=5,
+                 max_items_size=0,
+                 update_all=False,
+                 **kwargs):
+        """
+        Creates DefectEntry from vasp task docs
+
+        Args:
+            tasks (Store): Store of tasks documents
+            defects (Store): Store of defect entries with all metadata required for followup decisions on defect thermo
+            query (dict): dictionary to limit materials to be analyzed
+            compatibility (DefectCompatability): Compatability module to ensure defect calculations are compatible
+            ltol (float): StructureMatcher tuning parameter for matching tasks to materials
+            stol (float): StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol (float): StructureMatcher tuning parameter for matching tasks to materials
+            max_items_size (int): limits number of items approached from tasks (zero places no limit on number of items)
+            update_all (bool): Whether to consider all task ids from defects store.
+                Default is False (will not re-consider task-ids which have been considered previously.
+        """
+
+        self.tasks = tasks
+        self.defects = defects
+        self.query = query if query else {}
+        self.compatibility = compatibility
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.max_items_size = max_items_size
+        self.update_all=update_all
+        super().__init__(sources=[tasks], targets=[defects], **kwargs)
+
+    def get_items(self):
+        """
+        Gets sets of entries from chemical systems that need to be processed
+
+        Returns:
+            generator of relevant entries from one chemical system
+
+        """
+        self.logger.info("Defect Builder Started")
+
+        self.logger.info("Setting indexes")
+        self.ensure_indicies()
+
+        # Save timestamp for update operation
+        self.time_stamp = datetime.utcnow()
+
+        # Get all successful defect tasks that have been updated since
+        # defect_store was last updated
+        q = dict(self.query)
+        q["state"] = "successful"
+        if not self.update_all:
+            if 'task_id' in q:
+                if isinstance(q['task_id'], int) or isinstance(q['task_id'], float):
+                    q['task_id'] = {'$nin': [], '$in': [q['task_id']]}
+                if '$nin' in q['task_id']:
+                    q['task_id']['$nin'].extend( self.defects.distinct('entry_id'))
+                else:
+                    q['task_id'].update( {'$nin': self.defects.distinct('entry_id')})
+            else:
+                q.update({'task_id': {'$nin': self.defects.distinct('entry_id')}})
+
+        q.update({'transformations.history.@module':
+                      {'$in': ['pymatgen.transformations.defect_transformations']}})
+        defect_tasks = list(self.tasks.query(criteria=q,
+                                             properties=['task_id', 'transformations.history.0.defect.structure']))
+        if self.max_items_size and len(defect_tasks) > self.max_items_size:
+            defect_tasks = [dtask for dind, dtask in enumerate(defect_tasks) if dind < self.max_items_size]
+        task_ids = [dtask['task_id'] for dtask in defect_tasks]
+        self.logger.info("Found {} new defect tasks to consider:\n{}".format( len(defect_tasks), task_ids))
+
+        # get a few other tasks which are needed for defect entries (regardless of when they were last updated):
+        # bulk_supercell, dielectric calc, BS calc, HSE-BS calc
+        bulksc = {"state": "successful", 'transformations.history.0.@module':
+            {'$in': ['pymatgen.transformations.standard_transformations']}}
+        dielq = {"state": "successful", "input.incar.LEPSILON": True, "input.incar.LPEAD": True}
+        HSE_BSq = {"state": "successful", 'calcs_reversed.0.input.incar.LHFCALC': True,
+                   'transformations.history.0.@module':
+                        {'$nin': ['pymatgen.transformations.defect_transformations',
+                                  'pymatgen.transformations.standard_transformations']}}
+        # TODO: add smarter capability for getting HSE bandstructure from tasks
+        # TODO: add capability for getting GGA bandstructure from tasks?
+
+        # now load up all defect tasks with relevant information required for process_item step
+        # includes querying bulk, diel and hybrid calcs as you go along.
+        log_additional_tasks = dict() #to minimize number of bulk + diel + hse queries, log by chemsys
+
+        needed_defect_properties = ['task_id', 'transformations', 'input',
+                                    'task_label', 'last_updated',
+                                    'output', 'calcs_reversed', 'chemsys']
+        needed_bulk_properties = ['task_id', 'chemsys', 'task_label',
+                                  'last_updated', 'transformations',
+                                  'input', 'output', 'calcs_reversed']
+        for d_task_id in task_ids:
+            d_task = list(self.tasks.query(criteria={"task_id": d_task_id},
+                                           properties=needed_defect_properties))[0]
+            chemsys = "-".join(sorted((Structure.from_dict(
+                d_task['transformations']['history'][0]['defect']['structure']).symbol_set)))
+
+            if chemsys not in log_additional_tasks.keys():
+                #grab all bulk calcs for chemsys
+                q = bulksc.copy()
+                q.update( {"chemsys": chemsys})
+                bulk_tasks = list(self.tasks.query(criteria=q, properties=needed_bulk_properties))
+
+                #grab all diel calcs for chemsys
+                q = dielq.copy()
+                q.update( {"chemsys": chemsys})
+                diel_tasks = list(self.tasks.query(criteria=q,
+                                                   properties=['task_id', 'task_label', 'last_updated',
+                                                               'input', 'output']))
+
+                #grab all hse bs calcs for chemsys
+                q = HSE_BSq.copy()
+                q.update( {"chemsys": chemsys})
+                hybrid_tasks = list(self.tasks.query(criteria=q,
+                                                     properties=['task_id', 'input',
+                                                                 'output', 'task_label']))
+
+                self.logger.debug("\t{} has {} bulk loaded {} diel and {} hse"
+                                  "".format( chemsys, len(bulk_tasks),
+                                             len(diel_tasks), len(hybrid_tasks)))
+
+                log_additional_tasks.update({chemsys: {'bulksc': bulk_tasks[:],
+                                                       'diel': diel_tasks[:],
+                                                       'hsebs': hybrid_tasks[:]}})
+
+            yield [d_task, log_additional_tasks[chemsys]]
+
+    def process_item(self, item):
+        """
+        Process a defect item (containing defect, bulk and dielectric information as processed in get_items)
+
+        Args:
+            item (defect_task): a defect_task to process into a DefectEntry object
+
+        Returns:
+            dict: a DefectEntry dictionary to update defect database with
+        """
+        d_task, chemsys_additional_tasks = item
+
+        defect_task = self.find_and_load_bulk_tasks(d_task, chemsys_additional_tasks)
+        if defect_task is None:
+            self.logger.error("Could not determine defect bulk properties for {}".format( d_task['task_id']))
+            return
+        elif 'bulk_task' not in defect_task.keys():
+            self.logger.error("bulk_task is not in item! Cannot parse task id = {}.".format( d_task['task_id']))
+            return
+        elif 'diel_task_meta' not in defect_task.keys():
+            self.logger.error("diel_task_meta is not in item! Cannot parse task id = {}.".format( d_task['task_id']))
+            return
+
+        #initialize parameters with dielectric data
+        eps_ionic = defect_task['diel_task_meta']['epsilon_ionic']
+        eps_static = defect_task['diel_task_meta']['epsilon_static']
+        eps_total = []
+        for i in range(3):
+            eps_total.append([e[0]+e[1] for e in zip(eps_ionic[i], eps_static[i])])
+        parameters = {'epsilon_ionic': eps_ionic, 'epsilon_static':  eps_static,
+                      'dielectric': eps_total,
+                      'task_level_metadata':
+                          {'diel_taskdb_task_id': defect_task['diel_task_meta']['diel_taskdb_task_id']}}
+
+        parameters = self.get_run_metadata( defect_task, parameters)
+
+        # add bulk data to parameters
+        bulk_task = defect_task['bulk_task']
+        bulk_energy = bulk_task['output']['energy']
+        bulk_sc_structure = Structure.from_dict( bulk_task['input']['structure'])
+        parameters.update( {'bulk_energy': bulk_energy, 'bulk_sc_structure': bulk_sc_structure})
+
+        parameters = self.get_bulk_gap_data( bulk_task, parameters)
+        parameters = self.get_bulk_chg_correction_metadata( bulk_task, parameters)
+
+
+        # Add defect data to parameters
+        defect_energy = defect_task['output']['energy']
+        parameters.update({'defect_energy': defect_energy})
+
+        defect, parameters = self.load_defect_and_structure_data( defect_task, parameters)
+
+        parameters = self.load_defect_chg_correction_metadata( defect, defect_task, parameters)
+
+        if 'vr_eigenvalue_dict' in defect_task['calcs_reversed'][0]['output'].keys():
+            eigenvalues = defect_task['calcs_reversed'][0]['output']['vr_eigenvalue_dict']['eigenvalues']
+            kpoint_weights = defect_task['calcs_reversed'][0]['output']['vr_eigenvalue_dict']['kpoint_weights']
+            parameters.update( {'eigenvalues': eigenvalues,
+                                'kpoint_weights': kpoint_weights} )
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have eigenvalue data for parsing '
+                  'bandfilling.'.format(defect_task['task_label'], defect_task['task_id']))
+
+        if 'defect' in defect_task['calcs_reversed'][0]['output'].keys():
+            parameters.update( {'defect_ks_delocal_data':
+                                    defect_task['calcs_reversed'][0]['output']['defect'].copy()})
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have defect data for parsing '
+                  'delocalization.'.format(defect_task['task_label'], defect_task['task_id']))
+
+        defect_entry = DefectEntry( defect, parameters['defect_energy'] - parameters['bulk_energy'],
+                                    corrections = {}, parameters = parameters, entry_id= defect_task['task_id'])
+
+        defect_entry = self.compatibility.process_entry( defect_entry)
+        defect_entry.parameters = jsanitize( defect_entry.parameters, strict=True, allow_bson=True)
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id'] #this seemed neccessary for legacy db
+
+        return defect_entry_as_dict
+
+    def update_targets(self, items):
+        """
+        Inserts the defect docs into the defect collection
+
+        Args:
+            items ([dict]): a list of defect entries as dictionaries
+        """
+        items = [item for item in items if item]
+        self.logger.info("Updating {} defect documents".format(len(items)))
+
+        self.defects.update(items, update_lu=True, key='entry_id')
+
+    def ensure_indicies(self):
+        """
+        Ensures indicies on the tasks and defects collections
+        :return:
+        """
+        # Search indicies for tasks
+        self.tasks.ensure_index(self.tasks.key, unique=True)
+        self.tasks.ensure_index("chemsys")
+
+        # Search indicies for defects
+        self.defects.ensure_index(self.defects.key, unique=True)
+        self.defects.ensure_index("chemsys")
+
+    def find_and_load_bulk_tasks(self, defect_task, additional_tasks):
+        """
+        This takes defect_task and finds bulk task, diel task, and other things as appropriate (example hse_data...)
+
+        needs to make sure INCAR settings and structure matching works for all
+
+        :param defect_task:
+        :param additional_tasks:
+        :return:
+        """
+        out_defect_task = defect_task.copy()
+
+        # get suitable BULK calc by matching structures (right lattice
+        # constant + same supercell size...) and matching essential INCAR + POTCAR settings...
+        bulk_tasks = additional_tasks['bulksc']
+        bulk_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+            primitive_cell=False, scale=False, attempt_supercell=False, allow_subset=False)
+        bulk_matched = []
+        dstruct_withoutdefect = Structure.from_dict(out_defect_task['transformations']['history'][0]['defect']['structure'])
+        scaling_matrix = out_defect_task['transformations']['history'][0]['scaling_matrix']
+        dstruct_withoutdefect.make_supercell( scaling_matrix)
+        dincar = out_defect_task["calcs_reversed"][0]["input"]["incar"] #identify essential INCAR properties which differentiate different calcs
+        dincar_reduced = {k: dincar.get(k, None) if dincar.get(k) not in  ['None', 'False', False] else None
+                          for k in ["LHFCALC", "HFSCREEN", "IVDW", "LUSE_VDW", "LDAU", "METAGGA"]
+                          }
+        d_potcar_base = {'pot_spec': [potelt["titel"] for potelt in out_defect_task['input']['potcar_spec']],
+                         'pot_labels': out_defect_task['input']['pseudo_potential']['labels'][:],
+                         'pot_type': out_defect_task['input']['pseudo_potential']['pot_type'],
+                         'functional': out_defect_task['input']['pseudo_potential']['functional']}
+
+        for b_task in bulk_tasks:
+            bstruct = Structure.from_dict(b_task['input']['structure'])
+            if bulk_sm.fit( bstruct, dstruct_withoutdefect):
+                #also match essential INCAR and POTCAR settings
+                bincar = b_task["calcs_reversed"][0]["input"]["incar"]
+                bincar_reduced = {k: bincar.get(k, None) if bincar.get(k) not in ['None', 'False', False] else None
+                                  for k in dincar_reduced.keys()
+                                  }
+
+                b_potcar = {'pot_spec': set([potelt["titel"] for potelt in b_task['input']['potcar_spec']]),
+                            'pot_labels': set(b_task['input']['pseudo_potential']['labels'][:]),
+                            'pot_type': b_task['input']['pseudo_potential']['pot_type'],
+                            'functional': b_task['input']['pseudo_potential']['functional']}
+                d_potcar = d_potcar_base.copy() #need to reduce in cases of extrinsic species or reordering
+                d_potcar['pot_spec'] = set([d for d in d_potcar['pot_spec'] if d in b_potcar['pot_spec']])
+                d_potcar['pot_labels'] = set([d for d in d_potcar['pot_labels'] if d in b_potcar['pot_labels']])
+
+                #track to make sure that cartesian coords are same (important for several levels of analysis in defect builder)
+                same_cart_positions = True
+                for bsite_coords in bstruct.cart_coords:
+                    if not len( dstruct_withoutdefect.get_sites_in_sphere(bsite_coords, .1)):
+                        same_cart_positions = False
+
+                if bincar_reduced == dincar_reduced and b_potcar == d_potcar and same_cart_positions:
+                    bulk_matched.append( b_task.copy())
+                else:
+                    try:
+                        self.logger.debug("Bulk structure match was found for {} with {}, "
+                                          "but:".format( b_task['task_label'], out_defect_task['task_label']))
+                    except:
+                        self.logger.debug("BULK STRUCT MATCH was found, but task_label did not exist AND:")
+                    if bincar_reduced != dincar_reduced:
+                        out_inc = {k:[v, bincar_reduced[k]] for k,v in dincar_reduced.items() if v != bincar_reduced[k]}
+                        self.logger.debug("\tIncars were different: {} ".format( out_inc))
+                    if b_potcar != d_potcar:
+                        out_pot = {k:[v, b_potcar[k]] for k,v in d_potcar.items() if v != b_potcar[k]}
+                        self.logger.debug("\tPotcar specs were different: {} ".format( out_pot))
+                    if not same_cart_positions:
+                        self.logger.debug("\tBulk site coords were different")
+
+        #if bulk_task found then take most recently updated bulk_task for defect
+        if len( bulk_matched):
+            bulk_matched = sorted( bulk_matched, key=lambda x: x["last_updated"], reverse=True)
+            out_defect_task["bulk_task"] = bulk_matched[0].copy()
+            self.logger.debug("Found {} possible bulk supercell structures. Taking most recent entry updated "
+                  "on: {}".format(len(bulk_matched), bulk_matched[0]['last_updated']))
+        else:
+            self.logger.error("Bulk task doesnt exist for: {} ({})! Cant create defect "
+                             "object...\nMetadata: {}\n{}".format( out_defect_task['task_label'],
+                                                                   out_defect_task['task_id'],
+                                                                   d_potcar, dincar_reduced))
+            return None
+
+
+        # get suitable dielectric calc by matching structures (only lattice
+        # constant fitting needed - not supercell) and POTCAR settings...
+        diel_task_list = additional_tasks['diel']
+        diel_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+            primitive_cell=True, scale=False, attempt_supercell=True, allow_subset=False)
+        diel_matched = []
+        for diel_task in diel_task_list:
+            diel_struct = Structure.from_dict( diel_task['input']['structure'])
+            if diel_sm.fit( diel_struct, dstruct_withoutdefect):
+                #also match essential POTCAR settings and confirm LVTOT = True and LVHAR = True
+
+                diel_potcar = {'pot_spec': set([potelt["titel"] for potelt in diel_task['input']['potcar_spec']]),
+                            'pot_labels': set(diel_task['input']['pseudo_potential']['labels'][:]),
+                            'pot_type': diel_task['input']['pseudo_potential']['pot_type'],
+                            'functional': diel_task['input']['pseudo_potential']['functional']}
+                d_potcar = d_potcar_base.copy() #need to reduce in cases of extrinsic species or reordering
+                d_potcar['pot_spec'] = set([d for d in d_potcar['pot_spec'] if d in diel_potcar['pot_spec']])
+                d_potcar['pot_labels'] = set([d for d in d_potcar['pot_labels'] if d in diel_potcar['pot_labels']])
+
+                if diel_potcar == d_potcar:
+                    diel_matched.append( diel_task.copy())
+                else:
+                    try:
+                        self.logger.debug("Dielectric structure match was found for {} with {}, "
+                                          "but:".format( diel_task['task_label'], out_defect_task['task_label']))
+                    except:
+                        self.logger.debug("Dielectric STRUCT MATCH was found, but task_label did not exist AND:")
+                    out_pot = {k:[v, diel_potcar[k]] for k,v in d_potcar.items() if v != diel_potcar[k]}
+                    self.logger.debug("\tPotcar specs were different: {} ".format( out_pot))
+            # else:
+            #     self.logger.debug("{} ({}) had a structure which did not match {} for use "
+            #                       "as a dielectric calculation".format( diel_task['task_label'],
+            #                                                          diel_task['task_id'],
+            #                                                          out_defect_task['task_label']))
+
+        #if diel_tasks found then take most recently updated bulk_task for defect
+        if len( diel_matched):
+            diel_matched = sorted( diel_matched, key=lambda x: x["last_updated"], reverse=True)
+            diel_dict = {'diel_taskdb_task_id': diel_matched[0]['task_id'],
+                         'epsilon_static': diel_matched[0]['output']['epsilon_static'],
+                         'epsilon_ionic': diel_matched[0]['output']['epsilon_ionic']}
+            out_defect_task["diel_task_meta"] = diel_dict.copy()
+            self.logger.debug("Found {} possible dieletric calcs. Taking most recent entry updated "
+                  "on: {}".format(len(diel_matched), diel_matched[0]['last_updated']))
+        else:
+            try:
+                self.logger.error("Dielectric task doesnt exist for: {} ({})! Cant create defect "
+                                 "object...\nMetadata for defect: {}\n{}".format( out_defect_task['task_label'],
+                                                                                  out_defect_task['task_id'],
+                                                                                  d_potcar, dincar_reduced))
+            except:
+                self.logger.debug("DIEL TASK DNE.")
+
+            return None
+
+        # FINALLY consider grabbing extra hybrid BS information...
+        # first confirm from the INCAR setting that this defect is NOT an HSE calculation itself...
+        if dincar_reduced['LHFCALC'] in [None, False, 'False'] and len(additional_tasks['hsebs']):
+            hse_bs_matched = []
+            for hse_bs_task in additional_tasks['hsebs']:
+                hse_bs_struct = Structure.from_dict(hse_bs_task['input']['structure'])
+                if diel_sm.fit(hse_bs_struct, dstruct_withoutdefect): #can use same matching scheme as the dielectric structure matcher
+                    hse_bs_matched.append( hse_bs_task.copy())
+                else:
+                    self.logger.debug("task id {} has a structure which did not match {} for use "
+                                      "as an HSE BS calculation".format( hse_bs_task['task_id'],
+                                                                         out_defect_task['task_label']))
+
+            if len(hse_bs_matched): #match the lowest CBM  and highest VBM values, keeping track of their task_ids
+                hybrid_cbm_data = min([[htask['output']['cbm'], htask['task_id']] for htask in hse_bs_matched])
+                hybrid_vbm_data = max([[htask['output']['vbm'], htask['task_id']] for htask in hse_bs_matched])
+                hybrid_meta = {'hybrid_cbm': hybrid_cbm_data[0], 'hybrid_CBM_task_id': hybrid_cbm_data[1],
+                               'hybrid_vbm': hybrid_vbm_data[0], 'hybrid_VBM_task_id': hybrid_vbm_data[1]}
+                out_defect_task["hybrid_bs_meta"] = hybrid_meta.copy()
+                self.logger.debug("Found hybrid band structure properties for {}:\n\t{}".format( out_defect_task['task_label'],
+                                                                                               hybrid_meta))
+            else:
+                self.logger.debug("Could NOT find hybrid band structure properties for {} despite "
+                                  "there being {} eligible hse calculations".format( out_defect_task['task_label'],
+                                                                                     len(additional_tasks['hsebs'])))
+
+        return out_defect_task
+
+    def get_run_metadata(self, item, parameters):
+        bulk_task = item['bulk_task']
+
+        # load INCAR, KPOINTS, POTCAR and task_id (for both bulk and defect)
+        potcar_summary = {'pot_spec': list([potelt["titel"] for potelt in item['input']['potcar_spec']]),
+                          'pot_labels': list(item['input']['pseudo_potential']['labels'][:]),
+                          'pot_type': item['input']['pseudo_potential']['pot_type'],
+                          'functional': item['input']['pseudo_potential'][
+                              'functional']}  # note bulk has these potcar values also, other wise it would not get to process_items
+        dincar = item["input"]["incar"].copy()
+        dincar_reduced = {k: dincar.get(k, None) for k in ["LHFCALC", "HFSCREEN", "IVDW", "LUSE_VDW",
+                                                           "LDAU", "METAGGA"]}  # same as bulk
+        bincar = item["input"]["incar"].copy()
+        d_kpoints = item['calcs_reversed'][0]['input']['kpoints']
+        if type(d_kpoints) != dict:
+            d_kpoints = d_kpoints.as_dict()
+        b_kpoints = bulk_task['calcs_reversed'][0]['input']['kpoints']
+        if type(b_kpoints) != dict:
+            b_kpoints = b_kpoints.as_dict()
+
+        dir_name = item['calcs_reversed'][0]['dir_name']
+        bulk_dir_name = bulk_task['calcs_reversed'][0]['dir_name']
+
+        parameters['task_level_metadata'].update({'defect_dir_name': dir_name,
+                                                  'bulk_dir_name': bulk_dir_name,
+                                                  'bulk_taskdb_task_id': bulk_task['task_id'],
+                                                  'potcar_summary': potcar_summary.copy(),
+                                                  'incar_calctype_summary': dincar_reduced.copy(),
+                                                  'defect_incar': dincar.copy(),
+                                                  'bulk_incar': bincar.copy(),
+                                                  'defect_kpoints': d_kpoints.copy(),
+                                                  'bulk_kpoints': b_kpoints.copy(),
+                                                  'defect_task_last_updated': item['last_updated']})
+
+        if (dincar_reduced['HFSCREEN'] in [None, False, 'False']) and ("hybrid_bs_meta" in item):
+            parameters.update({k: item["hybrid_bs_meta"][k] for k in ['hybrid_cbm', 'hybrid_vbm']})
+            parameters.update({'hybrid_gap': parameters['hybrid_cbm'] - parameters['hybrid_vbm']})
+            parameters['task_level_metadata'].update({k: item["hybrid_bs_meta"][k] for k in ['hybrid_CBM_task_id',
+                                                                                             'hybrid_VBM_task_id']})
+        return parameters
+
+    def get_bulk_chg_correction_metadata(self, bulk_task, parameters):
+        if 'locpot' in bulk_task['calcs_reversed'][0]['output'].keys():
+            bulklpt = bulk_task['calcs_reversed'][0]['output']['locpot']
+            axes = list(bulklpt.keys())
+            axes.sort()
+            parameters.update( {'bulk_planar_averages': [bulklpt[ax] for ax in axes]} )
+        else:
+            self.logger.error('BULKTYPEcalc: {} (task-id {}) does not '
+                              'have locpot values for parsing'.format( bulk_task['task_label'],
+                                                                       bulk_task['task_id']))
+
+        if 'outcar' in bulk_task['calcs_reversed'][0]['output'].keys():
+            bulkoutcar = bulk_task['calcs_reversed'][0]['output']['outcar']
+            bulk_atomic_site_averages = bulkoutcar['electrostatic_potential']
+            parameters.update( {'bulk_atomic_site_averages': bulk_atomic_site_averages})
+        else:
+            self.logger.error('BULKTYPEcalc: {} (task-id {}) does not '
+                              'have outcar values for parsing'.format( bulk_task['task_label'],
+                                                                       bulk_task['task_id']))
+        return parameters
+
+    def get_bulk_gap_data(self, bulk_task, parameters):
+        bulk_structure = parameters['bulk_sc_structure']
+        try:
+            with MPRester() as mp:
+                # mplist = mp.find_structure(bulk_structure) #had to hack this because this wasnt working??
+                tmp_mplist = mp.get_entries_in_chemsys(list(bulk_structure.symbol_set))
+            mplist = [ment.entry_id for ment in tmp_mplist if ment.composition.reduced_composition == \
+                      bulk_structure.composition.reduced_composition]
+            #TODO: this is a hack because find_structure was data intensive. simplify the hack to do less queries...
+        except:
+            raise ValueError("Error with querying MPRester for {}".format( bulk_structure.composition.reduced_formula))
+
+        mpid_fit_list = []
+        for trial_mpid in mplist:
+            with MPRester() as mp:
+                mpstruct = mp.get_structure_by_material_id(trial_mpid)
+            if StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+                                primitive_cell=True, scale=False, attempt_supercell=True,
+                                allow_subset=False).fit(bulk_structure, mpstruct):
+                mpid_fit_list.append( trial_mpid)
+
+        if len(mpid_fit_list) == 1:
+            mpid = mpid_fit_list[0]
+            self.logger.debug("Single mp-id found for bulk structure:{}.".format( mpid))
+        elif len(mpid_fit_list) > 1:
+            num_mpid_list = [int(mp.split('-')[1]) for mp in mpid_fit_list]
+            num_mpid_list.sort()
+            mpid  = 'mp-'+str(num_mpid_list[0])
+            self.logger.debug("Multiple mp-ids found for bulk structure:{}\nWill use lowest number mpid "
+                  "for bulk band structure = {}.".format(str(mpid_fit_list), mpid))
+        else:
+            self.logger.debug("Could not find bulk structure in MP database after tying the "
+                              "following list:\n{}".format( mplist))
+            mpid = None
+
+        if mpid and not parameters['task_level_metadata']['incar_calctype_summary']['LHFCALC']:
+            #TODO: NEED to be smarter about use of +U etc in MP gga band structure calculations...
+            with MPRester() as mp:
+                bs = mp.get_bandstructure_by_material_id(mpid)
+
+            parameters['task_level_metadata'].update( {'MP_gga_BScalc_data':
+                                                           bs.get_band_gap().copy()} ) #contains gap kpt transition
+            cbm = bs.get_cbm()['energy']
+            vbm = bs.get_vbm()['energy']
+            gap = bs.get_band_gap()['energy']
+        else:
+            parameters['task_level_metadata'].update( {'MP_gga_BScalc_data': None}) #to signal no MP BS is used
+            cbm = bulk_task['output']['cbm']
+            vbm = bulk_task['output']['vbm']
+            gap = bulk_task['output']['bandgap']
+
+        parameters.update( {'mpid': mpid,
+                            "cbm": cbm, "vbm": vbm, "gap": gap} )
+
+        return parameters
+
+    def load_defect_and_structure_data(self, defect_task, parameters):
+        """
+        This loads defect object from task_dict AND
+        loads initial and final structures (making sure their sites are indexed in an equivalent manner)
+
+        if can't confirm that indices are equivalent, then initial_defect_structure = None
+
+        :param defect_task:
+        :param parameters:
+        :return:
+        """
+
+        if type(defect_task['transformations']) != dict:
+            defect_task['transformations'] = defect_task['transformations'].as_dict()
+
+        defect = defect_task['transformations']['history'][0]['defect']
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        defect = MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+        scaling_matrix = MontyDecoder().process_decoded(defect_task['transformations']['history'][0]['scaling_matrix'])
+
+        final_defect_structure = defect_task['output']['structure']
+        if type(final_defect_structure) != Structure:
+            final_defect_structure = Structure.from_dict(final_defect_structure)
+
+        # build initial_defect_structure from very first ionic relaxation step (ensures they are indexed the same]
+        initial_defect_structure = Structure.from_dict(
+            defect_task['calcs_reversed'][-1]['output']['ionic_steps'][0]['structure'])
+
+        # confirm structure matching
+        ids = defect.generate_defect_structure(scaling_matrix)
+        ids_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+                                   primitive_cell=False, scale=False, attempt_supercell=False,
+                                   allow_subset=False)
+        if not ids_sm.fit( ids, initial_defect_structure):
+            self.logger.error("Could not match initial-to-final structure. Will not load initial structure.")
+            initial_defect_structure = None
+
+        parameters.update({'final_defect_structure': final_defect_structure,
+                           'initial_defect_structure': initial_defect_structure,
+                           'scaling_matrix': scaling_matrix})
+
+        return defect, parameters
+
+    def load_defect_chg_correction_metadata(self, defect, defect_task, parameters):
+
+        # --> Load information for Freysoldt related parsing
+        if 'locpot' in defect_task['calcs_reversed'][0]['output'].keys():
+            deflpt = defect_task['calcs_reversed'][0]['output']['locpot']
+            axes = list(deflpt.keys())
+            axes.sort()
+            defect_planar_averages = [deflpt[ax] for ax in axes]
+            abc = parameters['initial_defect_structure'].lattice.abc
+            axis_grid = []
+            for ax in range(3):
+                num_pts = len(defect_planar_averages[ax])
+                axis_grid.append([i / num_pts * abc[ax] for i in range(num_pts)])
+            parameters.update({'axis_grid': axis_grid,
+                               'defect_planar_averages': defect_planar_averages})
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have locpot values for '
+                              'parsing Freysoldt correction'.format(defect_task['task_label'], defect_task['task_id']))
+
+
+        # --> Load information for Kumagai related parsing
+        if 'outcar' in defect_task['calcs_reversed'][0]['output'].keys() and \
+                parameters['initial_defect_structure']:
+
+            defoutcar = defect_task['calcs_reversed'][0]['output']['outcar']
+            defect_atomic_site_averages = defoutcar['electrostatic_potential']
+            bulk_sc_structure = parameters['bulk_sc_structure']
+            initial_defect_structure = parameters['initial_defect_structure']
+
+            bulksites = [site.frac_coords for site in bulk_sc_structure]
+            initsites = [site.frac_coords for site in initial_defect_structure]
+            distmatrix = initial_defect_structure.lattice.get_all_distances(bulksites, initsites) #first index of this list is bulk index
+            min_dist_with_index = [[min(distmatrix[bulk_index]), int(bulk_index),
+                                    int(distmatrix[bulk_index].argmin())] for bulk_index in range(len(distmatrix))] # list of [min dist, bulk ind, defect ind]
+
+            found_defect = False
+            site_matching_indices = []
+            poss_defect = []
+            if isinstance(defect, (Vacancy, Interstitial)):
+                for mindist, bulk_index, defect_index in min_dist_with_index:
+                    if mindist < self.ltol:
+                        site_matching_indices.append( [bulk_index, defect_index])
+                    elif isinstance(defect, Vacancy):
+                        poss_defect.append( [bulk_index, bulksites[ bulk_index][:]])
+
+                if isinstance(defect, Interstitial):
+                    poss_defect = [ [ind, fc[:]] for ind, fc in enumerate(initsites) \
+                                    if ind not in np.array(site_matching_indices)[:,1]]
+
+            elif isinstance(defect, Substitution):
+                for mindist, bulk_index, defect_index in min_dist_with_index:
+                    species_match = bulk_sc_structure[bulk_index].specie == \
+                                    initial_defect_structure[defect_index].specie
+                    if mindist < self.ltol and species_match:
+                        site_matching_indices.append( [bulk_index, defect_index])
+
+                    elif not species_match:
+                        poss_defect.append( [defect_index, initsites[ defect_index][:]])
+
+
+            if len(poss_defect) == 1:
+                found_defect = True
+                defect_index_sc_coords = poss_defect[0][0]
+                defect_frac_sc_coords = poss_defect[0][1]
+            else:
+                self.logger.error("Found {} possible defect sites when matching bulk and "
+                                  "defect structure".format(len(poss_defect)))
+
+
+            if len(set(np.array(site_matching_indices)[:, 0])) != len(set(np.array(site_matching_indices)[:, 1])):
+                self.logger.error("Error occured in site_matching routine. Double counting of site matching "
+                                  "occured:{}\nAbandoning Kumagai parsing.".format(site_matching_indices))
+                found_defect = False
+
+            # assuming Wigner-Seitz radius for sampling radius
+            wz = initial_defect_structure.lattice.get_wigner_seitz_cell()
+            dist = []
+            for facet in wz:
+                midpt = np.mean(np.array(facet), axis=0)
+                dist.append(np.linalg.norm(midpt))
+            sampling_radius = min(dist)
+
+            if found_defect:
+                parameters.update({'defect_atomic_site_averages': defect_atomic_site_averages,
+                                   'site_matching_indices': site_matching_indices,
+                                   'sampling_radius': sampling_radius,
+                                   'defect_frac_sc_coords': defect_frac_sc_coords,
+                                   'defect_index_sc_coords': defect_index_sc_coords})
+            else:
+                self.logger.error("Error in mapping procedure for bulk to initial defect structure.")
+
+        else:
+            self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have outcar values for '
+                              'parsing Kumagai'.format(defect_task['task_label'], defect_task['task_id']))
+
+        return parameters
+
+
+class DefectThermoBuilder(Builder):
+    def __init__(self,
+                 defects,
+                 defectthermo,
+                 query=None,
+                 ltol=0.2,
+                 stol=0.3,
+                 angle_tol=5,
+                 update_all=False,
+                 **kwargs):
+        """
+        Creates DefectEntry from vasp task docs
+
+        Args:
+            defects (Store): Store of defect entries
+            defectthermo (Store): Store of DefectPhaseDiagram documents
+            query (dict): dictionary to limit materials to be analyzed (note query is for defects Store)
+            ltol (float): StructureMatcher tuning parameter for matching tasks to materials
+            stol (float): StructureMatcher tuning parameter for matching tasks to materials
+            angle_tol (float): StructureMatcher tuning parameter for matching tasks to materials
+            update_all (bool): Whether to consider all task ids from defects store.
+                Default is False (will not re-consider task-ids which have been considered previously.
+        """
+
+        self.defects = defects
+        self.defectthermo = defectthermo
+        self.query = query if query else {}
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.update_all = update_all
+        super().__init__(sources=[defects], targets=[defectthermo], **kwargs)
+
+    def get_items(self):
+        self.logger.info("DefectThermo Builder Started")
+
+        # Save timestamp for update operation
+        self.time_stamp = datetime.utcnow()
+
+        #get all new Defect Entries since last time DefectThermo was updated...
+        q = dict(self.query)
+        self.logger.debug('query is initially: {}'.format( q))
+        if not self.update_all:
+            # if not update_all then grab entry_ids of defects that have been analyzed already...
+            prev_dpd = list(self.defectthermo.query(properties=['metadata.all_entry_ids_considered']))
+            self.logger.debug('Found {} previous dpd objects'.format( len(prev_dpd)))
+
+            if "entry_id" not in q.keys():
+                q["entry_id"] = {"$nin": []}
+            elif "$nin" not in q["entry_id"].keys():
+                q["entry_id"]["$nin"] = []
+            for dpd in prev_dpd:
+                q["entry_id"]["$nin"].extend( list( dpd['metadata']['all_entry_ids_considered']))
+
+            self.logger.debug('query after removing previously considered entry_ids is: {}'.format( q))
+
+        # q.update(self.defects.lu_filter(self.defectthermo))  #TODO: does this work?? / is it needed?
+
+        # restricted amount of defect info for PD, so not an overwhelming database query
+        entry_keys_needed_for_thermo = ['defect', 'parameters.task_level_metadata', 'parameters.last_updated',
+                                        'task_id', 'entry_id', '@module', '@class',
+                                        'uncorrected_energy', 'corrections', 'parameters.dielectric',
+                                        'parameters.cbm', 'parameters.vbm', 'parameters.gap',
+                                        'parameters.hybrid_cbm', 'parameters.hybrid_vbm',
+                                        'parameters.hybrid_gap', 'parameters.potalign',
+                                        'parameters.freysoldt_meta',
+                                        'parameters.kumagai_meta', 'parameters.is_compatible',
+                                        'parameters.delocalization_meta', 'parameters.phasediagram_meta']
+        defect_entries = list(self.defects.query(criteria=q,
+                                                        properties=entry_keys_needed_for_thermo))
+        thermo_entries = list(self.defectthermo.query(properties=['bulk_chemsys', 'bulk_prim_struct',
+                                                                  'run_metadata', 'entry_id']))
+        self.logger.info("Found {} new defect entries to consider".format( len(defect_entries)))
+
+        #group defect entries based on bulk types and task level metadata info
+
+        sm = StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
+                              primitive_cell=True, scale=False, attempt_supercell=True,
+                                allow_subset=False)
+        grpd_entry_list = []
+        for entry_dict in defect_entries:
+            #get bulk chemsys and struct
+            base_bulk_struct = Structure.from_dict(entry_dict['defect']['structure'])
+            base_bulk_struct = base_bulk_struct.get_primitive_structure()
+            bulk_chemsys = "-".join(sorted((base_bulk_struct.symbol_set)))
+
+            #get run metadata info
+            entry_dict_md = entry_dict['parameters']['task_level_metadata']
+            run_metadata = entry_dict_md['incar_calctype_summary'].copy()
+            if 'potcar_summary' in entry_dict_md:
+                run_metadata.update( entry_dict_md['potcar_summary'].copy())
+                #only want to filter based on bulk POTCAR
+                pspec, plab = set(), set()
+                for symbol, full_potcar_label in zip(run_metadata['pot_labels'],
+                                                     run_metadata['pot_spec']):
+                    red_sym = symbol.split('_')[0]
+                    if red_sym in base_bulk_struct.symbol_set:
+                        pspec.add( symbol)
+                        plab.add( full_potcar_label)
+
+                run_metadata['pot_spec'] = "-".join(sorted(set(pspec))) #had to switch to this approach for proper dict comparison later on
+                run_metadata['pot_labels'] = "-".join(sorted(set(plab)))
+
+            # see if defect_entry matches an grouped entry list item already in progress
+            # does this by matching bulk structure and run metadata
+            matched = False
+            for grp_ind, grpd_entry in enumerate(grpd_entry_list):
+                if grpd_entry[0] == bulk_chemsys and grpd_entry[1] == run_metadata:
+                    if sm.fit( base_bulk_struct, grpd_entry[2]):
+                        grpd_entry[3].append( entry_dict.copy())
+                        matched = True
+                        break
+                    else:
+                        continue
+                else:
+                    continue
+
+            if not matched: #have not logged yet, see if previous thermo phase diagram was created for this system
+                for t_ent in thermo_entries:
+                    if t_ent['bulk_chemsys'] == bulk_chemsys and t_ent['run_metadata'] == run_metadata:
+                        if sm.fit(base_bulk_struct, Structure.from_dict(t_ent['bulk_prim_struct'])):
+                            if not self.update_all:
+                                #if not wanting to update everything, then get previous entries from thermo database
+                                old_entry_set = list(self.defectthermo.query( {'entry_id': t_ent['entry_id']},
+                                                                               properties=['entries']))[0]['entries']
+                                old_entries_list = [entry.as_dict() if type(entry) != dict else entry for entry in
+                                                    old_entry_set]
+                            else:
+                                old_entries_list = []
+
+                            old_entries_list.append( entry_dict.copy())
+                            grpd_entry_list.append( [bulk_chemsys, run_metadata.copy(),  base_bulk_struct.copy(),
+                                                     old_entries_list, t_ent['entry_id']])
+                            matched = True
+                            break
+                        else:
+                            continue
+                    else:
+                        continue
+
+            if not matched:
+                # create new thermo phase diagram for this system,
+                # entry_id will be generated later on
+                grpd_entry_list.append( [bulk_chemsys, run_metadata.copy(), base_bulk_struct.copy(),
+                                         [entry_dict.copy()], 'None'])
+
+
+        for new_defects_for_thermo_entry in grpd_entry_list:
+            yield new_defects_for_thermo_entry
+
+
+    def process_item(self, items):
+        #group defect entries into groups of same defect type (charge included)
+        distinct_entries_full = [] #for storing full defect_dict
+        distinct_entries = {} #for storing defect object
+        bulk_chemsys, run_metadata, bulk_prim_struct, entrylist, entry_id = items
+        self.logger.debug("Processing bulk_chemsys {}".format(bulk_chemsys))
+
+        needed_entry_keys = ['@module', '@class', 'defect', 'uncorrected_energy', 'corrections',
+                             'parameters', 'entry_id', 'task_id']
+        pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False)
+        for entry_dict in entrylist:
+            entry = DefectEntry.from_dict({k:v for k,v in entry_dict.items() if k in needed_entry_keys})
+            matched = False
+            for grpind, grpdefect in distinct_entries.items():
+                if pdc.are_equal( entry.defect, grpdefect.defect):
+                    matched = True
+                    break
+
+            if matched:
+                distinct_entries_full[grpind].append( entry_dict.copy())
+            else:
+                distinct_entries_full.append( [entry_dict.copy()])
+                nxt_ind = max( distinct_entries.keys()) + 1 if len(distinct_entries.keys()) else 0
+                distinct_entries[nxt_ind] = entry.copy()
+
+        #now sort through entries and pick one that has been updated last (but track all previous entry_ids)
+        all_entry_ids_considered, entries = [], []
+        for ident_entries in distinct_entries_full:
+            all_entry_ids_considered.extend([ent['entry_id'] for ent in ident_entries])
+            # sort based on which was done most recently
+            lu_list = []
+            for ent_ind, ent in enumerate(ident_entries):
+                try:
+                    lu = ent['parameters']['task_level_metadata']['defect_task_last_updated']
+                except:
+                    lu = ent['parameters']['last_updated']
+                if isinstance( lu, dict): #deal with case when datetime objects were not properly loaded
+                    lu = MontyDecoder().process_decoded( lu)
+                lu_list.append( [lu, ent_ind])
+            try:
+                lu_list.sort(reverse=True)
+            except:
+                for_print_lu_list = [[lu, ident_entries[ent_ind]['entry_id']] for lu, ent_ind in lu_list]
+                raise ValueError("Error with last_updated sorting of list:\n{}".format( for_print_lu_list))
+            recent_entry_dict = ident_entries[ lu_list[0][1]]
+
+            new_entry = DefectEntry.from_dict( {k:v for k,v in recent_entry_dict.items() if k in needed_entry_keys})
+            entries.append( new_entry.copy())
+
+        # get vbm and bandgap; also verify all vbms and bandgaps are the same
+        vbm = entries[0].parameters['vbm']
+        band_gap = entries[0].parameters['gap']
+        for ent in entries:
+            if vbm != ent.parameters['vbm']:
+                self.logger.error("Logging vbm = {} (retrieved from {}, task-id {}) but {} "
+                                  "(task-id {}) has vbm = {}. Be careful with this defectphasediagram "
+                                  "if these are very different".format( vbm, entries[0].name, entries[0].entry_id,
+                                                                        ent.name, ent.entry_id, ent.parameters['vbm']))
+            if band_gap != ent.parameters['gap']:
+                self.logger.error("Logging gap = {} (retrieved from {}, task-id {}) but {} "
+                                  "(task-id {}) has gap = {}. Be careful with this defectphasediagram "
+                                  "if these are very different".format( band_gap, entries[0].name, entries[0].entry_id,
+                                                                        ent.name, ent.entry_id, ent.parameters['gap']))
+
+        defect_phase_diagram = DefectPhaseDiagram( entries, vbm, band_gap, filter_compatible=False,
+                                                   metadata={'all_entry_ids_considered': all_entry_ids_considered})
+        defect_phase_diagram_as_dict = defect_phase_diagram.as_dict()
+        defect_phase_diagram_as_dict.update( {'bulk_chemsys': bulk_chemsys, 'bulk_prim_struct': bulk_prim_struct.as_dict(),
+                                'run_metadata': run_metadata, 'entry_id': entry_id, 'last_updated': datetime.utcnow()})
+
+        return defect_phase_diagram_as_dict
+
+    def update_targets(self, items):
+        list_entry_ids = list(self.defectthermo.distinct('entry_id'))
+        if len(list_entry_ids):
+            next_entry_id = max(list_entry_ids) + 1
+        else:
+            next_entry_id = 1
+        for item in items:
+            if item['entry_id'] == 'None':
+                item['entry_id'] = next_entry_id
+                next_entry_id += 1
+
+        self.logger.info("Updating {} DefectThermo documents".format(len(items)))
+
+        self.defectthermo.update(items, update_lu=True, key='entry_id')
diff --git a/emmet-core/emmet/core/cp2k/calc_types/__init__.py b/emmet-core/emmet/core/cp2k/calc_types/__init__.py
new file mode 100644
index 0000000000..9503f160d7
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/__init__.py
@@ -0,0 +1,9 @@
+from pathlib import Path
+
+try:
+    import emmet.core.cp2k.calc_types.enums
+except ImportError:
+    import emmet.core.cp2k.calc_types.generate
+
+from emmet.core.cp2k.calc_types.enums import RunType, TaskType, CalcType
+from emmet.core.cp2k.calc_types.utils import run_type, task_type, calc_type
diff --git a/emmet-core/emmet/core/cp2k/calc_types/enums.py b/emmet-core/emmet/core/cp2k/calc_types/enums.py
new file mode 100644
index 0000000000..8f82f8fcd7
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/enums.py
@@ -0,0 +1,95 @@
+"""
+Autogenerated Enums for CP2K RunType, TaskType, and CalcType
+Do not edit this by hand. Edit generate.py or run_types.yaml instead
+"""
+from emmet.core.utils import ValueEnum
+
+
+class RunType(ValueEnum):
+    """ CP2K calculation run types """
+
+    PBE = "PBE"
+    GGA = "GGA"
+    R2SCAN = "R2SCAN"
+    METAGGA = "METAGGA"
+    HYBRID = "HYBRID"
+    HSE06 = "HSE06"
+    PBE0 = "PBE0"
+    RSH = "RSH"
+    PBE_U = "PBE+U"
+    GGA_U = "GGA+U"
+    R2SCAN_U = "R2SCAN+U"
+    METAGGA_U = "METAGGA+U"
+    HYBRID_U = "HYBRID+U"
+    HSE06_U = "HSE06+U"
+    PBE0_U = "PBE0+U"
+    RSH_U = "RSH+U"
+    LDA = "LDA"
+    LDA_U = "LDA+U"
+
+
+class TaskType(ValueEnum):
+    """ CP2K calculation task types """
+
+    Static = "Static"
+    Structure_Optimization = "Structure Optimization"
+    Constrained_Structure_Optimization = "Constrained Structure Optimization"
+
+
+class CalcType(ValueEnum):
+    """ CP2K calculation types """
+
+    PBE_Static = "PBE Static"
+    PBE_Structure_Optimization = "PBE Structure Optimization"
+    PBE_Constrained_Structure_Optimization = "PBE Constrained Structure Optimization"
+    GGA_Static = "GGA Static"
+    GGA_Structure_Optimization = "GGA Structure Optimization"
+    GGA_Constrained_Structure_Optimization = "GGA Constrained Structure Optimization"
+    R2SCAN_Static = "R2SCAN Static"
+    R2SCAN_Structure_Optimization = "R2SCAN Structure Optimization"
+    R2SCAN_Constrained_Structure_Optimization = "R2SCAN Constrained Structure Optimization"
+    METAGGA_Static = "METAGGA Static"
+    METAGGA_Structure_Optimization = "METAGGA Structure Optimization"
+    METAGGA_Constrained_Structure_Optimization = "METAGGA Constrained Structure Optimization"
+    HYBRID_Static = "HYBRID Static"
+    HYBRID_Structure_Optimization = "HYBRID Structure Optimization"
+    HYBRID_Constrained_Structure_Optimization = "HYBRID Constrained Structure Optimization"
+    HSE06_Static = "HSE06 Static"
+    HSE06_Structure_Optimization = "HSE06 Structure Optimization"
+    HSE06_Constrained_Structure_Optimization = "HSE06 Constrained Structure Optimization"
+    PBE0_Static = "PBE0 Static"
+    PBE0_Structure_Optimization = "PBE0 Structure Optimization"
+    PBE0_Constrained_Structure_Optimization = "PBE0 Constrained Structure Optimization"
+    RSH_Static = "RSH Static"
+    RSH_Structure_Optimization = "RSH Structure Optimization"
+    RSH_Constrained_Structure_Optimization = "RSH Constrained Structure Optimization"
+    PBE_U_Static = "PBE+U Static"
+    PBE_U_Structure_Optimization = "PBE+U Structure Optimization"
+    PBE_U_Constrained_Structure_Optimization = "PBE+U Constrained Structure Optimization"
+    GGA_U_Static = "GGA+U Static"
+    GGA_U_Structure_Optimization = "GGA+U Structure Optimization"
+    GGA_U_Constrained_Structure_Optimization = "GGA+U Constrained Structure Optimization"
+    R2SCAN_U_Static = "R2SCAN+U Static"
+    R2SCAN_U_Structure_Optimization = "R2SCAN+U Structure Optimization"
+    R2SCAN_U_Constrained_Structure_Optimization = "R2SCAN+U Constrained Structure Optimization"
+    METAGGA_U_Static = "METAGGA+U Static"
+    METAGGA_U_Structure_Optimization = "METAGGA+U Structure Optimization"
+    METAGGA_U_Constrained_Structure_Optimization = "METAGGA+U Constrained Structure Optimization"
+    HYBRID_U_Static = "HYBRID+U Static"
+    HYBRID_U_Structure_Optimization = "HYBRID+U Structure Optimization"
+    HYBRID_U_Constrained_Structure_Optimization = "HYBRID+U Constrained Structure Optimization"
+    HSE06_U_Static = "HSE06+U Static"
+    HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization"
+    HSE06_U_Constrained_Structure_Optimization = "HSE06+U Constrained Structure Optimization"
+    PBE0_U_Static = "PBE0+U Static"
+    PBE0_U_Structure_Optimization = "PBE0+U Structure Optimization"
+    PBE0_U_Constrained_Structure_Optimization = "PBE0+U Constrained Structure Optimization"
+    RSH_U_Static = "RSH+U Static"
+    RSH_U_Structure_Optimization = "RSH+U Structure Optimization"
+    RSH_U_Constrained_Structure_Optimization = "RSH+U Constrained Structure Optimization"
+    LDA_Static = "LDA Static"
+    LDA_Structure_Optimization = "LDA Structure Optimization"
+    LDA_Constrained_Structure_Optimization = "LDA Constrained Structure Optimization"
+    LDA_U_Static = "LDA+U Static"
+    LDA_U_Structure_Optimization = "LDA+U Structure Optimization"
+    LDA_U_Constrained_Structure_Optimization = "LDA+U Constrained Structure Optimization"
diff --git a/emmet-core/emmet/core/cp2k/calc_types/generate.py b/emmet-core/emmet/core/cp2k/calc_types/generate.py
new file mode 100644
index 0000000000..96b7f6f2cd
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/generate.py
@@ -0,0 +1,79 @@
+""" Module to define various calculation types as Enums for CP2K """
+from itertools import product
+from pathlib import Path
+
+from monty.serialization import loadfn
+
+_RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve()))
+_TASK_TYPES = [
+    "Static",
+    "Structure Optimization",
+    "Constrained Structure Optimization"
+]
+
+_RUN_TYPES = (
+    [
+        rt
+        for functional_class in _RUN_TYPE_DATA
+        for rt in _RUN_TYPE_DATA[functional_class]
+    ]
+    + [
+        f"{rt}+U"
+        for functional_class in _RUN_TYPE_DATA
+        for rt in _RUN_TYPE_DATA[functional_class]
+    ]
+    + ["LDA", "LDA+U"]
+)
+
+
+def get_enum_source(enum_name, doc, items):
+    header = f"""
+class {enum_name}(ValueEnum):
+    \"\"\" {doc} \"\"\"\n
+"""
+    items = [f'    {const} = "{val}"' for const, val in items.items()]
+
+    return header + "\n".join(items)
+
+
+run_type_enum = get_enum_source(
+    "RunType",
+    "CP2K calculation run types",
+    dict(
+        {
+            "_".join(rt.split()).replace("+", "_").replace("-", "_"): rt
+            for rt in _RUN_TYPES
+        }
+    ),
+)
+task_type_enum = get_enum_source(
+    "TaskType",
+    "CP2K calculation task types",
+    {"_".join(tt.split()): tt for tt in _TASK_TYPES},
+)
+calc_type_enum = get_enum_source(
+    "CalcType",
+    "CP2K calculation types",
+    {
+        f"{'_'.join(rt.split()).replace('+','_').replace('-','_')}_{'_'.join(tt.split())}": f"{rt} {tt}"
+        for rt, tt in product(_RUN_TYPES, _TASK_TYPES)
+    },
+)
+
+
+with open(Path(__file__).parent / "enums.py", "w") as f:
+    f.write(
+        """\"\"\"
+Autogenerated Enums for CP2K RunType, TaskType, and CalcType
+Do not edit this by hand. Edit generate.py or run_types.yaml instead
+\"\"\"
+from emmet.core.utils import ValueEnum
+
+"""
+    )
+    f.write(run_type_enum)
+    f.write("\n\n")
+    f.write(task_type_enum)
+    f.write("\n\n")
+    f.write(calc_type_enum)
+    f.write("\n")
diff --git a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
new file mode 100644
index 0000000000..8d52d481fb
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml
@@ -0,0 +1,25 @@
+GGA:
+  PBE:
+    FUNCTIONAL: PBE
+    FRACTION: 0
+  GGA:
+    GGA: --
+METAGGA:
+  R2SCAN:
+    FUNCTIONAL:
+      - MGGA_C_R2SCAN
+      - MGGA_X_R2SCAN
+    FRACTION: 0
+  METAGGA:
+    METAGGA: --
+HYBRID:
+  HYBRID:
+    HYBRID: --
+  HSE06:
+    FRACTION: 0.25
+    INTERACTION_POTENTIAL: SHORTRANGE
+  PBE0:
+    FRACTION: 0.25
+    INTERACTION_POTENTIAL: TRUNCATED
+  RSH:
+    INTERACTION_POTENTIAL: TRUNCATED_MIX_CL
\ No newline at end of file
diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py
new file mode 100644
index 0000000000..99a1112264
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py
@@ -0,0 +1,133 @@
+""" Module to define various calculation types as Enums for CP2K """
+from pathlib import Path
+from typing import Dict
+from monty.serialization import loadfn
+from typing import Iterable
+
+from pymatgen.io.cp2k.inputs import Cp2kInput
+
+from emmet.core.cp2k.calc_types import RunType, TaskType, CalcType
+
+_RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve()))
+
+
+def run_type(dft: Dict) -> RunType:
+    """
+    Determines the run_type from the CP2K input dict
+    This is adapted from pymatgen to be far less unstable
+
+    Args:
+        dft: dictionary of dft parameters (standard from task doc)
+    """
+
+    def _variant_equal(v1, v2) -> bool:
+        """
+        helper function to deal with strings
+        """
+        if isinstance(v1, str) and isinstance(v2, str):
+            return v1.strip().upper() == v2.strip().upper()
+        elif isinstance(v1, Iterable) and isinstance(v2, Iterable):
+            return set(v1) == set(v2)
+        else:
+            return v1 == v2
+
+    is_hubbard = '+U' if dft.get('dft_plus_u') else ''
+
+    parameters = {
+        'FUNCTIONAL': dft.get('functional'),
+        'INTERACTION_POTENTIAL': dft.get('hfx', {}).get('Interaction_Potential'),
+        'FRACTION': dft.get('hfx', {}).get('FRACTION', 0)
+    }
+
+    # Standard calc will only have one functional. If there are multiple functionals
+    # used this is either a hybrid calc or a non-generic mixed calculation.
+    if len(parameters['FUNCTIONAL']) == 1:
+        parameters['FUNCTIONAL'] = parameters['FUNCTIONAL'][0]
+
+    # If all parameters in for the functional_class.special_type located in
+    # run_types.yaml are met, then that is the run type.
+    for functional_class in _RUN_TYPE_DATA:
+        for special_type, params in _RUN_TYPE_DATA[functional_class].items():
+            if all(
+                [
+                    _variant_equal(parameters.get(param, True), value)
+                    for param, value in params.items()
+                ]
+            ):
+                return RunType(f"{functional_class}{is_hubbard}")
+
+    # TODO elegant way to handle this? 
+    # This is a hack to get the non-standard hybrids to work
+    if parameters.get('FRACTION'):
+        return RunType(f"HYBRID{is_hubbard}")
+
+def task_type(
+    inputs: Dict
+) -> TaskType:
+    """
+    Determines the task type
+
+    Args:
+        inputs
+    """
+
+    calc_type = []
+    cp2k_run_type = inputs.get('cp2k_global').get('Run_type', '')
+
+    if cp2k_run_type.upper() in ['ENERGY', 'ENERGY_FORCE', 'WAVEFUNCTION_OPTIMIZATION', 'WFN_OPT']:
+        calc_type.append('Static')
+
+    elif cp2k_run_type.upper() in ['GEO_OPT', 'GEOMETRY_OPTIMIZATION', 'CELL_OPT']:
+        calc_type.append('Structure Optimization')
+
+    elif cp2k_run_type.upper() in ['BAND']:
+        calc_type.append('Band')
+
+    elif cp2k_run_type.upper() in ['MOLECULAR_DYNAMICS', 'MD']:
+        calc_type.append('Molecular Dynamics')
+
+    elif cp2k_run_type.upper() in ['MONTE_CARLO', 'MC', 'TMC', 'TAMC']:
+        calc_type.append('Monte Carlo')
+
+    elif cp2k_run_type.upper() in ['LINEAR_RESPONSE', 'LR']:
+        calc_type.append('Linear Response')
+
+    elif cp2k_run_type.upper() in ['VIBRATIONAL_ANALYSIS', 'NORMAL_MODES']:
+        calc_type.append('Vibrational Analysis')
+
+    elif cp2k_run_type.upper() in ['ELECTRONIC_SPECTRA', 'SPECTRA']:
+        calc_type.append('Electronic Spectra')
+
+    elif cp2k_run_type.upper() in ['NEGF']:
+        calc_type.append('Non-equilibrium Green\'s Function')
+
+    elif cp2k_run_type.upper() in ['PINT', 'DRIVER']:
+        calc_type.append('Path Integral')
+
+    elif cp2k_run_type.upper() in ['RT_PROPAGATION', 'EHRENFEST_DYN']:
+        calc_type.append('Real-time propagation')
+
+    elif cp2k_run_type.upper() in ['BSSE']:
+        calc_type.append('Base set superposition error')
+
+    elif cp2k_run_type.upper() in ['DEBUG']:
+        calc_type.append('Debug analysis')
+
+    elif cp2k_run_type.upper() in ['NONE']:
+        calc_type.append('None')
+
+    return TaskType(" ".join(calc_type))
+
+def calc_type(
+    inputs: Dict,
+) -> CalcType:
+    """
+    Determines the calc type
+
+    Args:
+        inputs: dict from InputSummary containing necessary data for determining 
+            calc type
+    """
+    rt = run_type(inputs.get("dft")).value
+    tt = task_type(inputs).value
+    return CalcType(f"{rt} {tt}")
diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py
new file mode 100644
index 0000000000..46f633e14c
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/material.py
@@ -0,0 +1,235 @@
+""" Core definition of a Materials Document """
+from typing import List, Mapping, Sequence, Tuple
+
+from pydantic import Field
+from pymatgen.analysis.structure_matcher import StructureMatcher
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+
+from emmet.core.settings import EmmetSettings
+from emmet.core.material import MaterialsDoc as CoreMaterialsDoc
+from emmet.core.material import PropertyOrigin as PropertyOrigin
+from emmet.core.structure import StructureMetadata
+from emmet.core.cp2k.calc_types import CalcType, RunType, TaskType
+from emmet.core.cp2k.task import TaskDocument
+from emmet.builders.cp2k.utils import get_mpid
+
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
+SETTINGS = EmmetSettings()
+
+class MaterialsDoc(CoreMaterialsDoc, StructureMetadata):
+
+    calc_types: Mapping[str, CalcType] = Field(  # type: ignore
+        None,
+        description="Calculation types for all the calculations that make up this material",
+    )
+    task_types: Mapping[str, TaskType] = Field(
+        None,
+        description="Task types for all the calculations that make up this material",
+    )
+    run_types: Mapping[str, RunType] = Field(
+        None,
+        description="Run types for all the calculations that make up this material",
+    )
+
+    origins: Sequence[PropertyOrigin] = Field(
+        None, description="Mappingionary for tracking the provenance of properties"
+    )
+
+    entries: Mapping[RunType, ComputedStructureEntry] = Field(
+        None, description="Dictionary for tracking entries for CP2K calculations"
+    )
+
+    @classmethod
+    def from_tasks(
+        cls,
+        task_group: List[TaskDocument],
+        quality_scores=SETTINGS.CP2K_QUALITY_SCORES,
+        special_tags=SETTINGS.CP2K_SPECIAL_TAGS,
+    ) -> "MaterialsDoc":
+        """
+        Converts a group of tasks into one material
+        """
+
+        # Metadata
+        last_updated = max(task.last_updated for task in task_group)
+        created_at = min(task.completed_at for task in task_group)
+        task_ids = list({task.task_id for task in task_group})
+
+        deprecated_tasks = list(
+            {task.task_id for task in task_group if not task.is_valid}
+        )
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        # TODO: Fix the type checking by hardcoding the Enums?
+        structure_optimizations = [
+            task
+            for task in task_group
+            if task.task_type == TaskType.Structure_Optimization  # type: ignore
+        ]
+        statics = [task for task in task_group if task.task_type == TaskType.Static]  # type: ignore
+
+        # Material ID
+        possible_mat_ids = [task.task_id for task in structure_optimizations + statics]
+        material_id = min(possible_mat_ids)
+
+        def _structure_eval(task: TaskDocument):
+            """
+            Helper function to order structures optimization and statics calcs by
+            - Functional Type
+            - Spin polarization
+            - Special Tags
+            - Energy
+            """
+
+            task_run_type = task.run_type
+
+            is_valid = task.task_id in deprecated_tasks
+
+            return (
+                -1 * is_valid,
+                -1 * quality_scores.get(task_run_type.value, 0),
+                -1 * (2 if task.input.dft.get('UKS') else 1),
+                -1 * task.nsites  # TODO Make k-point density
+                -1 * sum(task.input.get(tag, False) for tag in special_tags),  # TODO what is this?
+                task.output.energy_per_atom,
+            )
+
+        structure_calcs = structure_optimizations + statics
+        best_structure_calc = sorted(structure_calcs, key=_structure_eval)[0]
+        structure = best_structure_calc.output.structure
+
+        # Initial Structures
+        initial_structures = [task.input.structure for task in task_group]
+        sm = StructureMatcher(
+            ltol=0.1, stol=0.1, angle_tol=0.1, scale=False, attempt_supercell=False
+        )
+        initial_structures = [
+            group[0] for group in sm.group_structures(initial_structures)
+        ]
+
+        # Deprecated
+        deprecated = all(
+            task.task_id in deprecated_tasks for task in structure_optimizations
+        )
+
+        # Origins
+        origins = [
+            PropertyOrigin(
+                name="structure",
+                task_id=best_structure_calc.task_id,
+                last_updated=best_structure_calc.last_updated,
+            )
+        ]
+
+        # entries
+        entries = {}
+        all_run_types = set(run_types.values())
+        for rt in all_run_types:
+            relevant_calcs = sorted(
+                [doc for doc in structure_calcs if doc.run_type == rt],
+                key=_structure_eval,
+            )
+            if len(relevant_calcs) > 0:
+                best_task_doc = relevant_calcs[0]
+                entry = best_task_doc.entry
+                entry.data["task_id"] = entry.entry_id
+                entry.entry_id = material_id
+
+                # TODO Standard material doc doesn't have this
+                best_task_doc.output.structure.remove_oxidation_states()
+                entries[rt] = ComputedStructureEntry(
+                    structure=best_task_doc.output.structure,
+                    energy=entry.energy,
+                    correction=entry.correction,
+                    energy_adjustments=entry.energy_adjustments,
+                    parameters=entry.parameters,
+                    data=entry.data,
+                    entry_id=entry.entry_id
+                )
+
+        # Warnings
+        # TODO: What warning should we process?
+
+        return cls.from_structure(
+            structure=structure,
+            material_id=material_id,
+            last_updated=last_updated,
+            created_at=created_at,
+            task_ids=task_ids,
+            calc_types=calc_types,
+            run_types=run_types,
+            task_types=task_types,
+            initial_structures=initial_structures,
+            deprecated=deprecated,
+            deprecated_tasks=deprecated_tasks,
+            origins=origins,
+            entries=entries,
+        )
+
+    @classmethod
+    def construct_deprecated_material(
+        cls, task_group: List[TaskDocument],
+    ) -> "MaterialsDoc":
+        """
+        Converts a group of tasks into a deprecated material
+
+        Args:
+            task_group: List of task document
+        """
+        if len(task_group) == 0:
+            raise Exception("Must have more than one task in the group.")
+
+        # Metadata
+        last_updated = max(task.last_updated for task in task_group)
+        created_at = min(task.completed_at for task in task_group)
+        task_ids = list({task.task_id for task in task_group})
+
+        deprecated_tasks = {task.task_id for task in task_group}
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        # Material ID
+        material_id = min([task.task_id for task in task_group])
+
+        # Choose any random structure for metadata
+        structure = SpacegroupAnalyzer(
+            task_group[0].output.structure, symprec=0.1
+        ).get_conventional_standard_structure()
+
+        # Deprecated
+        deprecated = True
+
+        return cls.from_structure(
+            structure=structure,
+            material_id=material_id,
+            last_updated=last_updated,
+            created_at=created_at,
+            task_ids=task_ids,
+            calc_types=calc_types,
+            run_types=run_types,
+            task_types=task_types,
+            deprecated=deprecated,
+            deprecated_tasks=deprecated_tasks,
+        )
+
+
+
+def ID_to_int(s_id: str) -> Tuple[str, int]:
+    """
+    Converts a string id to tuple
+    falls back to assuming ID is an Int if it can't process
+    Assumes string IDs are of form "[chars]-[int]" such as mp-234
+    """
+    if isinstance(s_id, str):
+        return (s_id.split("-")[0], int(str(s_id).split("-")[-1]))
+    elif isinstance(s_id, (int, float)):
+        return ("", s_id)
+    else:
+        return None
+
+
+
diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py
new file mode 100644
index 0000000000..9d569fbac0
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/task.py
@@ -0,0 +1,219 @@
+""" Core definition of a CP2K Task Document """
+from datetime import datetime
+from typing import Dict, List
+
+from pydantic import BaseModel, Field, validator
+from pymatgen.analysis.structure_analyzer import oxide_type
+from pymatgen.core import Structure
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+
+from emmet.core.base import EmmetBaseModel
+from emmet.core.structure import StructureMetadata
+from emmet.core.utils import ValueEnum
+from emmet.core.cp2k.calc_types import calc_type, run_type, task_type, CalcType, RunType, TaskType
+from emmet.core.math import Matrix3D, Vector3D
+from emmet.core.mpid import MPID
+
+
+class BaseTaskDocument(EmmetBaseModel):
+    """
+    Definition of base Task Document
+    """
+
+    calc_code: str = Field(description="The calculation code used to compute this task")
+    version: str = Field(None, description="The version of the calculation code")
+    dir_name: str = Field(None, description="The directory for this task")
+    task_id: MPID = Field(None, description="the Task ID For this document")
+
+    completed: bool = Field(False, description="Whether this calcuation completed")
+    completed_at: datetime = Field(
+        None, description="Timestamp for when this task was completed"
+    )
+    last_updated: datetime = Field(
+        default_factory=datetime.utcnow,
+        description="Timestamp for this task document was last updated",
+    )
+
+    tags: List[str] = Field([], description="Metadata tags for this task document")
+
+    warnings: List[str] = Field(
+        None, description="Any warnings related to this property"
+    )
+
+
+class Status(ValueEnum):
+    """
+    CP2K Calculation State
+    """
+
+    SUCESS = "successful"
+    FAILED = "failed"
+
+
+class InputSummary(BaseModel):
+    """
+    Summary of inputs for a CP2K calculation
+    """
+
+    structure: Structure = Field(None, description="The input structure object")
+
+    atomic_kind_info: Dict = Field(None, description="Description of parameters used for each atomic kind")
+
+    cp2k_input: Dict = Field(None, description="The cp2k input used for this task")
+
+    dft: Dict = Field(None, description="Description of the DFT parameters used in the last calc of this task")
+
+    cp2k_global: Dict = Field(None, description="CP2K global parameters used in the last calc of this task")
+
+    @validator('atomic_kind_info')
+    def remove_unnecessary(cls, atomic_kind_info):
+        for k in atomic_kind_info:
+            if 'total_pseudopotential_energy' in atomic_kind_info[k]:
+                del atomic_kind_info[k]['total_pseudopotential_energy']
+        return atomic_kind_info
+
+    @validator('dft')
+    def cleanup_dft(cls, dft):
+        if any(v.upper() == 'UKS' for v in dft.values()):
+            dft['UKS'] = True
+        return dft
+
+
+class OutputSummary(BaseModel):
+    """
+    Summary of the outputs for a CP2K calculation
+    """
+
+    structure: Structure = Field(None, description="The output structure object")
+    energy: float = Field(
+        None, description="The final total DFT energy for the last calculation"
+    )
+    energy_per_atom: float = Field(
+        None, description="The final DFT energy per atom for the last calculation"
+    )
+    bandgap: float = Field(None, description="The DFT band gap for the last calculation")
+    forces: List[Vector3D] = Field(
+        None, description="Forces on atoms from the last calculation"
+    )
+    stress: Matrix3D = Field(
+        None, description="Stress on the unit cell from the last calculation"
+    )
+
+
+class RunStatistics(BaseModel):
+    """
+    Summary of the Run statistics for a CP2K calculation
+    """
+
+    average_memory: float = Field(None, description="The average memory used in kb")
+    max_memory: float = Field(None, description="The maximum memory used in kb")
+    elapsed_time: float = Field(None, description="The real time elapsed in seconds")
+    system_time: float = Field(None, description="The system CPU time in seconds")
+    user_time: float = Field(
+        None, description="The user CPU time spent by CP2K in seconds"
+    )
+    total_time: float = Field(
+        None, description="The total CPU time for this calculation"
+    )
+    cores: int = Field(None, description="The number of cores used by CP2K")
+
+
+class TaskDocument(BaseTaskDocument, StructureMetadata):
+    """
+    Definition of CP2K Task Document
+    """
+
+    calc_code = "CP2K"
+    dir_name: str = Field(None, description="The directory for this CP2K task")
+    run_stats: RunStatistics = Field(
+        None,
+        description="Summary of runtime statistics for each calculation in this task",
+    )
+    completed_at: datetime = Field(
+        None, description="Timestamp for when this task was completed"
+    )
+    last_updated: datetime = Field(
+        None, description="Timestamp for this task document was last updated"
+    )
+
+    is_valid: bool = Field(
+        True, description="Whether this task document passed validation or not"
+    )
+
+    input: InputSummary = Field(None)
+
+    output: OutputSummary = Field(None)
+
+    state: Status = Field(None, description="State of this calculation")
+
+    orig_inputs: Dict = Field(
+        None, description="Summary of the original CP2K inputs"
+    )
+    task_id: int = Field(None, description="the Task ID For this document")
+    tags: List[str] = Field([], description="Metadata tags for this task document")
+
+    run_type: RunType = Field(None)
+    task_type: TaskType = Field(None)
+    calc_type: CalcType = Field(None)
+
+    @validator('input', pre=True, always=True)
+    def rename_global(cls, input):
+        if 'cp2k_global' not in input:
+            input['cp2k_global'] = input.pop('global')
+        return input
+
+    @validator('run_type', pre=True, always=True)
+    def find_run_type(cls, v, values):
+        return run_type(values.get('input').dft)
+
+    @validator('task_type', pre=True, always=True)
+    def find_task_type(cls, v, values):
+        return task_type(values.get('input').dict())
+
+    @validator('calc_type', pre=True, always=True)
+    def find_calc_type(cls, v, values):
+        return calc_type(values.get("input").dict())
+
+    @property
+    def entry(self):
+        """ Turns a Task Doc into a ComputedEntry"""
+        entry_dict = {
+            "correction": 0.0,
+            "entry_id": self.task_id,
+            "composition": self.output.structure.composition,
+            "energy": self.output.energy,
+            "parameters": {
+                "atomic_kind_info": self.input.atomic_kind_info,
+                # This is done to be compatible with MontyEncoder for the ComputedEntry
+                "run_type": str(self.run_type),
+            },
+            "data": {
+                "oxide_type": oxide_type(self.output.structure),
+                "last_updated": self.last_updated,
+            },
+        }
+
+        return ComputedEntry.from_dict(entry_dict)
+
+    @property
+    def structure_entry(self) -> ComputedStructureEntry:
+        """Turns a Task Doc into a ComputedStructureEntry"""
+        entry_dict = {
+            "correction": 0.0,
+            "entry_id": self.task_id,
+            "composition": self.output.structure.composition,
+            "energy": self.output.energy,
+            "parameters": {
+                "atomic_kind_info": self.input.atomic_kind_info,
+                # This is done to be compatible with MontyEncoder for the ComputedEntry
+                "run_type": str(self.run_type),
+            },
+            "data": {
+                "oxide_type": oxide_type(self.output.structure),
+                "last_updated": self.last_updated,
+            },
+            "structure": self.output.structure,
+        }
+
+        return ComputedStructureEntry.from_dict(entry_dict)
+
diff --git a/emmet-core/emmet/core/cp2k/validation.py b/emmet-core/emmet/core/cp2k/validation.py
new file mode 100644
index 0000000000..f727158cee
--- /dev/null
+++ b/emmet-core/emmet/core/cp2k/validation.py
@@ -0,0 +1,76 @@
+from datetime import datetime
+from typing import Dict, List, Union
+
+import numpy as np
+from pydantic import Field, PyObject
+
+from emmet.core.settings import EmmetSettings
+from emmet.core.base import EmmetBaseModel
+from emmet.core.mpid import MPID
+from emmet.core.utils import DocEnum
+from emmet.core.cp2k.task import TaskDocument
+
+SETTINGS = EmmetSettings()
+
+
+class DeprecationMessage(DocEnum):
+    MANUAL = "M", "manual deprecation"
+    FORCES = "C001", "Forces too large"
+    CONVERGENCE = "E001", "Calculation did not converge"
+
+class ValidationDoc(EmmetBaseModel):
+    """
+    Validation document for a VASP calculation
+    """
+    calc_code = "cp2k"
+
+    task_id: MPID = Field(..., description="The task_id for this validation document")
+    valid: bool = Field(False, description="Whether this task is valid or not")
+    last_updated: datetime = Field(
+        description="Last updated date for this document",
+        default_factory=datetime.utcnow,
+    )
+    reasons: List[Union[DeprecationMessage, str]] = Field(
+        None, description="List of deprecation tags detailing why this task isn't valid"
+    )
+    warnings: List[str] = Field(
+        [], description="List of potential warnings about this calculation"
+    )
+    data: Dict = Field(
+        description="Dictioary of data used to perform validation."
+        " Useful for post-mortem analysis"
+    )
+
+    class Config:
+        extra = "allow"
+
+    @classmethod
+    def from_task_doc(
+        cls,
+        task_doc: TaskDocument,
+    ) -> "ValidationDoc":
+        """
+
+        """
+
+        calc_type = task_doc.calc_type
+
+        # Force check
+        forces = np.array(task_doc.output.forces) > 0.05
+        if np.any(forces):
+            reasons = [DeprecationMessage.FORCES] 
+
+        reasons = []
+        data = {}
+        warnings = []
+
+        doc = ValidationDoc(
+            task_id=task_doc.task_id,
+            calc_type=calc_type,
+            run_type=task_doc.run_type,
+            valid=len(reasons) == 0,
+            reasons=reasons,
+            data=data,
+            warnings=warnings,
+        )
+        return doc
diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py
new file mode 100644
index 0000000000..d82e19959e
--- /dev/null
+++ b/emmet-core/emmet/core/defect.py
@@ -0,0 +1,539 @@
+""" Core definition for Defect property Document """
+from __future__ import annotations
+
+from datetime import datetime
+from tokenize import group
+from typing import ClassVar, Dict, Tuple, Mapping, List
+from pydantic import BaseModel, Field
+from pydantic import validator
+
+from monty.json import MontyDecoder
+from monty.tempfile import ScratchDir
+from itertools import groupby
+
+from pymatgen.core import Structure, Composition, Element
+from pymatgen.analysis.defects.core import DefectEntry, Defect
+from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility
+from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, BrouwerDiagram
+from pymatgen.electronic_structure.dos import CompleteDos
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from pymatgen.entries.computed_entries import CompositionEnergyAdjustment
+from pymatgen.entries.compatibility import MaterialsProject2020Compatibility
+from pymatgen.ext.matproj import MPRester
+
+from emmet.core.structure import StructureMetadata
+from emmet.core.mpid import MPID
+from emmet.core.thermo import ThermoDoc
+from emmet.core.cp2k.task import TaskDocument
+from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType
+from emmet.core.cp2k.material import MaterialsDoc
+from emmet.core.cp2k.calc_types.utils import run_type, task_type
+from emmet.builders.cp2k.utils import matcher
+
+from pymatgen.analysis.defects.corrections import FreysoldtCorrection2d
+from pymatgen.io.vasp.outputs import VolumetricData
+
+# TODO Update DefectDoc on defect entry level so you don't re-do uncessary corrections
+class DefectDoc(StructureMetadata):
+    """
+    A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
+
+    This document can contain an arbitrary number of defect entries, originating from
+    pairs (defect and bulk) of calculations. This document provides access to the "best"
+    calculation of each run_type.
+    """
+
+    # TODO VASP MatDocs dont need this, but I get error requiring arbitrary type
+    class Config:
+        arbitrary_types_allowed = True
+
+    property_name: ClassVar[str] = "defect"
+
+    defect: Defect = Field(None, description="Pymatgen defect object for this defect doc")
+
+    name: str = Field(None, description="Name of this defect as generated by the defect object")
+
+    material_id: MPID = Field(None, description="Unique material ID for the bulk material")
+
+    task_ids: List[int] = Field(
+        None, description="All task ids used in creating this defect doc."
+    )
+
+    calc_types: Mapping[int, CalcType] = Field(  # type: ignore
+        None,
+        description="Calculation types for all the calculations that make up this material",
+    )
+    task_types: Mapping[int, TaskType] = Field(
+        None,
+        description="Task types for all the calculations that make up this material",
+    )
+    run_types: Mapping[int, RunType] = Field(
+        None,
+        description="Run types for all the calculations that make up this material",
+    )
+
+    tasks: Mapping[RunType, Tuple[TaskDocument, TaskDocument]] = Field(
+        None, description="Task documents (defect task, bulk task) for the defect entry of RunType"
+    )
+
+    entries: Mapping[RunType, DefectEntry] = Field(
+        None, description="Dictionary for tracking entries for CP2K calculations"
+    )
+
+    last_updated: datetime = Field(
+        description="Timestamp for when this document was last updated",
+        default_factory=datetime.utcnow,
+    )
+
+    created_at: datetime = Field(
+        description="Timestamp for when this material document was first created",
+        default_factory=datetime.utcnow,
+    )
+
+    metadata: Dict = Field(description="Metadata for this defect")
+
+    # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
+    @validator("entries", pre=True)
+    def decode(cls, entries):
+        for e in entries:
+            if isinstance(entries[e], dict):
+                entries[e] = MontyDecoder().process_decoded({k: v for k, v in entries[e].items()})
+        return entries
+
+    def update(self, defect_task, bulk_task, dielectric, query='defect'):
+
+        defect_task_doc = TaskDocument(**defect_task)
+        bulk_task_doc = TaskDocument(**bulk_task)
+
+        rt = defect_task_doc.run_type
+        tt = defect_task_doc.task_type
+        ct = defect_task_doc.calc_type
+
+        # Metadata
+        last_updated = max(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
+        created_at = min(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
+
+        if defect_task_doc.task_id in self.task_ids:
+            return
+        else:
+            self.last_updated = last_updated
+            self.created_at = created_at
+            self.task_ids.append(defect_task_doc.task_id)
+            #self['deprecated_tasks'].update(defect_task.task_id)
+
+            def _run_type(x):
+                return run_type(x[0]['input']['dft']).value
+
+            def _compare(new, old):
+                # TODO return kpoint density
+                return new['nsites'] > old.nsites
+
+            if defect_task_doc.run_type not in self.tasks or _compare(defect_task, self.tasks[rt][0]):
+                self.run_types.update({defect_task_doc.task_id: rt})
+                self.task_types.update({defect_task_doc.task_id: tt})
+                self.calc_types.update({defect_task_doc.task_id: ct})
+                entry = self.__class__.get_defect_entry_from_tasks(
+                            defect_task=defect_task,
+                            bulk_task=bulk_task,
+                            dielectric=dielectric,
+                            query=query
+                        )
+                self.entries[rt] = entry
+                self.tasks[rt] = (defect_task_doc, bulk_task_doc)
+
+    def update_all(self, tasks, query='defect'):
+        for defect_task, bulk_task, dielectric in tasks:
+            self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
+
+    @classmethod
+    def from_tasks(cls, tasks: List, query='defect', material_id=None) -> "DefectDoc":
+        """
+        The standard way to create this document.
+
+        Args:
+            tasks: A list of defect,bulk task pairs which will be used to construct a
+                series of DefectEntry objects.
+            query: How to retrieve the defect object stored in the task.
+        """
+        task_group = [TaskDocument(**defect_task) for defect_task, bulk_task, dielectric in tasks]
+
+        # Metadata
+        last_updated = datetime.now() or max(task.last_updated for task in task_group)
+        created_at = datetime.now() or min(task.completed_at for task in task_group)
+        task_ids = {task.task_id for task in task_group}
+
+        deprecated_tasks = list(
+            {task.task_id for task in task_group if not task.is_valid}
+        )
+
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        def _run_type(x):
+            return run_type(x[0]['input']['dft']).value
+
+        def _task_type(x):
+            return task_type(x[0]['input']['dft']).value
+
+        def _sort(x):
+            # TODO return kpoint density, currently just does supercell size
+            return -x[0]['nsites'], x[0]['output']['energy']
+
+        entries = {}
+        final_tasks = {}
+        metadata = {}
+        for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
+            sorted_tasks = sorted(tasks_for_runtype, key=_sort)
+            ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks]
+            best_entry = ents[0]
+            best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
+            metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]}
+            best_defect_task, best_bulk_task = TaskDocument(**best_defect_task), TaskDocument(**best_bulk_task)
+            entries[best_defect_task.run_type] = best_entry
+            final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
+
+        data = {
+                'entries': entries,
+                'run_types': run_types,
+                'task_types': task_types,
+                'calc_types': calc_types,
+                'last_updated': last_updated,
+                'created_at': created_at,
+                'task_ids': task_ids,
+                'deprecated_tasks': deprecated_tasks,
+                'tasks': final_tasks,
+                'material_id': material_id if material_id else best_entry.parameters['material_id'],
+                'entry_ids': {rt: entries[rt].entry_id for rt in entries},
+                'defect': best_entry.defect,
+                'name': best_entry.defect.name,
+                'metadata': metadata,
+        }
+        prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure()
+        data.update(StructureMetadata.from_structure(prim).dict())
+        return cls(**data)
+
+    @classmethod
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
+        """
+        Extract a defect entry from a single pair (defect and bulk) of tasks.
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+            dielectric: Dielectric doc if the defect is charged. If not present, no dielectric
+                corrections will be performed, even if the defect is charged.
+            query: Mongo-style query to retrieve the defect object from the defect task
+        """
+        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        if dielectric:
+            parameters['dielectric'] = dielectric
+
+        defect_entry = DefectEntry(
+            cls.get_defect_from_task(query=query, task=defect_task),
+            uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'],
+            parameters=parameters,
+            entry_id=parameters['entry_id']
+        )
+        DefectCompatibility().process_entry(defect_entry, perform_corrections=True)
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
+
+        return defect_entry
+
+    @classmethod
+    def get_defect_from_task(cls, query, task):
+        """
+        Unpack a Mongo-style query and retrieve a defect object from a task.
+        """
+        defect = unpack(query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+
+    @classmethod
+    def get_parameters_from_tasks(cls, defect_task, bulk_task):
+        """
+        Get parameters necessary to create a defect entry from defect and bulk task dicts
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+        """
+
+        def get_init(x):
+            """
+            Helper function. If transformations were applied, get the structure post defect
+            transformation
+            """
+            if x.get('transformations', {}).get('history'):
+                for i, y in enumerate(x['transformations']['history']):
+                    if y['@class'] == 'DefectTransformation':
+                        if len(x['transformations']['history']) == 1:
+                            return Structure.from_dict(x['input']['structure'])
+                        else:
+                            return Structure.from_dict(x['transformations']['history'][i+1]['input_structure'])
+                return Structure.from_dict(x['transformations']['history'][0]['input_structure'])
+            return Structure.from_dict(x['input']['structure'])
+
+        init_defect_structure = get_init(defect_task)
+        init_bulk_structure = get_init(bulk_task)  # use init to avoid site_properties in matching
+
+        final_defect_structure = Structure.from_dict(defect_task['output']['structure'])
+        final_bulk_structure = Structure.from_dict(bulk_task['output']['structure'])
+
+        dfi, site_matching_indices = matcher(
+            init_bulk_structure, init_defect_structure,
+            final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure
+        )
+        defect_frac_sc_coords = final_defect_structure[dfi].frac_coords
+
+        parameters = {
+            'defect_energy': defect_task['output']['energy'],
+            'bulk_energy': bulk_task['output']['energy'],
+            'initial_defect_structure': init_defect_structure,
+            'final_defect_structure': final_defect_structure,
+            'vbm': bulk_task['output']['vbm'],
+            'cbm': bulk_task['output']['cbm'],
+            'defect_frac_sc_coords': defect_frac_sc_coords,
+            'entry_id': defect_task.get('task_id')
+        }
+
+        # TODO Should probably get these even if not needed for corrections
+        if init_defect_structure.charge != 0:
+            axis_grid = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_grid']] if bulk_task['output'].get('v_hartree_grid') else None
+            bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_planar']] if bulk_task['output'].get('v_hartree_planar') else None
+            defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_planar']] if defect_task['output'].get('v_hartree_planar') else None
+            bulk_site_averages = [float(x) for x in bulk_task['output']['v_hartree_sites']] if bulk_task['output'].get('v_hartree_sites') else None
+            defect_site_averages = [float(x) for x in defect_task['output']['v_hartree_sites']] if defect_task['output'].get('v_hartree_sites') else None
+
+            parameters['axis_grid'] = axis_grid
+            parameters['bulk_planar_averages'] = bulk_planar_averages
+            parameters['defect_planar_averages'] = defect_planar_averages
+            parameters['defect_frac_sc_coords'] = defect_frac_sc_coords
+            parameters['site_matching_indices'] = site_matching_indices
+            parameters['bulk_site_averages'] = bulk_site_averages
+            parameters['defect_site_averages'] = defect_site_averages
+
+        return parameters
+
+
+# TODO Some of this should be done by DefectCompatibility,
+# but it's not clear how to do that since 2d materials 
+# are not tagged in any particular way to allow defect compatibility
+# to decide which correction to apply
+class DefectDoc2d(DefectDoc):
+    """
+    DefectDoc subclass for 2D defects
+    """
+
+    @classmethod
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
+        """
+        Get defect entry from defect and bulk tasks. 
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+            dielectric: dielectric tensor for the defect calculation
+            query: query string for defect entry
+        """
+        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        if dielectric:
+            eps_parallel = (dielectric[0][0] + dielectric[1][1]) / 2
+            eps_perp = dielectric[2][2]
+            parameters['dielectric'] = (eps_parallel - 1) / (1 - 1/eps_perp)
+
+        defect_entry = DefectEntry(
+            cls.get_defect_from_task(query=query, task=defect_task),
+            uncorrected_energy=parameters.pop('defect_energy') - parameters.pop('bulk_energy'),
+            parameters=parameters,
+            entry_id=parameters.pop('entry_id')
+        )
+
+        DefectCompatibility().process_entry(defect_entry, perform_corrections=False)
+        with ScratchDir('.'):
+            fc = FreysoldtCorrection2d(
+                    defect_entry.parameters.get('dielectric'), 
+                    "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2
+                ) 
+            lref = VolumetricData(
+                structure=Structure.from_dict(bulk_task['input']['structure']), 
+                data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])}
+            )
+            ldef = VolumetricData(
+                structure=Structure.from_dict(defect_task['input']['structure']), 
+                data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])}
+            )
+            lref.write_file("LOCPOT.ref")
+            ldef.write_file("LOCPOT.def")
+            ecorr = fc.get_correction(defect_entry)
+            defect_entry.corrections.update(ecorr)
+            defect_entry.parameters['freysoldt2d_meta'] = fc.metadata
+
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
+
+        return defect_entry
+    
+class DefectThermoDoc(BaseModel):
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    property_name: ClassVar[str] = "Defect Thermo"
+
+    material_id: str = Field(None, description="Unique material ID for the host material")
+
+    task_ids: Dict = Field(
+        None, description="All task ids used in creating these phase diagrams"
+    )
+
+    #TODO not by run type... in principle shouldn't be this way, but DOS is almost always GGA
+    bulk_dos: CompleteDos = Field(None, description="Complete Density of States for the bulk Structure")
+
+    defect_phase_diagrams: Mapping[RunType, DefectPhaseDiagram] = Field(
+        None, description="Defect phase diagrams for each run type"
+    )
+
+    #brouwer_diagrams: Mapping[RunType, BrouwerDiagram] = Field(
+    #    None, description="Brouwer diagrams"
+    #)
+
+    '''
+    # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
+    @validator("brouwer_diagrams", pre=True)
+    def decode(cls, brouwer_diagrams):
+        for e in brouwer_diagrams:
+            if isinstance(brouwer_diagrams[e], dict):
+                brouwer_diagrams[e] = MontyDecoder().process_decoded(
+                    {k: v for k, v in brouwer_diagrams[e].items()}
+                )
+        return brouwer_diagrams
+    '''
+    @classmethod
+    def from_docs(cls, defects: List[DefectDoc], thermos: List[ThermoDoc], electronic_structure: CompleteDos) -> "DefectThermoDoc":
+
+        DEFAULT_RT = RunType('GGA')  # TODO NEED A procedure for getting all GGA or GGA+U keys
+        DEFAULT_RT_U = RunType('GGA+U')
+
+        mpid = defects[0].material_id
+
+        #chempots = {
+        #    m.structure.composition.elements[0]:
+        #        {rt: m.entries[rt].energy_per_atom for rt in m.entries}
+        #    for m in materials if m.structure.composition.is_element
+        #}
+
+        chempots = {
+           td.composition.elements[0]: td.energy_per_atom 
+           for td in thermos if td.composition.is_element
+        }
+
+        defect_entries = {}
+        defect_phase_diagram = {}
+        vbms = {}
+        band_gaps = {}
+        brouwer_diagrams = {}
+        task_ids = {}
+
+        dos = CompleteDos.from_dict(electronic_structure)
+        bg = dos.get_gap()
+
+        """
+        for m in materials:
+            for rt, ent in m.entries.items():
+                __found_chempots__ = True
+
+                # Chempot shift
+                for el, amt in ent.composition.element_composition.items():
+                    if Element(el) not in chempots:
+                        __found_chempots__ = False
+                        break
+                    _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt
+                    adj = CompositionEnergyAdjustment(-chempots[Element(el)][_rt],
+                                                      n_atoms=amt,
+                                                      name=f"Elemental shift {el} to formation energy space"
+                                                      )
+                    ent.energy_adjustments.append(adj)
+
+                if not __found_chempots__:
+                    continue
+
+                # Other stuff
+                band_gaps[rt] = bg
+                ent.parameters['software'] = 'cp2k'
+                ent.structure.remove_spin()
+                ent.structure.remove_oxidation_states()
+                MaterialsProject2020Compatibility().process_entry(ent)
+        """
+        for d in defects:
+            for rt, ent in d.entries.items():
+                # Chempot shift
+                __found_chempots__ = True
+                comp = Composition(ent.defect.defect_composition, allow_negative=True) - \
+                       ent.defect.bulk_structure.composition
+                for el, amt in comp.items():
+                    if Element(el) not in chempots:
+                        __found_chempots__ = False
+                        break
+                    ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)]
+
+                if not __found_chempots__:
+                    continue
+
+                # Other stuff
+                if rt not in defect_entries:
+                    defect_entries[rt] = []
+                if rt not in task_ids:
+                    task_ids[rt] = set()
+                defect_entries[rt].append(d.entries[rt])
+                vbms[rt] = d.entries[rt].parameters['vbm']  # TODO Need to find best vbm
+                task_ids[rt].update(d.task_ids)
+
+        for run_type in defect_entries:
+            # TODO MUST FILTER COMPATIBLE AT SOME POINT
+            defect_phase_diagram[run_type] = DefectPhaseDiagram(
+                entries=defect_entries[run_type],
+                vbm=vbms[run_type],
+                band_gap=bg, #band_gaps[run_type], # TODO Need to do rt dependent band gap
+                filter_compatible=False
+            )
+            """
+            brouwer_diagrams[run_type] = BrouwerDiagram(
+                defect_phase_diagram=defect_phase_diagram[run_type],
+                bulk_dos=dos,
+                entries=[
+                    m.entries[run_type]
+                    if run_type in m.entries else m.entries[DEFAULT_RT_U]
+                    if DEFAULT_RT_U in m.entries else m.entries[DEFAULT_RT]
+                    for m in materials
+                ]
+            )
+            """
+
+        data = {
+            'material_id': mpid,
+            'task_ids': task_ids,
+            'bulk_dos': dos,
+            'defect_phase_diagrams': defect_phase_diagram,
+        }
+
+        return cls(**{k: v for k, v in data.items()})
+
+
+def get_dos(mpid):
+    with MPRester() as mp:
+        return mp.get_dos_by_material_id(mpid)
+
+
+def get_entries(chemsys):
+    with MPRester() as mp:
+        return mp.get_entries_in_chemsys(chemsys)
+
+
+def unpack(query, d):
+    if not query:
+        return d
+    if isinstance(d, List):
+        return unpack(query[1:], d.__getitem__(int(query.pop(0))))
+    return unpack(query[1:], d.__getitem__(query.pop(0)))
diff --git a/emmet-core/emmet/core/mpid.py b/emmet-core/emmet/core/mpid.py
index 8a74b8fb91..69e22cfed2 100644
--- a/emmet-core/emmet/core/mpid.py
+++ b/emmet-core/emmet/core/mpid.py
@@ -32,7 +32,7 @@ def __init__(self, val: Union["MPID", int, str]):
 
         elif isinstance(val, str):
             parts = val.split("-")
-            parts[1] = int(parts[1])  # type: ignore
+            parts[-1] = int(parts[-1])  # type: ignore
             self.parts = tuple(parts)
             self.string = val
 
diff --git a/emmet-core/emmet/core/settings.py b/emmet-core/emmet/core/settings.py
index 9f6e606f13..a732f4f792 100644
--- a/emmet-core/emmet/core/settings.py
+++ b/emmet-core/emmet/core/settings.py
@@ -30,6 +30,24 @@ class EmmetSettings(BaseSettings):
         DEFAULT_CONFIG_FILE_PATH, description="File to load alternative defaults from"
     )
 
+    CP2K_SPECIAL_TAGS: List[str] = Field(
+        [], description="Special tags to prioritize for CP2K task documents"
+    )
+
+    CP2K_QUALITY_SCORES: Dict[str, int] = Field(
+        {"HYBRID": 4, "SCAN": 3, "GGA+U": 2, "GGA": 1},
+        description="Dictionary Mapping CP2K calculation run types to rung level for CP2K materials builders",
+    )
+
+    CP2K_DEFAULT_INPUT_SETS: Dict = Field(
+        {
+            "Static DFT": "pymatgen.io.cp2k.sets.StaticSet",
+            "GGA Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
+            "GGA+U Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet",
+        },
+        description="Default input sets for task validation",
+    )
+
     LTOL: float = Field(
         0.2, description="Fractional length tolerance for structure matching"
     )
diff --git a/emmet-core/emmet/core/vasp/calc_types/enums.py b/emmet-core/emmet/core/vasp/calc_types/enums.py
index b946992d1c..6f300a1a33 100644
--- a/emmet-core/emmet/core/vasp/calc_types/enums.py
+++ b/emmet-core/emmet/core/vasp/calc_types/enums.py
@@ -21,7 +21,7 @@ class RunType(ValueEnum):
     HF = "HF"
     HSE03 = "HSE03"
     HSE06 = "HSE06"
-    PB0 = "PB0"
+    PBE0 = "PBE0"
     M06L = "M06L"
     MBJL = "MBJL"
     MS0 = "MS0"
@@ -52,7 +52,7 @@ class RunType(ValueEnum):
     HF_U = "HF+U"
     HSE03_U = "HSE03+U"
     HSE06_U = "HSE06+U"
-    PB0_U = "PB0+U"
+    PBE0_U = "PBE0+U"
     M06L_U = "M06L+U"
     MBJL_U = "MBJL+U"
     MS0_U = "MS0+U"
@@ -237,17 +237,17 @@ class CalcType(ValueEnum):
     HSE06_Structure_Optimization = "HSE06 Structure Optimization"
     HSE06_Deformation = "HSE06 Deformation"
     HSE06_Unrecognized = "HSE06 Unrecognized"
-    PB0_NSCF_Line = "PB0 NSCF Line"
-    PB0_NSCF_Uniform = "PB0 NSCF Uniform"
-    PB0_Dielectric = "PB0 Dielectric"
-    PB0_DFPT = "PB0 DFPT"
-    PB0_DFPT_Dielectric = "PB0 DFPT Dielectric"
-    PB0_NMR_Nuclear_Shielding = "PB0 NMR Nuclear Shielding"
-    PB0_NMR_Electric_Field_Gradient = "PB0 NMR Electric Field Gradient"
-    PB0_Static = "PB0 Static"
-    PB0_Structure_Optimization = "PB0 Structure Optimization"
-    PB0_Deformation = "PB0 Deformation"
-    PB0_Unrecognized = "PB0 Unrecognized"
+    PBE0_NSCF_Line = "PBE0 NSCF Line"
+    PBE0_NSCF_Uniform = "PBE0 NSCF Uniform"
+    PBE0_Dielectric = "PBE0 Dielectric"
+    PBE0_DFPT = "PBE0 DFPT"
+    PBE0_DFPT_Dielectric = "PBE0 DFPT Dielectric"
+    PBE0_NMR_Nuclear_Shielding = "PBE0 NMR Nuclear Shielding"
+    PBE0_NMR_Electric_Field_Gradient = "PBE0 NMR Electric Field Gradient"
+    PBE0_Static = "PBE0 Static"
+    PBE0_Structure_Optimization = "PBE0 Structure Optimization"
+    PBE0_Deformation = "PBE0 Deformation"
+    PBE0_Unrecognized = "PBE0 Unrecognized"
     M06L_NSCF_Line = "M06L NSCF Line"
     M06L_NSCF_Uniform = "M06L NSCF Uniform"
     M06L_Dielectric = "M06L Dielectric"
@@ -582,17 +582,17 @@ class CalcType(ValueEnum):
     HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization"
     HSE06_U_Deformation = "HSE06+U Deformation"
     HSE06_U_Unrecognized = "HSE06+U Unrecognized"
-    PB0_U_NSCF_Line = "PB0+U NSCF Line"
-    PB0_U_NSCF_Uniform = "PB0+U NSCF Uniform"
-    PB0_U_Dielectric = "PB0+U Dielectric"
-    PB0_U_DFPT = "PB0+U DFPT"
-    PB0_U_DFPT_Dielectric = "PB0+U DFPT Dielectric"
-    PB0_U_NMR_Nuclear_Shielding = "PB0+U NMR Nuclear Shielding"
-    PB0_U_NMR_Electric_Field_Gradient = "PB0+U NMR Electric Field Gradient"
-    PB0_U_Static = "PB0+U Static"
-    PB0_U_Structure_Optimization = "PB0+U Structure Optimization"
-    PB0_U_Deformation = "PB0+U Deformation"
-    PB0_U_Unrecognized = "PB0+U Unrecognized"
+    PBE0_U_NSCF_Line = "PBE0+U NSCF Line"
+    PBE0_U_NSCF_Uniform = "PBE0+U NSCF Uniform"
+    PBE0_U_Dielectric = "PBE0+U Dielectric"
+    PBE0_U_DFPT = "PBE0+U DFPT"
+    PBE0_U_DFPT_Dielectric = "PBE0+U DFPT Dielectric"
+    PBE0_U_NMR_Nuclear_Shielding = "PBE0+U NMR Nuclear Shielding"
+    PBE0_U_NMR_Electric_Field_Gradient = "PBE0+U NMR Electric Field Gradient"
+    PBE0_U_Static = "PBE0+U Static"
+    PBE0_U_Structure_Optimization = "PBE0+U Structure Optimization"
+    PBE0_U_Deformation = "PBE0+U Deformation"
+    PBE0_U_Unrecognized = "PBE0+U Unrecognized"
     M06L_U_NSCF_Line = "M06L+U NSCF Line"
     M06L_U_NSCF_Uniform = "M06L+U NSCF Uniform"
     M06L_U_Dielectric = "M06L+U Dielectric"
diff --git a/emmet-core/emmet/core/vasp/validation.py b/emmet-core/emmet/core/vasp/validation.py
index 817d315ed1..b3dc6af633 100644
--- a/emmet-core/emmet/core/vasp/validation.py
+++ b/emmet-core/emmet/core/vasp/validation.py
@@ -43,6 +43,8 @@ class ValidationDoc(EmmetBaseModel):
     Validation document for a VASP calculation
     """
 
+    calc_code = "vasp"
+
     task_id: MPID = Field(..., description="The task_id for this validation document")
     valid: bool = Field(False, description="Whether this task is valid or not")
     last_updated: datetime = Field(