diff --git a/emmet-builders/emmet/builders/cp2k/defects.py b/emmet-builders/emmet/builders/cp2k/defects.py new file mode 100644 index 0000000000..e988665d7e --- /dev/null +++ b/emmet-builders/emmet/builders/cp2k/defects.py @@ -0,0 +1,910 @@ +from datetime import datetime +from itertools import chain, groupby, combinations +from re import A +from tkinter import W +from typing import Dict, Iterator, List, Optional +from copy import deepcopy +from math import ceil +from monty.json import MontyDecoder + +from maggma.builders import Builder +from maggma.stores import Store +from maggma.utils import grouper + +from pymatgen.core import Structure +from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher, PointDefectComparator +from pymatgen.symmetry.analyzer import SpacegroupAnalyzer + +from atomate.utils.utils import load_class + +from emmet.core.settings import EmmetSettings +from emmet.core.thermo import ThermoDoc +from emmet.core.material import MaterialsDoc + +from emmet.core.utils import jsanitize, get_sg +from emmet.core.defect import DefectDoc, DefectDoc2d, DefectThermoDoc +from emmet.core.cp2k.calc_types import TaskType +from emmet.core.cp2k.calc_types.utils import run_type +from emmet.builders.settings import EmmetBuildSettings +from emmet.builders.cp2k.utils import synchronous_query + +from emmet.core.electronic_structure import ElectronicStructureDoc + +SETTINGS = EmmetSettings() + +__author__ = "Nicholas Winner " +__maintainer__ = "Jason Munro" + + +# TODO this builder is very close to being code agnostic. We need only resolve the standard key names and +# how they are fed to the DefectDoc class. e.g. VASP calcs store "locpot", but CP2K store "v_hartree" +class DefectBuilder(Builder): + """ + The DefectBuilder collects task documents performed on structures containing a single point defect. + The builder is intended to group tasks corresponding to the same defect (species including charge state), + find the best ones, and perform finite-size defect corrections to create a defect document. These + defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder. + + In order to make the build process easier, an entry must exist inside of the task doc that identifies it + as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future, + this may be changed to having a defect transformation in the transformation history. + + The process is as follows: + + 1.) Find all documents containing the defect query. + 2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already + calculated. These are the candidate bulk tasks. + 3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites + (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding + bulk calculation. + 4.) Convert (defect, bulk task) doc pairs to DefectDocs + 5.) Post-process and validate defect document + 6.) Update the defect store + """ + + #TODO how to incorporate into settings? + DEFAULT_ALLOWED_TASKS = [ + TaskType.Structure_Optimization.value, + TaskType.Static.value + ] + + # TODO: should electrostatic_potential store be optional in case people + # unify their LOCPOT data? + # TODO: should dielectric/electronic_structure be optional or required? + def __init__( + self, + tasks: Store, + defects: Store, + dielectric: Store, + electronic_structure: Store, + materials: Store, + electrostatic_potentials: Store, + task_validation: Optional[Store] = None, + query: Optional[Dict] = None, + bulk_query: Optional[Dict] = None, + allowed_task_types: Optional[List[str]] = DEFAULT_ALLOWED_TASKS, + settings: Optional[EmmetBuildSettings] = None, + defects_2d: Optional[bool] = False, + **kwargs, + ): + """ + Args: + tasks: Store of task documents + defects: Store of defect documents to generate + dielectric: Store of dielectric data + electronic_structure: Store of electronic structure data + materials: Store of materials documents + electrostatic_potentials: Store of electrostatic potential data. These + are generally stored in seperately from the tasks on GridFS due to their size. + task_validation: Store of task validation documents. + query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property + allowed_task_types: list of task_types that can be processed + settings: EmmetBuildSettings object + """ + + self.tasks = tasks + self.defects = defects + self.materials = materials + self.dielectric = dielectric + self.electronic_structure = electronic_structure + self.electrostatic_potentials = electrostatic_potentials + self.task_validation = task_validation + self.allowed_task_types = allowed_task_types #TODO How to incorporate into getitems? + + self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types} + self.settings = EmmetBuildSettings.autoload(settings) + self.query = query if query else {} + self.bulk_query = bulk_query if bulk_query else {} + self.timestamp = None + self._mpid_map = {} + self.defects_2d = defects_2d + self.kwargs = kwargs + + self._defect_query = 'transformations.history.0.defect' + + self._required_defect_properties = [ + self._defect_query, + 'output.energy', + 'output.structure', + 'input', + 'transformations', + 'task_id', + 'nsites' + ] + + self._required_bulk_properties = [ + 'output.energy', + 'output.structure', + 'output.vbm', + 'output.cbm', + 'input', + 'task_id', + ] + + self._optional_defect_properties = [] + self._optional_bulk_properties = [] + + # TODO This seems pretty inelegant + if defects_2d: + self._required_defect_properties.append('v_hartree') + self._required_bulk_properties.append('v_hartree') + else: + self._required_defect_properties.append('output.v_hartree_planar') + self._required_defect_properties.append('output.v_hartree_grid') + self._optional_defect_properties.append('output.v_hartree_sites') + self._required_bulk_properties.append('output.v_hartree_planar') + self._required_bulk_properties.append('output.v_hartree_grid') + self._optional_bulk_properties.append('output.v_hartree_sites') + + sources = [tasks, dielectric, electronic_structure, materials, electrostatic_potentials] + if self.task_validation: + sources.append(self.task_validation) + super().__init__(sources=sources, targets=[defects], **kwargs) + + @property + def defect_query(self) -> str: + """ + The standard query for defect tasks. + """ + return self._defect_query + + #TODO Hartree pot should be required but only for charged defects + @property + def required_defect_properties(self) -> List: + """ + Properties essential to processing a defect task. + """ + return self._required_defect_properties + + @property + def required_bulk_properties(self) -> List: + """ + Properties essential to processing a bulk task. + """ + return self._required_bulk_properties + + @property + def optional_defect_properties(self) -> List: + """ + Properties that are optional for processing a defect task. + """ + return self._optional_defect_properties + + @property + def optional_bulk_properties(self) -> List: + """ + Properties that are optional for bulk tasks. + """ + return self._optional_bulk_properties + + @property + def mpid_map(self) -> Dict: + return self._mpid_map + + def ensure_indexes(self): + """ + Ensures indicies on the tasks and materials collections + """ + + # Basic search index for tasks + self.tasks.ensure_index("task_id") + self.tasks.ensure_index("last_updated") + self.tasks.ensure_index("state") + self.tasks.ensure_index("formula_pretty") + + # Search index for materials + self.materials.ensure_index("material_id") + self.materials.ensure_index("last_updated") + self.materials.ensure_index("task_ids") + + # Search index for defects + self.defects.ensure_index("material_id") + self.defects.ensure_index("last_updated") + self.defects.ensure_index("task_ids") + + if self.task_validation: + self.task_validation.ensure_index("task_id") + self.task_validation.ensure_index("valid") + + def prechunk(self, number_splits: int) -> Iterator[Dict]: + + tag_query = {} + if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0: + tag_query["$and"] = [ + {"tags": {"$in": self.settings.BUILD_TAGS}}, + {"tags": {"$nin": self.settings.EXCLUDED_TAGS}}, + ] + elif len(self.settings.BUILD_TAGS) > 0: + tag_query["tags"] = {"$in": self.settings.BUILD_TAGS} + + # Get defect tasks + temp_query = self.query.copy() + temp_query.update(tag_query) + temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties}) + temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"}) + defect_tasks = { + doc[self.tasks.key] + for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key]) + } + + # Get bulk tasks + temp_query = self.bulk_query.copy() + temp_query.update(tag_query) + temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties}) + temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"}) + bulk_tasks = { + doc[self.tasks.key] + for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key]) + } + + N = ceil(len(defect_tasks) / number_splits) + for task_chunk in grouper(defect_tasks, N): + yield {"query": {"task_id": {"$in": task_chunk + list(bulk_tasks)}}} + + def get_items(self) -> Iterator[List[Dict]]: + """ + Gets all items to process into defect documents. + This does no datetime checking; relying on on whether + task_ids are included in the Defect Collection. + + The procedure is as follows: + + 1. Get all tasks with standard "defect" query tag + 2. Filter all tasks by skipping tasks which are already in the Defect Store + 3. Get all tasks that could be used as bulk + 4. Filter all bulks which do not have corresponding Dielectric and + ElectronicStructure data (if a band gap exists for that task). + 5. Group defect tasks by defect matching + 6. Given defect object in a group, bundle them with bulk tasks + identified with structure matching + 7. Yield the item bundles + + Returns: + Iterator of (defect documents, task bundles) + + The defect document is an existing defect doc to be updated with new data, or None + + task bundles bundle are all the tasks that correspond to the same defect and all possible + bulk tasks that could be matched to them. +d """ + + self.logger.info("Defect builder started") + self.logger.info( + f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}" + ) + + self.logger.info("Setting indexes") + self.ensure_indexes() + + # Save timestamp to mark buildtime for material documents + self.timestamp = datetime.utcnow() + + self.logger.info("Finding tasks to process") + + # Get defect tasks + temp_query = self.query.copy() + temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties}) + temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"}) + defect_tasks = { + doc[self.tasks.key] + for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key]) + } + + # Get bulk tasks + temp_query = self.bulk_query.copy() + temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties}) + temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"}) + bulk_tasks = { + doc[self.tasks.key] + for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key]) + } + + # TODO Not the same validation behavior as material builders? + # If validation store exists, find tasks that are invalid and remove them + if self.task_validation: + validated = { + doc[self.tasks.key] + for doc in self.task_validation.query( + {}, [self.task_validation.key] + ) + } + + defect_tasks = defect_tasks.intersection(validated) + bulk_tasks = bulk_tasks.intersection(validated) + + invalid_ids = { + doc[self.tasks.key] + for doc in self.task_validation.query( + {"is_valid": False}, [self.task_validation.key] + ) + } + self.logger.info("Removing {} invalid tasks".format(len(invalid_ids))) + defect_tasks = defect_tasks - invalid_ids + bulk_tasks = bulk_tasks - invalid_ids + + processed_defect_tasks = { + t_id + for d in self.defects.query({}, ["task_ids"]) + for t_id in d.get("task_ids", []) + } + + all_tasks = defect_tasks | bulk_tasks + + self.logger.debug("All tasks: {}".format(len(all_tasks))) + self.logger.debug("Bulk tasks before filter: {}".format(len(bulk_tasks))) + bulk_tasks = set(filter(self.__preprocess_bulk, bulk_tasks)) + self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks))) + self.logger.debug("All defect tasks: {}".format(len(defect_tasks))) + unprocessed_defect_tasks = defect_tasks - processed_defect_tasks + + if not unprocessed_defect_tasks: + self.logger.info("No unprocessed defect tasks. Exiting") + return + elif not bulk_tasks: + self.logger.info("No compatible bulk calculations. Exiting.") + return + + self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks") + self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties") + + # Set total for builder bars to have a total + self.total = len(unprocessed_defect_tasks) + + # yield list of defects that are of the same type, matched to an appropriate bulk calc + self.logger.info(f"Starting defect matching.") + + for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks): + task_ids = self.__match_defects_to_bulks(bulk_tasks, defect_task_group) + doc = self.__get_defect_doc(defect) + item_bundle = self.__get_item_bundle(task_ids) + material_id = self.mpid_map[item_bundle[0][1]['task_id']] + yield doc, item_bundle, material_id + + def process_item(self, items): + """ + Process a group of defect tasks that correspond to the same defect into a single defect + document. If the DefectDoc already exists, then update it and return it. If it does not, + create a new DefectDoc + + Args: + items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ... ] + + returns: the defect document as a dictionary + """ + defect_doc, item_bundle, material_id = items + self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc") + if item_bundle: + tags = item_bundle[0][0].get('tags', []) + # TODO: This is a hack to get the 2d doc + DEFECT_DOC = DefectDoc2d if '2d' in tags else DefectDoc + if defect_doc: + defect_doc.update_all(item_bundle, query=self.defect_query) + else: + defect_doc = DEFECT_DOC.from_tasks(tasks=item_bundle, query=self.defect_query, material_id=material_id) + return defect_doc.dict() + return {} + + def update_targets(self, items): + """ + Inserts the new task_types into the task_types collection + """ + + items = [item for item in items if item] + + if len(items) > 0: + self.logger.info(f"Updating {len(items)} defects") + for item in items: + item.update({"_bt": self.timestamp}) + self.defects.remove_docs( + { + "task_ids": item['task_ids'], + } + ) + self.defects.update( + docs=jsanitize(items, allow_bson=True), + key='task_ids', + ) + else: + self.logger.info("No items to update") + + def __filter_and_group_tasks(self, tasks): + """ + Groups defect tasks. Tasks are grouped according to the reduced representation + of the defect, and so tasks with different settings (e.g. supercell size, functional) + will be grouped together. + + Args: + tasks: task_ids for unprocessed defects + + returns: + [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect + """ + + # TODO self.defect_query does not work here because mongo does not allow projection + # that retrieves a list index. So, we have to retrieve the whole transformations + props = [ + #self.defect_query, + 'transformations', + 'task_id', + 'output.structure' + ] + + self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects") + + pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False) + sm = StructureMatcher( + ltol=self.settings.LTOL, stol=self.settings.STOL, + angle_tol=self.settings.ANGLE_TOL, allow_subset=False + ) + defects = [ + { + 'task_id': t['task_id'], 'defect': self.__get_defect_from_task(t), + 'structure': Structure.from_dict(t['output']['structure']) + } + for t in self.tasks.query(criteria={'task_id': {'$in': list(tasks)}}, properties=props) + ] + for d in defects: + # TODO remove oxidation state because spins/oxidation cause errors in comparison. + # but they shouldnt if those props are close in value + d['structure'].remove_oxidation_states() + + def key(x): + s = x.get('defect').bulk_structure + return get_sg(s), s.composition.reduced_composition + + def are_equal(x, y): + """ + To decide if defects are equal. Either the defect objects are + equal, OR two different defect objects relaxed to the same final structure + (common with interstitials). + + TODO Need a way to do the output structure comparison for a X atom defect cell + TODO which can be embedded in a Y atom defect cell up to tolerance. + """ + if pdc.are_equal(x['defect'], y['defect']): + return True + + # TODO This is needed for ghost vacancy unfortunately, since sm.fit can't distinguish ghosts + if x['defect'].defect_composition == y['defect'].defect_composition and \ + x['defect'].charge == y['defect'].charge and \ + sm.fit(x['structure'], y['structure']): + return True + return False + + sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1])) + all_groups = [] + + # For each pre-grouped list of structures, perform actual matching. + for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])): + unmatched = list(g) + while len(unmatched) > 0: + i, refs = unmatched.pop(0) + matches = [i] + inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched))))) + matches.extend([unmatched[i][0] for i in inds]) + unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds] + all_groups.append( + (defects[i]['defect'], [defects[i]['task_id'] for i in matches]) + ) + + self.logger.debug(f"All groups {all_groups}") + return all_groups + + def __get_defect_from_task(self, task): + """ + Using the defect_query property, retrieve a pymatgen defect object from the task document + """ + defect = unpack(self.defect_query.split('.'), task) + needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name'] + return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys}) + + def __get_defect_doc(self, defect): + """ + Given a defect, find the DefectDoc corresponding to it in the defects store if it exists + + returns: DefectDoc or None + """ + material_id = self._get_mpid(defect.bulk_structure) + docs = [ + DefectDoc(**doc) + for doc in self.defects.query(criteria={'material_id': material_id}, properties=None) + ] + pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=True) + for doc in docs: + if pdc.are_equal(defect, doc.defect): + return doc + return None + + # TODO should move to returning dielectric doc or continue returning the total diel tensor? + def __get_dielectric(self, key): + """ + Given a bulk task's task_id, find the material_id, and then use it to query the dielectric store + and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would + be the case for metallic systems, return None. + """ + for diel in self.dielectric.query(criteria={"material_id": key}, properties=['total']): + return diel['total'] + return None + + #TODO retrieving the electrostatic potential is by far the most expesive part of the builder. Any way to reduce? + def __get_item_bundle(self, task_ids): + """ + Gets a group of items that can be processed together into a defect document. + + Args: + bulk_tasks: possible bulk tasks to match to defects + defect_task_group: group of equivalent defects (defined by PointDefectComparator) + + returns: [(defect task dict, bulk_task_dict, dielectric dict), ...] + """ + return [ + ( + next( + synchronous_query( + self.tasks, + self.electrostatic_potentials, + query={'task_id': defect_tasks_id}, properties=None) #self.required_defect_properties) #TODO + ), + next( + synchronous_query( + self.tasks, + self.electrostatic_potentials, + query={'task_id': bulk_tasks_id}, properties=self.required_bulk_properties) + ), + self.__get_dielectric(self._mpid_map[bulk_tasks_id]), + ) + for defect_tasks_id, bulk_tasks_id in task_ids + ] + + def _get_mpid(self, structure): + """ + Given a structure, determine if an equivalent structure exists, with a material_id, + in the materials store. + + Args: + structure: Candidate structure + + returns: material_id, if one exists, else None + """ + sga = SpacegroupAnalyzer(structure, symprec=SETTINGS.SYMPREC, angle_tolerance=SETTINGS.ANGLE_TOL) + mats = self.materials.query( + criteria={ + 'chemsys': structure.composition.chemical_system, + }, properties=['structure', 'material_id'] + ) + # TODO coudl more than one material match true? + sm = StructureMatcher(ltol=SETTINGS.LTOL, stol=SETTINGS.STOL, angle_tol=SETTINGS.ANGLE_TOL) + for m in mats: + if sm.fit(structure, Structure.from_dict(m['structure'])): + return m['material_id'] + return None + + def __match_defects_to_bulks(self, bulk_ids, defect_ids): + """ + Given task_ids of bulk and defect tasks, match the defects to a bulk task that has + commensurate: + + - Composition + - Number of sites + - Symmetry + + """ + + self.logger.debug(f"Finding bulk/defect task combinations.") + self.logger.debug(f"Bulk tasks: {bulk_ids}") + self.logger.debug(f"Defect tasks: {defect_ids}") + + # TODO mongo projection on array doesn't work (see above) + props = [ + 'task_id', + 'input', + 'nsites', + 'output.structure', + 'transformations', + 'defect', + 'scale', + ] + defects = list(self.tasks.query(criteria={'task_id': {'$in': list(defect_ids)}}, properties=props)) + ps = DefectBuilder.__get_pristine_supercell(defects[0]) + bulks = list( + self.tasks.query( + criteria={ + 'task_id': {'$in': list(bulk_ids)}, + 'composition_reduced': ps.composition.reduced_composition.as_dict(), + }, + properties=props + ) + ) + + sm = StructureMatcher( + ltol=SETTINGS.LTOL, + stol=SETTINGS.STOL, + angle_tol=SETTINGS.ANGLE_TOL, + primitive_cell=False, + scale=True, + attempt_supercell=False, + allow_subset=False, + comparator=ElementComparator(), + ) + + def _compare(b, d): + if run_type(b.get('input').get('dft')).value.split('+U')[0] == \ + run_type(d.get('input').get('dft')).value.split('+U')[0] and \ + sm.fit(DefectBuilder.__get_pristine_supercell(d), DefectBuilder.__get_pristine_supercell(b)): + return True + return False + + pairs = [ + (defect['task_id'], bulk['task_id']) + for bulk in bulks + for defect in defects + if _compare(bulk, defect) + ] + + self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs") + return pairs + + def __preprocess_bulk(self, task): + """ + Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk + tasks must have: + + (1) Correspond to an existing material_id in the materials store + (2) If the bulk is not a metal, then the dielectric tensor must exist in the dielectric store + (3) If bulk is not a metal, electronic structure document must exist in the store + + """ + self.logger.debug("Preprocessing bulk task {}".format(task)) + t = next(self.tasks.query(criteria={'task_id': task}, properties=['output.structure', 'mpid'])) + + struc = Structure.from_dict(t.get('output').get('structure')) + mpid = self._get_mpid(struc) + if not mpid: + self.logger.debug(f"No material id found for bulk task {task}") + return False + self._mpid_map[task] = mpid + self.logger.debug(f"Material ID: {mpid}") + + elec = self.electronic_structure.query_one( + properties=['band_gap'], criteria={self.electronic_structure.key: mpid} + ) + if not elec: + self.logger.debug(f"Electronic structure data not found for {mpid}") + return False + + # TODO right now pulling dos from electronic structure, should just pull summary document + if elec['band_gap'] > 0: + diel = self.__get_dielectric(mpid) + if not diel: + self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires " + f"dielectric properties, but none found in dielectric store") + return False + + return True + + @staticmethod + def __get_pristine_supercell(task): + """ + Given a task document for a defect calculation, retrieve the un-defective, pristine supercell. + - If defect transform exists, the following transform's input will be returned + - If no follow up transform exists, the calculation input will be returned + + If defect cannot be found in task, return the input structure. + """ + if task.get('transformations'): + for i, transformation in enumerate(task['transformations']['history']): + if transformation['@class'] == 'DefectTransformation': + tmp = Structure.from_dict(transformation['input_structure']) + tmp.make_supercell(transformation['scaling_matrix']) + return tmp + return Structure.from_dict(task['input']['structure']) + + +#TODO Major problem with this builder. materials store is used to sync the diel, elec, and pd with a single material id +#TODO This is a problem because the material id in vasp store is not synced to cp2k store +#TODO Also the chempots needed to adjust entries must come from cp2k, but you need to give vasp to sync the others +class DefectThermoBuilder(Builder): + + """ + This builder creates collections of the DefectThermoDoc object. + + (1) Find all DefectDocs that correspond to the same bulk material + given by material_id + (2) Create a new DefectThermoDoc for all of those documents + (3) Insert/Update the defect_thermos store with the new documents + """ + + def __init__( + self, + defects: Store, + defect_thermos: Store, + materials: Store, + thermo: Store, + electronic_structures: Store, + dos: Store, + query: Optional[Dict] = None, + **kwargs, + ): + """ + Args: + defects: Store of defect documents (generated by DefectBuilder) + defect_thermos: Store of DefectThermoDocs to generate. + materials: Store of MaterialDocs to construct phase diagram + electronic_structures: Store of DOS objects + query: dictionary to limit tasks to be analyzed + """ + + self.defects = defects + self.defect_thermos = defect_thermos + self.materials = materials + self.thermo = thermo + self.dos = dos + self.electronic_structures = electronic_structures + + self.query = query if query else {} + self.timestamp = None + self.kwargs = kwargs + + super().__init__(sources=[defects, materials, thermo, electronic_structures, dos], targets=[defect_thermos], **kwargs) + + def ensure_indexes(self): + """ + Ensures indicies on the collections + """ + + # Basic search index for tasks + self.defects.ensure_index("material_id") + self.defects.ensure_index("defect_id") + + # Search index for materials + self.defect_thermos.ensure_index("material_id") + + # TODO need to only process new tasks. Fast builder so currently is OK for small collections + def get_items(self) -> Iterator[List[Dict]]: + """ + Gets items to process into DefectThermoDocs. + + returns: + iterator yielding tuples containing: + - group of DefectDocs belonging to the same bulk material as indexed by material_id, + - materials in the chemsys of the bulk material for constructing phase diagram + - Dos of the bulk material for constructing phase diagrams/getting doping + + """ + + self.logger.info("Defect thermo builder started") + self.logger.info("Setting indexes") + self.ensure_indexes() + + # Save timestamp to mark build time for defect thermo documents + self.timestamp = datetime.utcnow() + + # Get all tasks + self.logger.info("Finding tasks to process") + temp_query = dict(self.query) + temp_query["state"] = "successful" + + #unprocessed_defect_tasks = all_tasks - processed_defect_tasks + + all_docs = [doc for doc in self.defects.query(self.query)] + + self.logger.debug(f"Found {len(all_docs)} defect docs to process") + + def filterfunc(x): + # material for defect x exists + if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)): + self.logger.debug(f"No material with MPID={x['material_id']} in the material store") + return False + + for el in load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition: + if not list(self.thermo.query(criteria={'chemsys': str(el)}, properties=None)): + self.logger.debug(f"No entry for {el} in Thermo Store") + return False + + return True + + for key, group in groupby( + filter( + filterfunc, + sorted(all_docs, key=lambda x: x['material_id']) + ), key=lambda x: x['material_id'] + ): + group = [g for g in group] + try: + mat = self.__get_materials(key) + thermo = self.__get_thermos(mat.composition) + elec = self.__get_electronic_structure(group[0]['material_id']) + yield (group, mat, thermo, elec) + except LookupError as exception: + raise exception + + def process_item(self, docs): + """ + Process a group of defects belonging to the same material into a defect thermo doc + """ + self.logger.info(f"Processing defects") + defects, material, thermos, elec_struc = docs + defects = [DefectDoc(**d) for d in defects] + thermos = [ThermoDoc(**t) for t in thermos] + defect_thermo_doc = DefectThermoDoc.from_docs(defects, thermos=thermos, electronic_structure=elec_struc) + return defect_thermo_doc.dict() + + def update_targets(self, items): + """ + Inserts the new DefectThermoDocs into the defect_thermos store + """ + items = [item for item in items if item] + for item in items: + item.update({"_bt": self.timestamp}) + + if len(items) > 0: + self.logger.info(f"Updating {len(items)} defect thermo docs") + self.defect_thermos.update( + docs=jsanitize(items, allow_bson=True), + key=self.defect_thermos.key, + ) + else: + self.logger.info("No items to update") + + def __get_electronic_structure(self, material_id): + """ + Gets the electronic structure of the bulk material + """ + self.logger.info(f"Getting electronic structure for {material_id}") + + # TODO This is updated to return the whole query because a.t.m. the + # DOS part of the electronic builder isn't working, so I'm using + # this to pull direct from the store of dos objects with no processing. + dosdoc = self.electronic_structures.query_one( + criteria={self.electronic_structures.key: material_id}, + properties=None, + ) + t_id = ElectronicStructureDoc(**dosdoc).dos.total['1'].task_id + dos = self.dos.query_one(criteria={'task_id': int(t_id)}, properties=None) #TODO MPID str/int issues + return dos + + def __get_materials(self, key) -> List: + """ + Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the + materials store. + """ + bulk = self.materials.query_one(criteria={'material_id': key}, properties=None) + if not bulk: + raise LookupError( + f"The bulk material ({key}) for these defects cannot be found in the materials store" + ) + return MaterialsDoc(**bulk) + + def __get_thermos(self, composition) -> List: + return list(self.thermo.query(criteria={'elements': {"$size": 1}}, properties=None)) + + +def unpack(query, d): + """ + Unpack a mongo-style query into dictionary retrieval + """ + if not query: + return d + if isinstance(d, List): + return unpack(query[1:], d.__getitem__(int(query.pop(0)))) + return unpack(query[1:], d.__getitem__(query.pop(0))) diff --git a/emmet-builders/emmet/builders/cp2k/materials.py b/emmet-builders/emmet/builders/cp2k/materials.py new file mode 100644 index 0000000000..1846bcc57d --- /dev/null +++ b/emmet-builders/emmet/builders/cp2k/materials.py @@ -0,0 +1,275 @@ +from datetime import datetime +from itertools import chain +from typing import Dict, Iterator, List, Optional + +from maggma.builders import Builder +from maggma.stores import Store + +from emmet.builders.settings import EmmetBuildSettings +from emmet.core.utils import group_structures, jsanitize +from emmet.core.cp2k.material import MaterialsDoc +from emmet.core.cp2k.task import TaskDocument + +SETTINGS = EmmetBuildSettings() + +__author__ = "Nicholas Winner " +__maintainer__ = "Shyam Dwaraknath " + + +class MaterialsBuilder(Builder): + """ + The Materials Builder matches VASP task documents by structure similarity into materials + document. The purpose of this builder is group calculations and determine the best structure. + All other properties are derived from other builders. + + The process is as follows: + + 1.) Find all documents with the same formula + 2.) Select only task documents for the task_types we can select properties from + 3.) Aggregate task documents based on structure similarity + 4.) Convert task docs to property docs with metadata for selection and aggregation + 5.) Select the best property doc for each property + 6.) Build material document from best property docs + 7.) Post-process material document + 8.) Validate material document + + """ + + def __init__( + self, + tasks: Store, + materials: Store, + task_validation: Optional[Store] = None, + query: Optional[Dict] = None, + settings: Optional[EmmetBuildSettings] = None, + **kwargs, + ): + """ + Args: + tasks: Store of task documents + materials: Store of materials documents to generate + query: dictionary to limit tasks to be analyzed + allowed_task_types: list of task_types that can be processed + symprec: tolerance for SPGLib spacegroup finding + ltol: StructureMatcher tuning parameter for matching tasks to materials + stol: StructureMatcher tuning parameter for matching tasks to materials + angle_tol: StructureMatcher tuning parameter for matching tasks to materials + """ + + self.tasks = tasks + self.materials = materials + self.task_validation = task_validation + self.query = query if query else {} + self.settings = EmmetBuildSettings.autoload(settings) + self.kwargs = kwargs + + sources = [tasks] + if self.task_validation: + sources.append(self.task_validation) + super().__init__(sources=sources, targets=[materials], **kwargs) + + def ensure_indexes(self): + """ + Ensures indicies on the tasks and materials collections + """ + + # Basic search index for tasks + self.tasks.ensure_index("task_id") + self.tasks.ensure_index("last_updated") + self.tasks.ensure_index("state") + self.tasks.ensure_index("formula_pretty") + + # Search index for materials + self.materials.ensure_index("material_id") + self.materials.ensure_index("last_updated") + self.materials.ensure_index("sandboxes") + self.materials.ensure_index("task_ids") + + if self.task_validation: + self.task_validation.ensure_index("task_id") + self.task_validation.ensure_index("valid") + + def get_items(self) -> Iterator[List[Dict]]: + """ + Gets all items to process into materials documents. + This does no datetime checking; relying on on whether + task_ids are included in the Materials Collection + + Returns: + generator or list relevant tasks and materials to process into materials documents + """ + + self.logger.info("Materials builder started") + # TODO make a cp2k allowed type setting + self.logger.info( + f"Allowed task types: {[task_type.value for task_type in self.settings.CP2K_ALLOWED_TASK_TYPES]}" + ) + + self.logger.info("Setting indexes") + self.ensure_indexes() + + # Save timestamp to mark buildtime for material documents + self.timestamp = datetime.utcnow() + + # Get all processed tasks: + temp_query = dict(self.query) + temp_query["state"] = "successful" + + self.logger.info("Finding tasks to process") + all_tasks = { + doc[self.tasks.key] + for doc in self.tasks.query(temp_query, [self.tasks.key]) + } + processed_tasks = { + t_id + for d in self.materials.query({}, ["task_ids"]) + for t_id in d.get("task_ids", []) + } + to_process_tasks = all_tasks - processed_tasks + to_process_forms = self.tasks.distinct( + "formula_pretty", {self.tasks.key: {"$in": list(to_process_tasks)}} + ) + self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks") + self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas") + + # Set total for builder bars to have a total + self.total = len(to_process_forms) + + if self.task_validation: + invalid_ids = { + doc[self.tasks.key] + for doc in self.task_validation.query( + {"is_valid": False}, [self.task_validation.key] + ) + } + else: + invalid_ids = set() + + projected_fields = [ + "last_updated", + "completed_at", + "task_id", + "formula_pretty", + "output.energy_per_atom", + "output.structure", + "input", + "orig_inputs", + "input.structure", + "tags", + ] + + for formula in to_process_forms: + tasks_query = dict(temp_query) + tasks_query["formula_pretty"] = formula + tasks = list( + self.tasks.query(criteria=tasks_query, properties=None) + ) + for t in tasks: + if t[self.tasks.key] in invalid_ids: + t["is_valid"] = False + else: + t["is_valid"] = True + + yield tasks + + def process_item(self, tasks: List[Dict]) -> List[Dict]: + """ + Process the tasks into a list of materials + + Args: + tasks [dict] : a list of task docs + + Returns: + ([dict],list) : a list of new materials docs and a list of task_ids that were processsed + """ + + tasks = [TaskDocument(**task) for task in tasks] + formula = tasks[0].formula_pretty + task_ids = [task.task_id for task in tasks] + self.logger.debug(f"Processing {formula} : {task_ids}") + + grouped_tasks = self.filter_and_group_tasks(tasks) + materials = [] + for group in grouped_tasks: + try: + materials.append( + MaterialsDoc.from_tasks( + group, + quality_scores=self.settings.CP2K_QUALITY_SCORES, + ) + ) + except Exception as e: + # TODO construct deprecated + + failed_ids = list({t_.task_id for t_ in group}) + doc = MaterialsDoc.construct_deprecated_material(tasks) + doc.warnings.append(str(e)) + materials.append(doc) + self.logger.warn( + f"Failed making material for {failed_ids}." + f" Inserted as deprecated Material: {doc.material_id}" + ) + + self.logger.debug(f"Produced {len(materials)} materials for {formula}") + + return jsanitize([mat.dict() for mat in materials], allow_bson=True) + + def update_targets(self, items: List[List[Dict]]): + """ + Inserts the new task_types into the task_types collection + + Args: + items ([([dict],[int])]): A list of tuples of materials to update and the corresponding + processed task_ids + """ + + items = list(filter(None, chain.from_iterable(items))) + + for item in items: + item.update({"_bt": self.timestamp}) + + material_ids = list({item["material_id"] for item in items}) + + if len(items) > 0: + self.logger.info(f"Updating {len(items)} materials") + self.materials.remove_docs({self.materials.key: {"$in": material_ids}}) + self.materials.update( + docs=jsanitize(items, allow_bson=True), + key=["material_id"], + ) + else: + self.logger.info("No items to update") + + def filter_and_group_tasks(self, tasks: List[TaskDocument]) -> Iterator[List[Dict]]: + """ + Groups tasks by structure matching + """ + + # TODO why did the way vasp builder did it not work here? + filtered_tasks = [] + for task in tasks: + for allowed_type in self.settings.CP2K_ALLOWED_TASK_TYPES: + if task.task_type is allowed_type: + filtered_tasks.append(task) + continue + + structures = [] + + for idx, task in enumerate(filtered_tasks): + s = task.output.structure + s.index: int = idx # type: ignore + s.remove_oxidation_states() + s.remove_spin() + structures.append(s) + + grouped_structures = group_structures( + structures, + ltol=self.settings.LTOL, + stol=self.settings.STOL, + angle_tol=self.settings.ANGLE_TOL, + symprec=self.settings.SYMPREC, + ) + + for group in grouped_structures: + grouped_tasks = [filtered_tasks[struc.index] for struc in group] # type: ignore + yield grouped_tasks diff --git a/emmet-builders/emmet/builders/cp2k/task_validator.py b/emmet-builders/emmet/builders/cp2k/task_validator.py new file mode 100644 index 0000000000..6ae2e4b684 --- /dev/null +++ b/emmet-builders/emmet/builders/cp2k/task_validator.py @@ -0,0 +1,63 @@ +from typing import Dict, Optional + +from maggma.builders import MapBuilder +from maggma.core import Store + +from emmet.builders.settings import EmmetBuildSettings +from emmet.core.cp2k.task import TaskDocument +from emmet.core.cp2k.validation import DeprecationMessage, ValidationDoc + + +class TaskValidator(MapBuilder): + def __init__( + self, + tasks: Store, + task_validation: Store, + settings: Optional[EmmetBuildSettings] = None, + query: Optional[Dict] = None, + **kwargs, + ): + """ + Creates task_types from tasks and type definitions + + Args: + tasks: Store of task documents + task_validation: Store of task_types for tasks + """ + self.tasks = tasks + self.task_validation = task_validation + self.settings = EmmetBuildSettings.autoload(settings) + self.query = query + self.kwargs = kwargs + + super().__init__( + source=tasks, + target=task_validation, + projection=[ + "input", + "output.forces", + "tags", + ], + query=query, + **kwargs, + ) + + def unary_function(self, item): + """ + Find the task_type for the item + + Args: + item (dict): a (projection of a) task doc + """ + task_doc = TaskDocument(**item) + validation_doc = ValidationDoc.from_task_doc( + task_doc=task_doc, + ) + + bad_tags = list(set(task_doc.tags).intersection(self.settings.DEPRECATED_TAGS)) + if len(bad_tags) > 0: + validation_doc.warnings.append(f"Manual Deprecation by tags: {bad_tags}") + validation_doc.valid = False + validation_doc.reasons.append(DeprecationMessage.MANUAL) + + return validation_doc diff --git a/emmet-builders/emmet/builders/cp2k/thermo.py b/emmet-builders/emmet/builders/cp2k/thermo.py new file mode 100644 index 0000000000..9c8345a018 --- /dev/null +++ b/emmet-builders/emmet/builders/cp2k/thermo.py @@ -0,0 +1,368 @@ +import warnings +from collections import defaultdict +from itertools import chain +from typing import Dict, Iterable, Iterator, List, Optional, Set +from math import ceil + +from maggma.core import Builder, Store +from maggma.utils import grouper +from monty.json import MontyDecoder +from pymatgen.analysis.phase_diagram import PhaseDiagramError +from pymatgen.entries.compatibility import MaterialsProject2020Compatibility +from pymatgen.entries.computed_entries import ComputedStructureEntry + +from emmet.builders.utils import chemsys_permutations +from emmet.core.thermo import ThermoDoc, PhaseDiagramDoc +from emmet.core.utils import jsanitize + + +class ThermoBuilder(Builder): + def __init__( + self, + materials: Store, + thermo: Store, + phase_diagram: Optional[Store] = None, + oxidation_states: Optional[Store] = None, + query: Optional[Dict] = None, + compatibility=None, + num_phase_diagram_eles: Optional[int] = None, + **kwargs, + ): + """ + Calculates thermodynamic quantities for materials from phase + diagram constructions + + Args: + materials (Store): Store of materials documents + thermo (Store): Store of thermodynamic data such as formation + energy and decomposition pathway + phase_diagram (Store): Store of phase diagram data for each unique chemical system + oxidation_states (Store): Store of oxidation state data to use in correction scheme application + query (dict): dictionary to limit materials to be analyzed + compatibility (PymatgenCompatability): Compatability module + to ensure energies are compatible + num_phase_diagram_eles (int): Maximum number of elements to use in phase diagram construction + for data within the separate phase_diagram collection + """ + + self.materials = materials + self.thermo = thermo + self.query = query if query else {} + self.compatibility = compatibility + self.oxidation_states = oxidation_states + self.phase_diagram = phase_diagram + self.num_phase_diagram_eles = num_phase_diagram_eles + self._completed_tasks: Set[str] = set() + self._entries_cache: Dict[str, List[ComputedStructureEntry]] = defaultdict(list) + + sources = [materials] + if oxidation_states is not None: + sources.append(oxidation_states) + + targets = [thermo] + if phase_diagram is not None: + targets.append(phase_diagram) + + super().__init__(sources=sources, targets=targets, **kwargs) + + def ensure_indexes(self): + """ + Ensures indicies on the tasks and materials collections + """ + + # Search index for materials + self.materials.ensure_index("material_id") + self.materials.ensure_index("chemsys") + self.materials.ensure_index("last_updated") + + # Search index for thermo + self.thermo.ensure_index("material_id") + self.thermo.ensure_index("last_updated") + + # Search index for phase_diagram + if self.phase_diagram: + self.phase_diagram.ensure_index("chemsys") + + def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover + updated_chemsys = self.get_updated_chemsys() + new_chemsys = self.get_new_chemsys() + + affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys) + + # Remove overlapping chemical systems + to_process_chemsys = set() + for chemsys in updated_chemsys | new_chemsys | affected_chemsys: + if chemsys not in to_process_chemsys: + to_process_chemsys |= chemsys_permutations(chemsys) + + N = ceil(len(to_process_chemsys) / number_splits) + + for chemsys_chunk in grouper(to_process_chemsys, N): + + yield {"query": {"chemsys": {"$in": list(chemsys_chunk)}}} + + def get_items(self) -> Iterator[List[Dict]]: + """ + Gets whole chemical systems of entries to process + """ + + self.logger.info("Thermo Builder Started") + + self.logger.info("Setting indexes") + self.ensure_indexes() + + updated_chemsys = self.get_updated_chemsys() + new_chemsys = self.get_new_chemsys() + + affected_chemsys = self.get_affected_chemsys(updated_chemsys | new_chemsys) + + # Remove overlapping chemical systems + processed = set() + to_process_chemsys = [] + for chemsys in sorted( + updated_chemsys | new_chemsys | affected_chemsys, + key=lambda x: len(x), + reverse=True, + ): + if chemsys not in processed: + processed |= chemsys_permutations(chemsys) + to_process_chemsys.append(chemsys) + + self.logger.info( + f"Found {len(to_process_chemsys)} chemical systems with new/updated materials to process" + ) + self.total = len(to_process_chemsys) + + # Yield the chemical systems in order of increasing size + # Will build them in a similar manner to fast Pourbaix + for chemsys in sorted( + to_process_chemsys, key=lambda x: len(x.split("-")), reverse=False + ): + entries = self.get_entries(chemsys) + yield entries + + def process_item(self, item: List[Dict]): + + if len(item) == 0: + return [] + + entries = [ComputedStructureEntry.from_dict(entry) for entry in item] + # determine chemsys + elements = sorted( + set([el.symbol for e in entries for el in e.composition.elements]) + ) + chemsys = "-".join(elements) + + self.logger.debug(f"Processing {len(entries)} entries for {chemsys}") + + material_entries: Dict[str, Dict[str, ComputedStructureEntry]] = defaultdict( + dict + ) + pd_entries = [] + for entry in entries: + material_entries[entry.entry_id][entry.data["run_type"]] = entry + + if self.compatibility: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="Failed to guess oxidation states.*" + ) + pd_entries = self.compatibility.process_entries(entries) + else: + pd_entries = entries + self.logger.debug(f"{len(pd_entries)} remain in {chemsys} after filtering") + + try: + docs, pd = ThermoDoc.from_entries(pd_entries, deprecated=False) + for doc in docs: + doc.entries = material_entries[doc.material_id] + doc.entry_types = list(material_entries[doc.material_id].keys()) + + pd_data = None + + if self.phase_diagram: + if ( + self.num_phase_diagram_eles is None + or len(elements) <= self.num_phase_diagram_eles + ): + pd_doc = PhaseDiagramDoc(chemsys=chemsys, phase_diagram=pd) + pd_data = jsanitize(pd_doc.dict(), allow_bson=True) + + docs_pd_pair = ( + jsanitize([d.dict() for d in docs], allow_bson=True), + [pd_data], + ) + + except PhaseDiagramError as p: + elsyms = [] + for e in entries: + elsyms.extend([el.symbol for el in e.composition.elements]) + + self.logger.warning( + f"Phase diagram error in chemsys {'-'.join(sorted(set(elsyms)))}: {p}" + ) + return [] + except Exception as e: + self.logger.error( + f"Got unexpected error while processing {[ent_.entry_id for ent_ in entries]}: {e}" + ) + return [] + + return docs_pd_pair + + def update_targets(self, items): + """ + Inserts the thermo and phase diagram docs into the thermo collection + Args: + items ([[tuple(List[dict],List[dict])]]): a list of list of thermo dictionaries to update + """ + + # print(len(items)) + thermo_docs = [item[0] for item in items if item] + phase_diagram_docs = [item[1] for item in items if item] + + # flatten out lists + thermo_docs = list(filter(None, chain.from_iterable(thermo_docs))) + phase_diagram_docs = list(filter(None, chain.from_iterable(phase_diagram_docs))) + + # Check if already updated this run + thermo_docs = [ + i for i in thermo_docs if i["material_id"] not in self._completed_tasks + ] + + self._completed_tasks |= {i["material_id"] for i in thermo_docs} + + for item in thermo_docs: + if isinstance(item["last_updated"], dict): + item["last_updated"] = MontyDecoder().process_decoded( + item["last_updated"] + ) + + if self.phase_diagram: + self.phase_diagram.update(phase_diagram_docs) + + if len(thermo_docs) > 0: + self.logger.info(f"Updating {len(thermo_docs)} thermo documents") + self.thermo.update(docs=thermo_docs, key=["material_id"]) + else: + self.logger.info("No thermo items to update") + + def get_entries(self, chemsys: str) -> List[Dict]: + """ + Gets a entries from the tasks collection for the corresponding chemical systems + Args: + chemsys(str): a chemical system represented by string elements seperated by a dash (-) + Returns: + set(ComputedEntry): a set of entries for this system + """ + + self.logger.info(f"Getting entries for: {chemsys}") + # First check the cache + all_chemsys = chemsys_permutations(chemsys) + cached_chemsys = all_chemsys & set(self._entries_cache.keys()) + query_chemsys = all_chemsys - cached_chemsys + all_entries = list( + chain.from_iterable(self._entries_cache[c] for c in cached_chemsys) + ) + + self.logger.debug( + f"Getting {len(cached_chemsys)} sub-chemsys from cache for {chemsys}" + ) + self.logger.debug( + f"Getting {len(query_chemsys)} sub-chemsys from DB for {chemsys}" + ) + + # Second grab the materials docs + new_q = dict(self.query) + new_q["chemsys"] = {"$in": list(query_chemsys)} + new_q["deprecated"] = False + materials_docs = list( + self.materials.query( + criteria=new_q, properties=["material_id", "entries", "deprecated"] + ) + ) + + # Get Oxidation state data for each material + oxi_states_data = {} + if self.oxidation_states: + material_ids = [t["material_id"] for t in materials_docs] + oxi_states_data = { + d["material_id"]: d.get("average_oxidation_states", {}) + for d in self.oxidation_states.query( + properties=["material_id", "average_oxidation_states"], + criteria={ + "material_id": {"$in": material_ids}, + "state": "successful", + }, + ) + } + + self.logger.debug( + f"Got {len(materials_docs)} entries from DB for {len(query_chemsys)} sub-chemsys for {chemsys}" + ) + + # Convert the entries into ComputedEntries and store + for doc in materials_docs: + for r_type, entry_dict in doc.get("entries", {}).items(): + entry_dict["data"]["oxidation_states"] = oxi_states_data.get( + entry_dict["entry_id"], {} + ) + entry_dict["data"]["run_type"] = r_type + elsyms = sorted(set([el for el in entry_dict["composition"]])) + self._entries_cache["-".join(elsyms)].append(entry_dict) + all_entries.append(entry_dict) + + self.logger.info(f"Total entries in {chemsys} : {len(all_entries)}") + + return all_entries + + def get_updated_chemsys(self,) -> Set: + """Gets updated chemical system as defined by the updating of an existing material""" + + updated_mats = self.thermo.newer_in(self.materials, criteria=self.query) + updated_chemsys = set( + self.materials.distinct( + "chemsys", {"material_id": {"$in": list(updated_mats)}, **self.query} + ) + ) + self.logger.debug(f"Found {len(updated_chemsys)} updated chemical systems") + + return updated_chemsys + + def get_new_chemsys(self) -> Set: + """Gets newer chemical system as defined by introduction of a new material""" + + # All materials that are not present in the thermo collection + thermo_mat_ids = self.thermo.distinct("material_id") + mat_ids = self.materials.distinct("material_id", self.query) + dif_task_ids = list(set(mat_ids) - set(thermo_mat_ids)) + q = {"material_id": {"$in": dif_task_ids}} + new_mat_chemsys = set(self.materials.distinct("chemsys", q)) + self.logger.debug(f"Found {len(new_mat_chemsys)} new chemical systems") + + return new_mat_chemsys + + def get_affected_chemsys(self, chemical_systems: Set) -> Set: + """Gets chemical systems affected by changes in the supplied chemical systems""" + # First get all chemsys with any of the elements we've marked + affected_chemsys = set() + affected_els = list({el for c in chemical_systems for el in c.split("-")}) + possible_affected_chemsys = self.materials.distinct( + "chemsys", {"elements": {"$in": affected_els}} + ) + + sub_chemsys = defaultdict(list) + # Build a dictionary mapping sub_chemsys to all super_chemsys + for chemsys in possible_affected_chemsys: + for permutation in chemsys_permutations(chemsys): + sub_chemsys[permutation].append(chemsys) + + # Select and merge distinct super chemsys from sub_chemsys + for chemsys in chemical_systems: + affected_chemsys |= set(sub_chemsys[chemsys]) + + self.logger.debug( + f"Found {len(affected_chemsys)} chemical systems affected by this build" + ) + + return affected_chemsys diff --git a/emmet-builders/emmet/builders/cp2k/utils.py b/emmet-builders/emmet/builders/cp2k/utils.py new file mode 100644 index 0000000000..e3bd98c8a3 --- /dev/null +++ b/emmet-builders/emmet/builders/cp2k/utils.py @@ -0,0 +1,113 @@ +from typing import List +from copy import deepcopy +import numpy as np +from pymatgen.ext.matproj import MPRester +from pymatgen.symmetry.analyzer import SpacegroupAnalyzer +from pymatgen.analysis.structure_matcher import StructureMatcher + + +def unpack(query, d): + if not query: + return d + if isinstance(d, List): + return unpack(query[1:], d.__getitem__(int(query.pop(0)))) + return unpack(query[1:], d.__getitem__(query.pop(0))) + + +# TODO Move polaron compare to a different function to make validation easier +# TODO Matching should have have an easier way of setting distance tolerance + # can't this all be done inside of structure matcher to find mapping or something? +def matcher(bulk_struc, defect_struc, final_bulk_struc=None, final_defect_struc=None): + matching_indices = [] + dis = [] + + for j in range(len(defect_struc)): + for i in range(len(bulk_struc)): + if ( + bulk_struc[i].species == defect_struc[j].species + and bulk_struc[i].distance(defect_struc[j]) < .02 + and bulk_struc[i].properties.get('ghost', False) == defect_struc[j].properties.get('ghost', False) + ): + matching_indices.append((i, j)) + dis.append(j) + break + + def_index = list(set(range(len(defect_struc))).difference(set(dis))) + + # Consider a possible polaron + if len(def_index) == 0: + matching_indices = [] + + for j in range(len(defect_struc)): + for i in range(len(bulk_struc)): + if ( + bulk_struc[i].specie.as_dict().get('element') == defect_struc[j].specie.as_dict().get('element') + and bulk_struc[i].distance(defect_struc[j]) < .02 + and bulk_struc[i].properties.get('ghost', False) == defect_struc[j].properties.get('ghost', + False)): + matching_indices.append((i, j)) + break + + oxi_diff = [ + abs( + final_defect_struc.site_properties['oxidation_state'][d] - + final_bulk_struc.site_properties['oxidation_state'][b] + ) + for b, d in matching_indices + ] + def_index = np.argmax(oxi_diff) + matching_indices.pop(def_index) + return def_index, matching_indices + + elif len(def_index) > 1: + print(def_index) + raise ValueError("The provided defect structure and bulk structure " + "have more than one potential defect site") + + return def_index[0], matching_indices + + +def get_dielectric(mpid): + with MPRester() as mp: + dat = mp.get_data(mpid, prop='diel') + band_gap = mp.get_data(mpid, prop='band_gap')[0]['band_gap'] + if band_gap == 0.0: + return np.inf + try: + return dat[0]['diel']['e_total'] + except: + return None + +def get_mpid(s): + struc = deepcopy(s) + struc.remove_oxidation_states() + struc.remove_spin() + for p in struc.site_properties: + struc.remove_site_property(p) + sga = SpacegroupAnalyzer(struc) + sm = StructureMatcher() + with MPRester() as mp: + dat = mp.query( + criteria={ + 'chemsys': struc.composition.chemical_system, + 'spacegroup.symbol': sga.get_space_group_symbol() + }, + properties=['material_id', 'structure'] + ) + dat = list(filter(lambda x: sm.fit(s, x['structure']), dat)) + if len(dat) == 1: + return dat[0]['material_id'] + return None + + +def synchronous_query(store1, store2, query, properties=None): + for t in store1.query(query, properties): + for key in t.keys(): + if isinstance(t[key], dict) and t[key].get('gfs_id'): + t.update( + { + k: v for k,v in next(store2.query({"_id": t[key].get('gfs_id')}, properties)).items() + if properties is None or k in properties + } + ) + yield t diff --git a/emmet-builders/emmet/builders/materials/dielectric.py b/emmet-builders/emmet/builders/materials/dielectric.py index 68d9a8aa86..262dbcf5ae 100644 --- a/emmet-builders/emmet/builders/materials/dielectric.py +++ b/emmet-builders/emmet/builders/materials/dielectric.py @@ -74,7 +74,6 @@ def get_items(self): for mat in mats: doc = self._get_processed_doc(mat) - if doc is not None: yield doc else: @@ -151,7 +150,7 @@ def _get_processed_doc(self, mat): "output.epsilon_ionic", "output.bandgap", ], - criteria={self.tasks.key: str(task_id)}, + criteria={self.tasks.key: int(task_id)}, ) if task_query["output"]["bandgap"] > 0: diff --git a/emmet-builders/emmet/builders/materials/electronic_structure.py b/emmet-builders/emmet/builders/materials/electronic_structure.py index e84d6aa1e0..8bc99ff18b 100644 --- a/emmet-builders/emmet/builders/materials/electronic_structure.py +++ b/emmet-builders/emmet/builders/materials/electronic_structure.py @@ -17,7 +17,7 @@ from emmet.core.settings import EmmetSettings from emmet.core.electronic_structure import ElectronicStructureDoc from emmet.core.utils import jsanitize - +import bson SETTINGS = EmmetSettings() @@ -367,7 +367,6 @@ def _update_materials_doc(self, mat_id): if "NSCF Line" in mat["task_types"][task_id]: bs_type = None - task_query = self.tasks.query_one( properties=[ "calcs_reversed", @@ -378,7 +377,7 @@ def _update_materials_doc(self, mat_id): "input.parameters", "output.structure", ], - criteria={"task_id": str(task_id)}, + criteria={"task_id": int(task_id)}, ) fs_id = str(task_query["calcs_reversed"][0]["bandstructure_fs_id"]) @@ -441,7 +440,7 @@ def _update_materials_doc(self, mat_id): "input.parameters", "output.structure", ], - criteria={"task_id": str(task_id)}, + criteria={"task_id": int(task_id)}, ) fs_id = str(task_query["calcs_reversed"][0]["dos_fs_id"]) @@ -493,7 +492,7 @@ def _update_materials_doc(self, mat_id): "calcs_reversed", "output.structure", ], - criteria={"task_id": str(task_id)}, + criteria={"task_id": int(task_id)}, ) structure = Structure.from_dict(task_query["output"]["structure"]) @@ -566,11 +565,11 @@ def _obtain_blessed_calculations( ] bs_obj = self.bandstructure_fs.query_one( - criteria={"fs_id": sorted_bs_data[0]["fs_id"]} + criteria={"_id": bson.ObjectId(sorted_bs_data[0]["fs_id"])} ) materials_doc["bandstructure"][bs_type]["object"] = ( - bs_obj["data"] if bs_obj is not None else None + bs_obj if bs_obj is not None else None ) materials_doc["bandstructure"][bs_type][ @@ -593,12 +592,11 @@ def _obtain_blessed_calculations( materials_doc["dos"]["task_id"] = sorted_dos_data[0]["task_id"] materials_doc["dos"]["lmaxmix"] = sorted_dos_data[0]["lmaxmix"] - dos_obj = self.dos_fs.query_one( - criteria={"fs_id": sorted_dos_data[0]["fs_id"]} + criteria={"_id": bson.ObjectId(sorted_dos_data[0]["fs_id"])} ) materials_doc["dos"]["object"] = ( - dos_obj["data"] if dos_obj is not None else None + dos_obj if dos_obj is not None else None ) materials_doc["dos"]["output_structure"] = sorted_dos_data[0][ diff --git a/emmet-builders/emmet/builders/materials/materials.py b/emmet-builders/emmet/builders/materials/materials.py new file mode 100644 index 0000000000..322bdc1017 --- /dev/null +++ b/emmet-builders/emmet/builders/materials/materials.py @@ -0,0 +1,336 @@ +""" +Unified materials builder for tasks docs from different calc codes. +""" + +from datetime import datetime +from itertools import chain, groupby +import itertools +from math import ceil +from typing import Dict, Iterable, Iterator, List, Optional, Union + +from maggma.builders import Builder +from maggma.stores import Store +from maggma.utils import grouper + +from emmet.builders.settings import EmmetBuildSettings +from emmet.core.utils import group_structures, jsanitize +from pymatgen.core import Structure + +__author__ = "Nicholas Winner " + +SETTINGS = EmmetBuildSettings() + +class MaterialsBuilder(Builder): + """ + The Materials Builder matches VASP task documents by structure similarity into materials + document. The purpose of this builder is group calculations and determine the best structure. + All other properties are derived from other builders. + + The process is as follows: + + 1.) Find all documents with the same formula + 2.) Select only task documents for the task_types we can select properties from + 3.) Aggregate task documents based on strucutre similarity + 4.) Create a MaterialDoc from the group of task documents + 5.) Validate material document + + """ + + def __init__( + self, + tasks: Union[Store, Iterable[Store]], + materials: Store, + task_validation: Store, + query: Optional[Dict] = None, + settings: Optional[EmmetBuildSettings] = None, + **kwargs, + ): + """ + Args: + tasks: Store of task documents + materials: Store of materials documents to generate + task_validation: Store for storing task validation results + query: dictionary to limit tasks to be analyzed + settings: EmmetSettings to use in the build process + """ + + self.tasks = tasks if isinstance(tasks, Iterable) else [tasks] + self.materials = materials + self.task_validation = task_validation + self.query = query if query else {} + self.settings = EmmetBuildSettings.autoload(settings) + + if len(set(t.key for t in self.tasks)) != 1: + raise ValueError("All task stores must have the same key") + + self._tasks_key = self.tasks[0].key + self._TASK_DOCS = dict() + self._MAT_DOCS = dict() + for calc_code in self.settings.ALLOWED_CALC_CODES: + mod = __import__(f"emmet.core.{calc_code.lower()}.task", globals(), locals(), ["TaskDocument"], 0) + self._TASK_DOCS[calc_code] = getattr(mod, "TaskDocument") + mod = __import__(f"emmet.core.{calc_code.lower()}.material", globals(), locals(), ["MaterialsDoc"], 0) + self._MAT_DOCS[calc_code] = getattr(mod, "MaterialsDoc") + + self.kwargs = kwargs + + sources = self.tasks + [self.task_validation] + super().__init__(sources=sources, targets=[materials], **kwargs) + + def _get_allowed_task_types(self, calc_code): + if calc_code == 'vasp': + return self.settings.VASP_ALLOWED_VASP_TYPES + if calc_code == 'cp2k': + return self.settings.CP2K_ALLOWED_CP2K_TYPES + + @property + def tasks_key(self): + return self._tasks_key + + @property + def task_docs(self): + """ + Returns: + dict: A dictionary of task document classes + """ + return self._TASK_DOCS + + @property + def mat_docs(self): + """ + Returns: + dict: A dictionary of material document classes + """ + return self._MAT_DOCS + + def ensure_indexes(self): + """ + Ensures indicies on the tasks and materials collections + """ + + for tasks in self.tasks: + # Basic search index for tasks + tasks.ensure_index("task_id") + tasks.ensure_index("last_updated") + tasks.ensure_index("state") + tasks.ensure_index("formula_pretty") + + # Search index for materials + self.materials.ensure_index("material_id") + self.materials.ensure_index("last_updated") + self.materials.ensure_index("task_ids") + + if self.task_validation: + self.task_validation.ensure_index("task_id") + self.task_validation.ensure_index("valid") + + def prechunk(self, number_splits: int) -> Iterable[Dict]: # pragma: no cover + """Prechunk the materials builder for distributed computation""" + temp_query = dict(self.query) + temp_query["state"] = "successful" + if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0: + temp_query["$and"] = [ + {"tags": {"$in": self.settings.BUILD_TAGS}}, + {"tags": {"$nin": self.settings.EXCLUDED_TAGS}}, + ] + elif len(self.settings.BUILD_TAGS) > 0: + temp_query["tags"] = {"$in": self.settings.BUILD_TAGS} + + self.logger.info("Finding tasks to process") + all_tasks = list( + self.tasks.query(temp_query, [self.tasks.key, "formula_pretty"]) + ) + + processed_tasks = set(self.materials.distinct("task_ids")) + to_process_tasks = {d[self.tasks.key] for d in all_tasks} - processed_tasks + to_process_forms = { + d["formula_pretty"] + for d in all_tasks + if d[self.tasks.key] in to_process_tasks + } + + N = ceil(len(to_process_forms) / number_splits) + + for formula_chunk in grouper(to_process_forms, N): + + yield {"query": {"formula_pretty": {"$in": list(formula_chunk)}}} + + def get_items(self) -> Iterator[List[Dict]]: + """ + Gets all items to process into materials documents. + This does no datetime checking; relying on on whether + task_ids are included in the Materials Colection + + Returns: + generator or list relevant tasks and materials to process into materials documents + """ + + self.logger.info("Materials builder started") + for t in self.settings.ALLOWED_CALC_CODES: + tt = getattr(self.settings, f"{t.upper()}_ALLOWED_TASK_TYPES") + self.logger.info( + f"Allowed {t} task types: {[task_type.value for task_type in tt]}" + ) + + self.logger.info("Setting indexes") + self.ensure_indexes() + + # Save timestamp to mark buildtime for material documents + self.timestamp = datetime.utcnow() + + # Get all processed tasks: + temp_query = dict(self.query) + temp_query["state"] = "successful" + if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0: + temp_query["$and"] = [ + {"tags": {"$in": self.settings.BUILD_TAGS}}, + {"tags": {"$nin": self.settings.EXCLUDED_TAGS}}, + ] + elif len(self.settings.BUILD_TAGS) > 0: + temp_query["tags"] = {"$in": self.settings.BUILD_TAGS} + + self.logger.info("Finding tasks to process") + self.logger.info(self.tasks) + all_tasks = [] + for tasks in self.tasks: + all_tasks.extend( + [t for t in tasks.query(temp_query, [self.tasks_key, "formula_pretty"])] + ) + processed_tasks = set(self.materials.distinct("task_ids")) + to_process_tasks = {d[self.tasks_key] for d in all_tasks} - processed_tasks + to_process_forms = { + d["formula_pretty"] + for d in all_tasks + if d[self.tasks_key] in to_process_tasks + } + + self.logger.info(f"Found {len(to_process_tasks)} unprocessed tasks") + self.logger.info(f"Found {len(to_process_forms)} unprocessed formulas") + + # Set total for builder bars to have a total + self.total = len(to_process_forms) + + validation = { + doc[self.tasks_key]: {'valid': doc["valid"], 'calc_code': doc['calc_code']} + for doc in self.task_validation.query( + criteria={"calc_code": {"$exists": True}, "valid": {"$exists": True}}, + properties=[self.task_validation.key, "valid", "calc_code"], + ) + } + + for formula in to_process_forms: + tasks_query = dict(temp_query) + tasks_query["formula_pretty"] = formula + tasks = [] + for t in self.tasks: + tasks.extend( + [tsk for tsk in t.query(criteria=tasks_query, properties=None)] + ) + + for t in tasks: + t["is_valid"] = validation[t[self.tasks_key]]['valid'] + t["calc_code"] = validation[t[self.tasks_key]]['calc_code'] + + yield tasks + + def process_item(self, items: List[Dict]) -> List[Dict]: + """ + Process the tasks into a list of materials + + Args: + tasks [dict] : a list of task docs + + Returns: + ([dict],list) : a list of new materials docs and a list of task_ids that were processsed + """ + + materials = [] + for structure_group in self.group_tasks(items): + materials.append([]) + sorted_structures = sorted(structure_group, key=lambda x: x['calc_code']) + self.logger.debug(f"Processing {len(sorted_structures)} tasks for a structure") + + for key, task_group in groupby(sorted_structures, key=lambda x: x["calc_code"]): + self.logger.debug(f"Processing tasks for the calc_code {key}") + + tasks = [self.task_docs[key.upper()](**task) for task in task_group] + formula = tasks[0].formula_pretty + task_ids = [task.task_id for task in tasks] + self.logger.debug(f"Processing {formula} : {task_ids}") + + try: + doc = self.mat_docs[key.upper()].from_tasks( + tasks, + quality_scores=self.settings.VASP_QUALITY_SCORES, + use_statics=self.settings.VASP_USE_STATICS, + ) + materials[-1].append(doc) + + except Exception as e: + failed_ids = list({t_.task_id for t_ in tasks}) + doc = self.mat_docs[key.upper()].construct_deprecated_material(tasks) + doc.warnings.append(str(e)) + materials[-1].append(doc) + self.logger.warn( + f"Failed making material for {failed_ids}." + f" Inserted as deprecated Material: {doc.material_id}" + ) + + if key == 'vasp': + material_id = doc.material_id + + for m in materials[-1]: + m.material_id = material_id + + self.logger.debug(f"Produced {len(materials)} materials for {formula}") + mats = list(itertools.chain.from_iterable(materials)) + return jsanitize([m.dict() for m in mats], allow_bson=True) + + def update_targets(self, items: List[List[Dict]]): + """ + Inserts the new task_types into the task_types collection + + Args: + items ([([dict],[int])]): A list of tuples of materials to update and the corresponding + processed task_ids + """ + + docs = list(chain.from_iterable(items)) # type: ignore + + for item in docs: + item.update({"_bt": self.timestamp}) + + material_ids = list({item["material_id"] for item in docs}) + + if len(items) > 0: + self.logger.info(f"Updating {len(docs)} materials") + self.materials.remove_docs({self.materials.key: {"$in": material_ids}}) + self.materials.update( + docs=docs, key=["material_id"], + ) + else: + self.logger.info("No items to update") + + def group_tasks( + self, tasks: List + ) -> Iterator[List]: + """ + Groups tasks by structure matching + """ + structures = [] + + for idx, task in enumerate(tasks): + s = Structure.from_dict(task['output']['structure']) + s.index: int = idx # type: ignore + structures.append(s) + + grouped_structures = group_structures( + structures, + ltol=self.settings.LTOL, + stol=self.settings.STOL, + angle_tol=self.settings.ANGLE_TOL, + symprec=self.settings.SYMPREC, + ) + for group in grouped_structures: + grouped_tasks = [tasks[struc.index] for struc in group] # type: ignore + yield grouped_tasks diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py index 83f48c4876..9cfadbad50 100644 --- a/emmet-builders/emmet/builders/settings.py +++ b/emmet-builders/emmet/builders/settings.py @@ -9,6 +9,7 @@ from emmet.core.settings import EmmetSettings from emmet.core.vasp.calc_types import TaskType as VaspTaskType from emmet.core.qchem.calc_types import TaskType as QChemTaskType +from emmet.core.cp2k.calc_types import TaskType as Cp2kTaskType class EmmetBuildSettings(EmmetSettings): @@ -30,7 +31,12 @@ class EmmetBuildSettings(EmmetSettings): [], description="Tags for calculations to deprecate" ) - VASP_ALLOWED_VASP_TYPES: List[VaspTaskType] = Field( + ALLOWED_CALC_CODES: List[str] = Field( + ['VASP', 'CP2K'], + description="Calculation codes to allow in the build pipeline" + ) + + VASP_ALLOWED_TASK_TYPES: List[VaspTaskType] = Field( [t.value for t in VaspTaskType], description="Allowed task_types to build materials from", ) @@ -40,6 +46,11 @@ class EmmetBuildSettings(EmmetSettings): description="Allowed task_types to build molecules from", ) + CP2K_ALLOWED_TASK_TYPES: List[Cp2kTaskType] = Field( + [t.value for t in Cp2kTaskType], + description="Allowed CP2K task_types to build materials from", + ) + DEFAULT_REFERENCE: str = Field( "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and " "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and " diff --git a/emmet-builders/emmet/builders/vasp/defects.py b/emmet-builders/emmet/builders/vasp/defects.py new file mode 100644 index 0000000000..09424127e2 --- /dev/null +++ b/emmet-builders/emmet/builders/vasp/defects.py @@ -0,0 +1,936 @@ +""" +This module contains 2 builders for defect properties in non-metals. + +1. The DefectBuilder builds individual Defect documents + with bulk and dielectric properties included, along + with properties neccessary for accessing delocalization + and parsing further defect thermodynamics. +2. The DefectThermoBuilder builds DefectPhaseDiagram + documents from defect objects with identical bulk structures + and calculation metadata. +""" + +from datetime import datetime +import numpy as np + +from monty.json import MontyDecoder, jsanitize + +from pymatgen import Structure, MPRester, PeriodicSite +from pymatgen.analysis.structure_matcher import StructureMatcher, PointDefectComparator +from pymatgen.electronic_structure.bandstructure import BandStructure +from pymatgen.analysis.defects.core import Vacancy, Substitution, Interstitial, DefectEntry +from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram +from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility + +from maggma.builder import Builder + + +__author__ = "Danny Broberg, Shyam Dwaraknath" + + +class DefectBuilder(Builder): + def __init__(self, + tasks, + defects, + query=None, + compatibility=DefectCompatibility(), + ltol=0.2, + stol=0.3, + angle_tol=5, + max_items_size=0, + update_all=False, + **kwargs): + """ + Creates DefectEntry from vasp task docs + + Args: + tasks (Store): Store of tasks documents + defects (Store): Store of defect entries with all metadata required for followup decisions on defect thermo + query (dict): dictionary to limit materials to be analyzed + compatibility (DefectCompatability): Compatability module to ensure defect calculations are compatible + ltol (float): StructureMatcher tuning parameter for matching tasks to materials + stol (float): StructureMatcher tuning parameter for matching tasks to materials + angle_tol (float): StructureMatcher tuning parameter for matching tasks to materials + max_items_size (int): limits number of items approached from tasks (zero places no limit on number of items) + update_all (bool): Whether to consider all task ids from defects store. + Default is False (will not re-consider task-ids which have been considered previously. + """ + + self.tasks = tasks + self.defects = defects + self.query = query if query else {} + self.compatibility = compatibility + self.ltol = ltol + self.stol = stol + self.angle_tol = angle_tol + self.max_items_size = max_items_size + self.update_all=update_all + super().__init__(sources=[tasks], targets=[defects], **kwargs) + + def get_items(self): + """ + Gets sets of entries from chemical systems that need to be processed + + Returns: + generator of relevant entries from one chemical system + + """ + self.logger.info("Defect Builder Started") + + self.logger.info("Setting indexes") + self.ensure_indicies() + + # Save timestamp for update operation + self.time_stamp = datetime.utcnow() + + # Get all successful defect tasks that have been updated since + # defect_store was last updated + q = dict(self.query) + q["state"] = "successful" + if not self.update_all: + if 'task_id' in q: + if isinstance(q['task_id'], int) or isinstance(q['task_id'], float): + q['task_id'] = {'$nin': [], '$in': [q['task_id']]} + if '$nin' in q['task_id']: + q['task_id']['$nin'].extend( self.defects.distinct('entry_id')) + else: + q['task_id'].update( {'$nin': self.defects.distinct('entry_id')}) + else: + q.update({'task_id': {'$nin': self.defects.distinct('entry_id')}}) + + q.update({'transformations.history.@module': + {'$in': ['pymatgen.transformations.defect_transformations']}}) + defect_tasks = list(self.tasks.query(criteria=q, + properties=['task_id', 'transformations.history.0.defect.structure'])) + if self.max_items_size and len(defect_tasks) > self.max_items_size: + defect_tasks = [dtask for dind, dtask in enumerate(defect_tasks) if dind < self.max_items_size] + task_ids = [dtask['task_id'] for dtask in defect_tasks] + self.logger.info("Found {} new defect tasks to consider:\n{}".format( len(defect_tasks), task_ids)) + + # get a few other tasks which are needed for defect entries (regardless of when they were last updated): + # bulk_supercell, dielectric calc, BS calc, HSE-BS calc + bulksc = {"state": "successful", 'transformations.history.0.@module': + {'$in': ['pymatgen.transformations.standard_transformations']}} + dielq = {"state": "successful", "input.incar.LEPSILON": True, "input.incar.LPEAD": True} + HSE_BSq = {"state": "successful", 'calcs_reversed.0.input.incar.LHFCALC': True, + 'transformations.history.0.@module': + {'$nin': ['pymatgen.transformations.defect_transformations', + 'pymatgen.transformations.standard_transformations']}} + # TODO: add smarter capability for getting HSE bandstructure from tasks + # TODO: add capability for getting GGA bandstructure from tasks? + + # now load up all defect tasks with relevant information required for process_item step + # includes querying bulk, diel and hybrid calcs as you go along. + log_additional_tasks = dict() #to minimize number of bulk + diel + hse queries, log by chemsys + + needed_defect_properties = ['task_id', 'transformations', 'input', + 'task_label', 'last_updated', + 'output', 'calcs_reversed', 'chemsys'] + needed_bulk_properties = ['task_id', 'chemsys', 'task_label', + 'last_updated', 'transformations', + 'input', 'output', 'calcs_reversed'] + for d_task_id in task_ids: + d_task = list(self.tasks.query(criteria={"task_id": d_task_id}, + properties=needed_defect_properties))[0] + chemsys = "-".join(sorted((Structure.from_dict( + d_task['transformations']['history'][0]['defect']['structure']).symbol_set))) + + if chemsys not in log_additional_tasks.keys(): + #grab all bulk calcs for chemsys + q = bulksc.copy() + q.update( {"chemsys": chemsys}) + bulk_tasks = list(self.tasks.query(criteria=q, properties=needed_bulk_properties)) + + #grab all diel calcs for chemsys + q = dielq.copy() + q.update( {"chemsys": chemsys}) + diel_tasks = list(self.tasks.query(criteria=q, + properties=['task_id', 'task_label', 'last_updated', + 'input', 'output'])) + + #grab all hse bs calcs for chemsys + q = HSE_BSq.copy() + q.update( {"chemsys": chemsys}) + hybrid_tasks = list(self.tasks.query(criteria=q, + properties=['task_id', 'input', + 'output', 'task_label'])) + + self.logger.debug("\t{} has {} bulk loaded {} diel and {} hse" + "".format( chemsys, len(bulk_tasks), + len(diel_tasks), len(hybrid_tasks))) + + log_additional_tasks.update({chemsys: {'bulksc': bulk_tasks[:], + 'diel': diel_tasks[:], + 'hsebs': hybrid_tasks[:]}}) + + yield [d_task, log_additional_tasks[chemsys]] + + def process_item(self, item): + """ + Process a defect item (containing defect, bulk and dielectric information as processed in get_items) + + Args: + item (defect_task): a defect_task to process into a DefectEntry object + + Returns: + dict: a DefectEntry dictionary to update defect database with + """ + d_task, chemsys_additional_tasks = item + + defect_task = self.find_and_load_bulk_tasks(d_task, chemsys_additional_tasks) + if defect_task is None: + self.logger.error("Could not determine defect bulk properties for {}".format( d_task['task_id'])) + return + elif 'bulk_task' not in defect_task.keys(): + self.logger.error("bulk_task is not in item! Cannot parse task id = {}.".format( d_task['task_id'])) + return + elif 'diel_task_meta' not in defect_task.keys(): + self.logger.error("diel_task_meta is not in item! Cannot parse task id = {}.".format( d_task['task_id'])) + return + + #initialize parameters with dielectric data + eps_ionic = defect_task['diel_task_meta']['epsilon_ionic'] + eps_static = defect_task['diel_task_meta']['epsilon_static'] + eps_total = [] + for i in range(3): + eps_total.append([e[0]+e[1] for e in zip(eps_ionic[i], eps_static[i])]) + parameters = {'epsilon_ionic': eps_ionic, 'epsilon_static': eps_static, + 'dielectric': eps_total, + 'task_level_metadata': + {'diel_taskdb_task_id': defect_task['diel_task_meta']['diel_taskdb_task_id']}} + + parameters = self.get_run_metadata( defect_task, parameters) + + # add bulk data to parameters + bulk_task = defect_task['bulk_task'] + bulk_energy = bulk_task['output']['energy'] + bulk_sc_structure = Structure.from_dict( bulk_task['input']['structure']) + parameters.update( {'bulk_energy': bulk_energy, 'bulk_sc_structure': bulk_sc_structure}) + + parameters = self.get_bulk_gap_data( bulk_task, parameters) + parameters = self.get_bulk_chg_correction_metadata( bulk_task, parameters) + + + # Add defect data to parameters + defect_energy = defect_task['output']['energy'] + parameters.update({'defect_energy': defect_energy}) + + defect, parameters = self.load_defect_and_structure_data( defect_task, parameters) + + parameters = self.load_defect_chg_correction_metadata( defect, defect_task, parameters) + + if 'vr_eigenvalue_dict' in defect_task['calcs_reversed'][0]['output'].keys(): + eigenvalues = defect_task['calcs_reversed'][0]['output']['vr_eigenvalue_dict']['eigenvalues'] + kpoint_weights = defect_task['calcs_reversed'][0]['output']['vr_eigenvalue_dict']['kpoint_weights'] + parameters.update( {'eigenvalues': eigenvalues, + 'kpoint_weights': kpoint_weights} ) + else: + self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have eigenvalue data for parsing ' + 'bandfilling.'.format(defect_task['task_label'], defect_task['task_id'])) + + if 'defect' in defect_task['calcs_reversed'][0]['output'].keys(): + parameters.update( {'defect_ks_delocal_data': + defect_task['calcs_reversed'][0]['output']['defect'].copy()}) + else: + self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have defect data for parsing ' + 'delocalization.'.format(defect_task['task_label'], defect_task['task_id'])) + + defect_entry = DefectEntry( defect, parameters['defect_energy'] - parameters['bulk_energy'], + corrections = {}, parameters = parameters, entry_id= defect_task['task_id']) + + defect_entry = self.compatibility.process_entry( defect_entry) + defect_entry.parameters = jsanitize( defect_entry.parameters, strict=True, allow_bson=True) + defect_entry_as_dict = defect_entry.as_dict() + defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id'] #this seemed neccessary for legacy db + + return defect_entry_as_dict + + def update_targets(self, items): + """ + Inserts the defect docs into the defect collection + + Args: + items ([dict]): a list of defect entries as dictionaries + """ + items = [item for item in items if item] + self.logger.info("Updating {} defect documents".format(len(items))) + + self.defects.update(items, update_lu=True, key='entry_id') + + def ensure_indicies(self): + """ + Ensures indicies on the tasks and defects collections + :return: + """ + # Search indicies for tasks + self.tasks.ensure_index(self.tasks.key, unique=True) + self.tasks.ensure_index("chemsys") + + # Search indicies for defects + self.defects.ensure_index(self.defects.key, unique=True) + self.defects.ensure_index("chemsys") + + def find_and_load_bulk_tasks(self, defect_task, additional_tasks): + """ + This takes defect_task and finds bulk task, diel task, and other things as appropriate (example hse_data...) + + needs to make sure INCAR settings and structure matching works for all + + :param defect_task: + :param additional_tasks: + :return: + """ + out_defect_task = defect_task.copy() + + # get suitable BULK calc by matching structures (right lattice + # constant + same supercell size...) and matching essential INCAR + POTCAR settings... + bulk_tasks = additional_tasks['bulksc'] + bulk_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol, + primitive_cell=False, scale=False, attempt_supercell=False, allow_subset=False) + bulk_matched = [] + dstruct_withoutdefect = Structure.from_dict(out_defect_task['transformations']['history'][0]['defect']['structure']) + scaling_matrix = out_defect_task['transformations']['history'][0]['scaling_matrix'] + dstruct_withoutdefect.make_supercell( scaling_matrix) + dincar = out_defect_task["calcs_reversed"][0]["input"]["incar"] #identify essential INCAR properties which differentiate different calcs + dincar_reduced = {k: dincar.get(k, None) if dincar.get(k) not in ['None', 'False', False] else None + for k in ["LHFCALC", "HFSCREEN", "IVDW", "LUSE_VDW", "LDAU", "METAGGA"] + } + d_potcar_base = {'pot_spec': [potelt["titel"] for potelt in out_defect_task['input']['potcar_spec']], + 'pot_labels': out_defect_task['input']['pseudo_potential']['labels'][:], + 'pot_type': out_defect_task['input']['pseudo_potential']['pot_type'], + 'functional': out_defect_task['input']['pseudo_potential']['functional']} + + for b_task in bulk_tasks: + bstruct = Structure.from_dict(b_task['input']['structure']) + if bulk_sm.fit( bstruct, dstruct_withoutdefect): + #also match essential INCAR and POTCAR settings + bincar = b_task["calcs_reversed"][0]["input"]["incar"] + bincar_reduced = {k: bincar.get(k, None) if bincar.get(k) not in ['None', 'False', False] else None + for k in dincar_reduced.keys() + } + + b_potcar = {'pot_spec': set([potelt["titel"] for potelt in b_task['input']['potcar_spec']]), + 'pot_labels': set(b_task['input']['pseudo_potential']['labels'][:]), + 'pot_type': b_task['input']['pseudo_potential']['pot_type'], + 'functional': b_task['input']['pseudo_potential']['functional']} + d_potcar = d_potcar_base.copy() #need to reduce in cases of extrinsic species or reordering + d_potcar['pot_spec'] = set([d for d in d_potcar['pot_spec'] if d in b_potcar['pot_spec']]) + d_potcar['pot_labels'] = set([d for d in d_potcar['pot_labels'] if d in b_potcar['pot_labels']]) + + #track to make sure that cartesian coords are same (important for several levels of analysis in defect builder) + same_cart_positions = True + for bsite_coords in bstruct.cart_coords: + if not len( dstruct_withoutdefect.get_sites_in_sphere(bsite_coords, .1)): + same_cart_positions = False + + if bincar_reduced == dincar_reduced and b_potcar == d_potcar and same_cart_positions: + bulk_matched.append( b_task.copy()) + else: + try: + self.logger.debug("Bulk structure match was found for {} with {}, " + "but:".format( b_task['task_label'], out_defect_task['task_label'])) + except: + self.logger.debug("BULK STRUCT MATCH was found, but task_label did not exist AND:") + if bincar_reduced != dincar_reduced: + out_inc = {k:[v, bincar_reduced[k]] for k,v in dincar_reduced.items() if v != bincar_reduced[k]} + self.logger.debug("\tIncars were different: {} ".format( out_inc)) + if b_potcar != d_potcar: + out_pot = {k:[v, b_potcar[k]] for k,v in d_potcar.items() if v != b_potcar[k]} + self.logger.debug("\tPotcar specs were different: {} ".format( out_pot)) + if not same_cart_positions: + self.logger.debug("\tBulk site coords were different") + + #if bulk_task found then take most recently updated bulk_task for defect + if len( bulk_matched): + bulk_matched = sorted( bulk_matched, key=lambda x: x["last_updated"], reverse=True) + out_defect_task["bulk_task"] = bulk_matched[0].copy() + self.logger.debug("Found {} possible bulk supercell structures. Taking most recent entry updated " + "on: {}".format(len(bulk_matched), bulk_matched[0]['last_updated'])) + else: + self.logger.error("Bulk task doesnt exist for: {} ({})! Cant create defect " + "object...\nMetadata: {}\n{}".format( out_defect_task['task_label'], + out_defect_task['task_id'], + d_potcar, dincar_reduced)) + return None + + + # get suitable dielectric calc by matching structures (only lattice + # constant fitting needed - not supercell) and POTCAR settings... + diel_task_list = additional_tasks['diel'] + diel_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol, + primitive_cell=True, scale=False, attempt_supercell=True, allow_subset=False) + diel_matched = [] + for diel_task in diel_task_list: + diel_struct = Structure.from_dict( diel_task['input']['structure']) + if diel_sm.fit( diel_struct, dstruct_withoutdefect): + #also match essential POTCAR settings and confirm LVTOT = True and LVHAR = True + + diel_potcar = {'pot_spec': set([potelt["titel"] for potelt in diel_task['input']['potcar_spec']]), + 'pot_labels': set(diel_task['input']['pseudo_potential']['labels'][:]), + 'pot_type': diel_task['input']['pseudo_potential']['pot_type'], + 'functional': diel_task['input']['pseudo_potential']['functional']} + d_potcar = d_potcar_base.copy() #need to reduce in cases of extrinsic species or reordering + d_potcar['pot_spec'] = set([d for d in d_potcar['pot_spec'] if d in diel_potcar['pot_spec']]) + d_potcar['pot_labels'] = set([d for d in d_potcar['pot_labels'] if d in diel_potcar['pot_labels']]) + + if diel_potcar == d_potcar: + diel_matched.append( diel_task.copy()) + else: + try: + self.logger.debug("Dielectric structure match was found for {} with {}, " + "but:".format( diel_task['task_label'], out_defect_task['task_label'])) + except: + self.logger.debug("Dielectric STRUCT MATCH was found, but task_label did not exist AND:") + out_pot = {k:[v, diel_potcar[k]] for k,v in d_potcar.items() if v != diel_potcar[k]} + self.logger.debug("\tPotcar specs were different: {} ".format( out_pot)) + # else: + # self.logger.debug("{} ({}) had a structure which did not match {} for use " + # "as a dielectric calculation".format( diel_task['task_label'], + # diel_task['task_id'], + # out_defect_task['task_label'])) + + #if diel_tasks found then take most recently updated bulk_task for defect + if len( diel_matched): + diel_matched = sorted( diel_matched, key=lambda x: x["last_updated"], reverse=True) + diel_dict = {'diel_taskdb_task_id': diel_matched[0]['task_id'], + 'epsilon_static': diel_matched[0]['output']['epsilon_static'], + 'epsilon_ionic': diel_matched[0]['output']['epsilon_ionic']} + out_defect_task["diel_task_meta"] = diel_dict.copy() + self.logger.debug("Found {} possible dieletric calcs. Taking most recent entry updated " + "on: {}".format(len(diel_matched), diel_matched[0]['last_updated'])) + else: + try: + self.logger.error("Dielectric task doesnt exist for: {} ({})! Cant create defect " + "object...\nMetadata for defect: {}\n{}".format( out_defect_task['task_label'], + out_defect_task['task_id'], + d_potcar, dincar_reduced)) + except: + self.logger.debug("DIEL TASK DNE.") + + return None + + # FINALLY consider grabbing extra hybrid BS information... + # first confirm from the INCAR setting that this defect is NOT an HSE calculation itself... + if dincar_reduced['LHFCALC'] in [None, False, 'False'] and len(additional_tasks['hsebs']): + hse_bs_matched = [] + for hse_bs_task in additional_tasks['hsebs']: + hse_bs_struct = Structure.from_dict(hse_bs_task['input']['structure']) + if diel_sm.fit(hse_bs_struct, dstruct_withoutdefect): #can use same matching scheme as the dielectric structure matcher + hse_bs_matched.append( hse_bs_task.copy()) + else: + self.logger.debug("task id {} has a structure which did not match {} for use " + "as an HSE BS calculation".format( hse_bs_task['task_id'], + out_defect_task['task_label'])) + + if len(hse_bs_matched): #match the lowest CBM and highest VBM values, keeping track of their task_ids + hybrid_cbm_data = min([[htask['output']['cbm'], htask['task_id']] for htask in hse_bs_matched]) + hybrid_vbm_data = max([[htask['output']['vbm'], htask['task_id']] for htask in hse_bs_matched]) + hybrid_meta = {'hybrid_cbm': hybrid_cbm_data[0], 'hybrid_CBM_task_id': hybrid_cbm_data[1], + 'hybrid_vbm': hybrid_vbm_data[0], 'hybrid_VBM_task_id': hybrid_vbm_data[1]} + out_defect_task["hybrid_bs_meta"] = hybrid_meta.copy() + self.logger.debug("Found hybrid band structure properties for {}:\n\t{}".format( out_defect_task['task_label'], + hybrid_meta)) + else: + self.logger.debug("Could NOT find hybrid band structure properties for {} despite " + "there being {} eligible hse calculations".format( out_defect_task['task_label'], + len(additional_tasks['hsebs']))) + + return out_defect_task + + def get_run_metadata(self, item, parameters): + bulk_task = item['bulk_task'] + + # load INCAR, KPOINTS, POTCAR and task_id (for both bulk and defect) + potcar_summary = {'pot_spec': list([potelt["titel"] for potelt in item['input']['potcar_spec']]), + 'pot_labels': list(item['input']['pseudo_potential']['labels'][:]), + 'pot_type': item['input']['pseudo_potential']['pot_type'], + 'functional': item['input']['pseudo_potential'][ + 'functional']} # note bulk has these potcar values also, other wise it would not get to process_items + dincar = item["input"]["incar"].copy() + dincar_reduced = {k: dincar.get(k, None) for k in ["LHFCALC", "HFSCREEN", "IVDW", "LUSE_VDW", + "LDAU", "METAGGA"]} # same as bulk + bincar = item["input"]["incar"].copy() + d_kpoints = item['calcs_reversed'][0]['input']['kpoints'] + if type(d_kpoints) != dict: + d_kpoints = d_kpoints.as_dict() + b_kpoints = bulk_task['calcs_reversed'][0]['input']['kpoints'] + if type(b_kpoints) != dict: + b_kpoints = b_kpoints.as_dict() + + dir_name = item['calcs_reversed'][0]['dir_name'] + bulk_dir_name = bulk_task['calcs_reversed'][0]['dir_name'] + + parameters['task_level_metadata'].update({'defect_dir_name': dir_name, + 'bulk_dir_name': bulk_dir_name, + 'bulk_taskdb_task_id': bulk_task['task_id'], + 'potcar_summary': potcar_summary.copy(), + 'incar_calctype_summary': dincar_reduced.copy(), + 'defect_incar': dincar.copy(), + 'bulk_incar': bincar.copy(), + 'defect_kpoints': d_kpoints.copy(), + 'bulk_kpoints': b_kpoints.copy(), + 'defect_task_last_updated': item['last_updated']}) + + if (dincar_reduced['HFSCREEN'] in [None, False, 'False']) and ("hybrid_bs_meta" in item): + parameters.update({k: item["hybrid_bs_meta"][k] for k in ['hybrid_cbm', 'hybrid_vbm']}) + parameters.update({'hybrid_gap': parameters['hybrid_cbm'] - parameters['hybrid_vbm']}) + parameters['task_level_metadata'].update({k: item["hybrid_bs_meta"][k] for k in ['hybrid_CBM_task_id', + 'hybrid_VBM_task_id']}) + return parameters + + def get_bulk_chg_correction_metadata(self, bulk_task, parameters): + if 'locpot' in bulk_task['calcs_reversed'][0]['output'].keys(): + bulklpt = bulk_task['calcs_reversed'][0]['output']['locpot'] + axes = list(bulklpt.keys()) + axes.sort() + parameters.update( {'bulk_planar_averages': [bulklpt[ax] for ax in axes]} ) + else: + self.logger.error('BULKTYPEcalc: {} (task-id {}) does not ' + 'have locpot values for parsing'.format( bulk_task['task_label'], + bulk_task['task_id'])) + + if 'outcar' in bulk_task['calcs_reversed'][0]['output'].keys(): + bulkoutcar = bulk_task['calcs_reversed'][0]['output']['outcar'] + bulk_atomic_site_averages = bulkoutcar['electrostatic_potential'] + parameters.update( {'bulk_atomic_site_averages': bulk_atomic_site_averages}) + else: + self.logger.error('BULKTYPEcalc: {} (task-id {}) does not ' + 'have outcar values for parsing'.format( bulk_task['task_label'], + bulk_task['task_id'])) + return parameters + + def get_bulk_gap_data(self, bulk_task, parameters): + bulk_structure = parameters['bulk_sc_structure'] + try: + with MPRester() as mp: + # mplist = mp.find_structure(bulk_structure) #had to hack this because this wasnt working?? + tmp_mplist = mp.get_entries_in_chemsys(list(bulk_structure.symbol_set)) + mplist = [ment.entry_id for ment in tmp_mplist if ment.composition.reduced_composition == \ + bulk_structure.composition.reduced_composition] + #TODO: this is a hack because find_structure was data intensive. simplify the hack to do less queries... + except: + raise ValueError("Error with querying MPRester for {}".format( bulk_structure.composition.reduced_formula)) + + mpid_fit_list = [] + for trial_mpid in mplist: + with MPRester() as mp: + mpstruct = mp.get_structure_by_material_id(trial_mpid) + if StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol, + primitive_cell=True, scale=False, attempt_supercell=True, + allow_subset=False).fit(bulk_structure, mpstruct): + mpid_fit_list.append( trial_mpid) + + if len(mpid_fit_list) == 1: + mpid = mpid_fit_list[0] + self.logger.debug("Single mp-id found for bulk structure:{}.".format( mpid)) + elif len(mpid_fit_list) > 1: + num_mpid_list = [int(mp.split('-')[1]) for mp in mpid_fit_list] + num_mpid_list.sort() + mpid = 'mp-'+str(num_mpid_list[0]) + self.logger.debug("Multiple mp-ids found for bulk structure:{}\nWill use lowest number mpid " + "for bulk band structure = {}.".format(str(mpid_fit_list), mpid)) + else: + self.logger.debug("Could not find bulk structure in MP database after tying the " + "following list:\n{}".format( mplist)) + mpid = None + + if mpid and not parameters['task_level_metadata']['incar_calctype_summary']['LHFCALC']: + #TODO: NEED to be smarter about use of +U etc in MP gga band structure calculations... + with MPRester() as mp: + bs = mp.get_bandstructure_by_material_id(mpid) + + parameters['task_level_metadata'].update( {'MP_gga_BScalc_data': + bs.get_band_gap().copy()} ) #contains gap kpt transition + cbm = bs.get_cbm()['energy'] + vbm = bs.get_vbm()['energy'] + gap = bs.get_band_gap()['energy'] + else: + parameters['task_level_metadata'].update( {'MP_gga_BScalc_data': None}) #to signal no MP BS is used + cbm = bulk_task['output']['cbm'] + vbm = bulk_task['output']['vbm'] + gap = bulk_task['output']['bandgap'] + + parameters.update( {'mpid': mpid, + "cbm": cbm, "vbm": vbm, "gap": gap} ) + + return parameters + + def load_defect_and_structure_data(self, defect_task, parameters): + """ + This loads defect object from task_dict AND + loads initial and final structures (making sure their sites are indexed in an equivalent manner) + + if can't confirm that indices are equivalent, then initial_defect_structure = None + + :param defect_task: + :param parameters: + :return: + """ + + if type(defect_task['transformations']) != dict: + defect_task['transformations'] = defect_task['transformations'].as_dict() + + defect = defect_task['transformations']['history'][0]['defect'] + needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name'] + defect = MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys}) + + scaling_matrix = MontyDecoder().process_decoded(defect_task['transformations']['history'][0]['scaling_matrix']) + + final_defect_structure = defect_task['output']['structure'] + if type(final_defect_structure) != Structure: + final_defect_structure = Structure.from_dict(final_defect_structure) + + # build initial_defect_structure from very first ionic relaxation step (ensures they are indexed the same] + initial_defect_structure = Structure.from_dict( + defect_task['calcs_reversed'][-1]['output']['ionic_steps'][0]['structure']) + + # confirm structure matching + ids = defect.generate_defect_structure(scaling_matrix) + ids_sm = StructureMatcher( ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol, + primitive_cell=False, scale=False, attempt_supercell=False, + allow_subset=False) + if not ids_sm.fit( ids, initial_defect_structure): + self.logger.error("Could not match initial-to-final structure. Will not load initial structure.") + initial_defect_structure = None + + parameters.update({'final_defect_structure': final_defect_structure, + 'initial_defect_structure': initial_defect_structure, + 'scaling_matrix': scaling_matrix}) + + return defect, parameters + + def load_defect_chg_correction_metadata(self, defect, defect_task, parameters): + + # --> Load information for Freysoldt related parsing + if 'locpot' in defect_task['calcs_reversed'][0]['output'].keys(): + deflpt = defect_task['calcs_reversed'][0]['output']['locpot'] + axes = list(deflpt.keys()) + axes.sort() + defect_planar_averages = [deflpt[ax] for ax in axes] + abc = parameters['initial_defect_structure'].lattice.abc + axis_grid = [] + for ax in range(3): + num_pts = len(defect_planar_averages[ax]) + axis_grid.append([i / num_pts * abc[ax] for i in range(num_pts)]) + parameters.update({'axis_grid': axis_grid, + 'defect_planar_averages': defect_planar_averages}) + else: + self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have locpot values for ' + 'parsing Freysoldt correction'.format(defect_task['task_label'], defect_task['task_id'])) + + + # --> Load information for Kumagai related parsing + if 'outcar' in defect_task['calcs_reversed'][0]['output'].keys() and \ + parameters['initial_defect_structure']: + + defoutcar = defect_task['calcs_reversed'][0]['output']['outcar'] + defect_atomic_site_averages = defoutcar['electrostatic_potential'] + bulk_sc_structure = parameters['bulk_sc_structure'] + initial_defect_structure = parameters['initial_defect_structure'] + + bulksites = [site.frac_coords for site in bulk_sc_structure] + initsites = [site.frac_coords for site in initial_defect_structure] + distmatrix = initial_defect_structure.lattice.get_all_distances(bulksites, initsites) #first index of this list is bulk index + min_dist_with_index = [[min(distmatrix[bulk_index]), int(bulk_index), + int(distmatrix[bulk_index].argmin())] for bulk_index in range(len(distmatrix))] # list of [min dist, bulk ind, defect ind] + + found_defect = False + site_matching_indices = [] + poss_defect = [] + if isinstance(defect, (Vacancy, Interstitial)): + for mindist, bulk_index, defect_index in min_dist_with_index: + if mindist < self.ltol: + site_matching_indices.append( [bulk_index, defect_index]) + elif isinstance(defect, Vacancy): + poss_defect.append( [bulk_index, bulksites[ bulk_index][:]]) + + if isinstance(defect, Interstitial): + poss_defect = [ [ind, fc[:]] for ind, fc in enumerate(initsites) \ + if ind not in np.array(site_matching_indices)[:,1]] + + elif isinstance(defect, Substitution): + for mindist, bulk_index, defect_index in min_dist_with_index: + species_match = bulk_sc_structure[bulk_index].specie == \ + initial_defect_structure[defect_index].specie + if mindist < self.ltol and species_match: + site_matching_indices.append( [bulk_index, defect_index]) + + elif not species_match: + poss_defect.append( [defect_index, initsites[ defect_index][:]]) + + + if len(poss_defect) == 1: + found_defect = True + defect_index_sc_coords = poss_defect[0][0] + defect_frac_sc_coords = poss_defect[0][1] + else: + self.logger.error("Found {} possible defect sites when matching bulk and " + "defect structure".format(len(poss_defect))) + + + if len(set(np.array(site_matching_indices)[:, 0])) != len(set(np.array(site_matching_indices)[:, 1])): + self.logger.error("Error occured in site_matching routine. Double counting of site matching " + "occured:{}\nAbandoning Kumagai parsing.".format(site_matching_indices)) + found_defect = False + + # assuming Wigner-Seitz radius for sampling radius + wz = initial_defect_structure.lattice.get_wigner_seitz_cell() + dist = [] + for facet in wz: + midpt = np.mean(np.array(facet), axis=0) + dist.append(np.linalg.norm(midpt)) + sampling_radius = min(dist) + + if found_defect: + parameters.update({'defect_atomic_site_averages': defect_atomic_site_averages, + 'site_matching_indices': site_matching_indices, + 'sampling_radius': sampling_radius, + 'defect_frac_sc_coords': defect_frac_sc_coords, + 'defect_index_sc_coords': defect_index_sc_coords}) + else: + self.logger.error("Error in mapping procedure for bulk to initial defect structure.") + + else: + self.logger.error('DEFECTTYPEcalc: {} (task-id {}) does not have outcar values for ' + 'parsing Kumagai'.format(defect_task['task_label'], defect_task['task_id'])) + + return parameters + + +class DefectThermoBuilder(Builder): + def __init__(self, + defects, + defectthermo, + query=None, + ltol=0.2, + stol=0.3, + angle_tol=5, + update_all=False, + **kwargs): + """ + Creates DefectEntry from vasp task docs + + Args: + defects (Store): Store of defect entries + defectthermo (Store): Store of DefectPhaseDiagram documents + query (dict): dictionary to limit materials to be analyzed (note query is for defects Store) + ltol (float): StructureMatcher tuning parameter for matching tasks to materials + stol (float): StructureMatcher tuning parameter for matching tasks to materials + angle_tol (float): StructureMatcher tuning parameter for matching tasks to materials + update_all (bool): Whether to consider all task ids from defects store. + Default is False (will not re-consider task-ids which have been considered previously. + """ + + self.defects = defects + self.defectthermo = defectthermo + self.query = query if query else {} + self.ltol = ltol + self.stol = stol + self.angle_tol = angle_tol + self.update_all = update_all + super().__init__(sources=[defects], targets=[defectthermo], **kwargs) + + def get_items(self): + self.logger.info("DefectThermo Builder Started") + + # Save timestamp for update operation + self.time_stamp = datetime.utcnow() + + #get all new Defect Entries since last time DefectThermo was updated... + q = dict(self.query) + self.logger.debug('query is initially: {}'.format( q)) + if not self.update_all: + # if not update_all then grab entry_ids of defects that have been analyzed already... + prev_dpd = list(self.defectthermo.query(properties=['metadata.all_entry_ids_considered'])) + self.logger.debug('Found {} previous dpd objects'.format( len(prev_dpd))) + + if "entry_id" not in q.keys(): + q["entry_id"] = {"$nin": []} + elif "$nin" not in q["entry_id"].keys(): + q["entry_id"]["$nin"] = [] + for dpd in prev_dpd: + q["entry_id"]["$nin"].extend( list( dpd['metadata']['all_entry_ids_considered'])) + + self.logger.debug('query after removing previously considered entry_ids is: {}'.format( q)) + + # q.update(self.defects.lu_filter(self.defectthermo)) #TODO: does this work?? / is it needed? + + # restricted amount of defect info for PD, so not an overwhelming database query + entry_keys_needed_for_thermo = ['defect', 'parameters.task_level_metadata', 'parameters.last_updated', + 'task_id', 'entry_id', '@module', '@class', + 'uncorrected_energy', 'corrections', 'parameters.dielectric', + 'parameters.cbm', 'parameters.vbm', 'parameters.gap', + 'parameters.hybrid_cbm', 'parameters.hybrid_vbm', + 'parameters.hybrid_gap', 'parameters.potalign', + 'parameters.freysoldt_meta', + 'parameters.kumagai_meta', 'parameters.is_compatible', + 'parameters.delocalization_meta', 'parameters.phasediagram_meta'] + defect_entries = list(self.defects.query(criteria=q, + properties=entry_keys_needed_for_thermo)) + thermo_entries = list(self.defectthermo.query(properties=['bulk_chemsys', 'bulk_prim_struct', + 'run_metadata', 'entry_id'])) + self.logger.info("Found {} new defect entries to consider".format( len(defect_entries))) + + #group defect entries based on bulk types and task level metadata info + + sm = StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol, + primitive_cell=True, scale=False, attempt_supercell=True, + allow_subset=False) + grpd_entry_list = [] + for entry_dict in defect_entries: + #get bulk chemsys and struct + base_bulk_struct = Structure.from_dict(entry_dict['defect']['structure']) + base_bulk_struct = base_bulk_struct.get_primitive_structure() + bulk_chemsys = "-".join(sorted((base_bulk_struct.symbol_set))) + + #get run metadata info + entry_dict_md = entry_dict['parameters']['task_level_metadata'] + run_metadata = entry_dict_md['incar_calctype_summary'].copy() + if 'potcar_summary' in entry_dict_md: + run_metadata.update( entry_dict_md['potcar_summary'].copy()) + #only want to filter based on bulk POTCAR + pspec, plab = set(), set() + for symbol, full_potcar_label in zip(run_metadata['pot_labels'], + run_metadata['pot_spec']): + red_sym = symbol.split('_')[0] + if red_sym in base_bulk_struct.symbol_set: + pspec.add( symbol) + plab.add( full_potcar_label) + + run_metadata['pot_spec'] = "-".join(sorted(set(pspec))) #had to switch to this approach for proper dict comparison later on + run_metadata['pot_labels'] = "-".join(sorted(set(plab))) + + # see if defect_entry matches an grouped entry list item already in progress + # does this by matching bulk structure and run metadata + matched = False + for grp_ind, grpd_entry in enumerate(grpd_entry_list): + if grpd_entry[0] == bulk_chemsys and grpd_entry[1] == run_metadata: + if sm.fit( base_bulk_struct, grpd_entry[2]): + grpd_entry[3].append( entry_dict.copy()) + matched = True + break + else: + continue + else: + continue + + if not matched: #have not logged yet, see if previous thermo phase diagram was created for this system + for t_ent in thermo_entries: + if t_ent['bulk_chemsys'] == bulk_chemsys and t_ent['run_metadata'] == run_metadata: + if sm.fit(base_bulk_struct, Structure.from_dict(t_ent['bulk_prim_struct'])): + if not self.update_all: + #if not wanting to update everything, then get previous entries from thermo database + old_entry_set = list(self.defectthermo.query( {'entry_id': t_ent['entry_id']}, + properties=['entries']))[0]['entries'] + old_entries_list = [entry.as_dict() if type(entry) != dict else entry for entry in + old_entry_set] + else: + old_entries_list = [] + + old_entries_list.append( entry_dict.copy()) + grpd_entry_list.append( [bulk_chemsys, run_metadata.copy(), base_bulk_struct.copy(), + old_entries_list, t_ent['entry_id']]) + matched = True + break + else: + continue + else: + continue + + if not matched: + # create new thermo phase diagram for this system, + # entry_id will be generated later on + grpd_entry_list.append( [bulk_chemsys, run_metadata.copy(), base_bulk_struct.copy(), + [entry_dict.copy()], 'None']) + + + for new_defects_for_thermo_entry in grpd_entry_list: + yield new_defects_for_thermo_entry + + + def process_item(self, items): + #group defect entries into groups of same defect type (charge included) + distinct_entries_full = [] #for storing full defect_dict + distinct_entries = {} #for storing defect object + bulk_chemsys, run_metadata, bulk_prim_struct, entrylist, entry_id = items + self.logger.debug("Processing bulk_chemsys {}".format(bulk_chemsys)) + + needed_entry_keys = ['@module', '@class', 'defect', 'uncorrected_energy', 'corrections', + 'parameters', 'entry_id', 'task_id'] + pdc = PointDefectComparator(check_charge=True, check_primitive_cell=True, check_lattice_scale=False) + for entry_dict in entrylist: + entry = DefectEntry.from_dict({k:v for k,v in entry_dict.items() if k in needed_entry_keys}) + matched = False + for grpind, grpdefect in distinct_entries.items(): + if pdc.are_equal( entry.defect, grpdefect.defect): + matched = True + break + + if matched: + distinct_entries_full[grpind].append( entry_dict.copy()) + else: + distinct_entries_full.append( [entry_dict.copy()]) + nxt_ind = max( distinct_entries.keys()) + 1 if len(distinct_entries.keys()) else 0 + distinct_entries[nxt_ind] = entry.copy() + + #now sort through entries and pick one that has been updated last (but track all previous entry_ids) + all_entry_ids_considered, entries = [], [] + for ident_entries in distinct_entries_full: + all_entry_ids_considered.extend([ent['entry_id'] for ent in ident_entries]) + # sort based on which was done most recently + lu_list = [] + for ent_ind, ent in enumerate(ident_entries): + try: + lu = ent['parameters']['task_level_metadata']['defect_task_last_updated'] + except: + lu = ent['parameters']['last_updated'] + if isinstance( lu, dict): #deal with case when datetime objects were not properly loaded + lu = MontyDecoder().process_decoded( lu) + lu_list.append( [lu, ent_ind]) + try: + lu_list.sort(reverse=True) + except: + for_print_lu_list = [[lu, ident_entries[ent_ind]['entry_id']] for lu, ent_ind in lu_list] + raise ValueError("Error with last_updated sorting of list:\n{}".format( for_print_lu_list)) + recent_entry_dict = ident_entries[ lu_list[0][1]] + + new_entry = DefectEntry.from_dict( {k:v for k,v in recent_entry_dict.items() if k in needed_entry_keys}) + entries.append( new_entry.copy()) + + # get vbm and bandgap; also verify all vbms and bandgaps are the same + vbm = entries[0].parameters['vbm'] + band_gap = entries[0].parameters['gap'] + for ent in entries: + if vbm != ent.parameters['vbm']: + self.logger.error("Logging vbm = {} (retrieved from {}, task-id {}) but {} " + "(task-id {}) has vbm = {}. Be careful with this defectphasediagram " + "if these are very different".format( vbm, entries[0].name, entries[0].entry_id, + ent.name, ent.entry_id, ent.parameters['vbm'])) + if band_gap != ent.parameters['gap']: + self.logger.error("Logging gap = {} (retrieved from {}, task-id {}) but {} " + "(task-id {}) has gap = {}. Be careful with this defectphasediagram " + "if these are very different".format( band_gap, entries[0].name, entries[0].entry_id, + ent.name, ent.entry_id, ent.parameters['gap'])) + + defect_phase_diagram = DefectPhaseDiagram( entries, vbm, band_gap, filter_compatible=False, + metadata={'all_entry_ids_considered': all_entry_ids_considered}) + defect_phase_diagram_as_dict = defect_phase_diagram.as_dict() + defect_phase_diagram_as_dict.update( {'bulk_chemsys': bulk_chemsys, 'bulk_prim_struct': bulk_prim_struct.as_dict(), + 'run_metadata': run_metadata, 'entry_id': entry_id, 'last_updated': datetime.utcnow()}) + + return defect_phase_diagram_as_dict + + def update_targets(self, items): + list_entry_ids = list(self.defectthermo.distinct('entry_id')) + if len(list_entry_ids): + next_entry_id = max(list_entry_ids) + 1 + else: + next_entry_id = 1 + for item in items: + if item['entry_id'] == 'None': + item['entry_id'] = next_entry_id + next_entry_id += 1 + + self.logger.info("Updating {} DefectThermo documents".format(len(items))) + + self.defectthermo.update(items, update_lu=True, key='entry_id') diff --git a/emmet-core/emmet/core/cp2k/calc_types/__init__.py b/emmet-core/emmet/core/cp2k/calc_types/__init__.py new file mode 100644 index 0000000000..9503f160d7 --- /dev/null +++ b/emmet-core/emmet/core/cp2k/calc_types/__init__.py @@ -0,0 +1,9 @@ +from pathlib import Path + +try: + import emmet.core.cp2k.calc_types.enums +except ImportError: + import emmet.core.cp2k.calc_types.generate + +from emmet.core.cp2k.calc_types.enums import RunType, TaskType, CalcType +from emmet.core.cp2k.calc_types.utils import run_type, task_type, calc_type diff --git a/emmet-core/emmet/core/cp2k/calc_types/enums.py b/emmet-core/emmet/core/cp2k/calc_types/enums.py new file mode 100644 index 0000000000..8f82f8fcd7 --- /dev/null +++ b/emmet-core/emmet/core/cp2k/calc_types/enums.py @@ -0,0 +1,95 @@ +""" +Autogenerated Enums for CP2K RunType, TaskType, and CalcType +Do not edit this by hand. Edit generate.py or run_types.yaml instead +""" +from emmet.core.utils import ValueEnum + + +class RunType(ValueEnum): + """ CP2K calculation run types """ + + PBE = "PBE" + GGA = "GGA" + R2SCAN = "R2SCAN" + METAGGA = "METAGGA" + HYBRID = "HYBRID" + HSE06 = "HSE06" + PBE0 = "PBE0" + RSH = "RSH" + PBE_U = "PBE+U" + GGA_U = "GGA+U" + R2SCAN_U = "R2SCAN+U" + METAGGA_U = "METAGGA+U" + HYBRID_U = "HYBRID+U" + HSE06_U = "HSE06+U" + PBE0_U = "PBE0+U" + RSH_U = "RSH+U" + LDA = "LDA" + LDA_U = "LDA+U" + + +class TaskType(ValueEnum): + """ CP2K calculation task types """ + + Static = "Static" + Structure_Optimization = "Structure Optimization" + Constrained_Structure_Optimization = "Constrained Structure Optimization" + + +class CalcType(ValueEnum): + """ CP2K calculation types """ + + PBE_Static = "PBE Static" + PBE_Structure_Optimization = "PBE Structure Optimization" + PBE_Constrained_Structure_Optimization = "PBE Constrained Structure Optimization" + GGA_Static = "GGA Static" + GGA_Structure_Optimization = "GGA Structure Optimization" + GGA_Constrained_Structure_Optimization = "GGA Constrained Structure Optimization" + R2SCAN_Static = "R2SCAN Static" + R2SCAN_Structure_Optimization = "R2SCAN Structure Optimization" + R2SCAN_Constrained_Structure_Optimization = "R2SCAN Constrained Structure Optimization" + METAGGA_Static = "METAGGA Static" + METAGGA_Structure_Optimization = "METAGGA Structure Optimization" + METAGGA_Constrained_Structure_Optimization = "METAGGA Constrained Structure Optimization" + HYBRID_Static = "HYBRID Static" + HYBRID_Structure_Optimization = "HYBRID Structure Optimization" + HYBRID_Constrained_Structure_Optimization = "HYBRID Constrained Structure Optimization" + HSE06_Static = "HSE06 Static" + HSE06_Structure_Optimization = "HSE06 Structure Optimization" + HSE06_Constrained_Structure_Optimization = "HSE06 Constrained Structure Optimization" + PBE0_Static = "PBE0 Static" + PBE0_Structure_Optimization = "PBE0 Structure Optimization" + PBE0_Constrained_Structure_Optimization = "PBE0 Constrained Structure Optimization" + RSH_Static = "RSH Static" + RSH_Structure_Optimization = "RSH Structure Optimization" + RSH_Constrained_Structure_Optimization = "RSH Constrained Structure Optimization" + PBE_U_Static = "PBE+U Static" + PBE_U_Structure_Optimization = "PBE+U Structure Optimization" + PBE_U_Constrained_Structure_Optimization = "PBE+U Constrained Structure Optimization" + GGA_U_Static = "GGA+U Static" + GGA_U_Structure_Optimization = "GGA+U Structure Optimization" + GGA_U_Constrained_Structure_Optimization = "GGA+U Constrained Structure Optimization" + R2SCAN_U_Static = "R2SCAN+U Static" + R2SCAN_U_Structure_Optimization = "R2SCAN+U Structure Optimization" + R2SCAN_U_Constrained_Structure_Optimization = "R2SCAN+U Constrained Structure Optimization" + METAGGA_U_Static = "METAGGA+U Static" + METAGGA_U_Structure_Optimization = "METAGGA+U Structure Optimization" + METAGGA_U_Constrained_Structure_Optimization = "METAGGA+U Constrained Structure Optimization" + HYBRID_U_Static = "HYBRID+U Static" + HYBRID_U_Structure_Optimization = "HYBRID+U Structure Optimization" + HYBRID_U_Constrained_Structure_Optimization = "HYBRID+U Constrained Structure Optimization" + HSE06_U_Static = "HSE06+U Static" + HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization" + HSE06_U_Constrained_Structure_Optimization = "HSE06+U Constrained Structure Optimization" + PBE0_U_Static = "PBE0+U Static" + PBE0_U_Structure_Optimization = "PBE0+U Structure Optimization" + PBE0_U_Constrained_Structure_Optimization = "PBE0+U Constrained Structure Optimization" + RSH_U_Static = "RSH+U Static" + RSH_U_Structure_Optimization = "RSH+U Structure Optimization" + RSH_U_Constrained_Structure_Optimization = "RSH+U Constrained Structure Optimization" + LDA_Static = "LDA Static" + LDA_Structure_Optimization = "LDA Structure Optimization" + LDA_Constrained_Structure_Optimization = "LDA Constrained Structure Optimization" + LDA_U_Static = "LDA+U Static" + LDA_U_Structure_Optimization = "LDA+U Structure Optimization" + LDA_U_Constrained_Structure_Optimization = "LDA+U Constrained Structure Optimization" diff --git a/emmet-core/emmet/core/cp2k/calc_types/generate.py b/emmet-core/emmet/core/cp2k/calc_types/generate.py new file mode 100644 index 0000000000..96b7f6f2cd --- /dev/null +++ b/emmet-core/emmet/core/cp2k/calc_types/generate.py @@ -0,0 +1,79 @@ +""" Module to define various calculation types as Enums for CP2K """ +from itertools import product +from pathlib import Path + +from monty.serialization import loadfn + +_RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve())) +_TASK_TYPES = [ + "Static", + "Structure Optimization", + "Constrained Structure Optimization" +] + +_RUN_TYPES = ( + [ + rt + for functional_class in _RUN_TYPE_DATA + for rt in _RUN_TYPE_DATA[functional_class] + ] + + [ + f"{rt}+U" + for functional_class in _RUN_TYPE_DATA + for rt in _RUN_TYPE_DATA[functional_class] + ] + + ["LDA", "LDA+U"] +) + + +def get_enum_source(enum_name, doc, items): + header = f""" +class {enum_name}(ValueEnum): + \"\"\" {doc} \"\"\"\n +""" + items = [f' {const} = "{val}"' for const, val in items.items()] + + return header + "\n".join(items) + + +run_type_enum = get_enum_source( + "RunType", + "CP2K calculation run types", + dict( + { + "_".join(rt.split()).replace("+", "_").replace("-", "_"): rt + for rt in _RUN_TYPES + } + ), +) +task_type_enum = get_enum_source( + "TaskType", + "CP2K calculation task types", + {"_".join(tt.split()): tt for tt in _TASK_TYPES}, +) +calc_type_enum = get_enum_source( + "CalcType", + "CP2K calculation types", + { + f"{'_'.join(rt.split()).replace('+','_').replace('-','_')}_{'_'.join(tt.split())}": f"{rt} {tt}" + for rt, tt in product(_RUN_TYPES, _TASK_TYPES) + }, +) + + +with open(Path(__file__).parent / "enums.py", "w") as f: + f.write( + """\"\"\" +Autogenerated Enums for CP2K RunType, TaskType, and CalcType +Do not edit this by hand. Edit generate.py or run_types.yaml instead +\"\"\" +from emmet.core.utils import ValueEnum + +""" + ) + f.write(run_type_enum) + f.write("\n\n") + f.write(task_type_enum) + f.write("\n\n") + f.write(calc_type_enum) + f.write("\n") diff --git a/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml new file mode 100644 index 0000000000..8d52d481fb --- /dev/null +++ b/emmet-core/emmet/core/cp2k/calc_types/run_types.yaml @@ -0,0 +1,25 @@ +GGA: + PBE: + FUNCTIONAL: PBE + FRACTION: 0 + GGA: + GGA: -- +METAGGA: + R2SCAN: + FUNCTIONAL: + - MGGA_C_R2SCAN + - MGGA_X_R2SCAN + FRACTION: 0 + METAGGA: + METAGGA: -- +HYBRID: + HYBRID: + HYBRID: -- + HSE06: + FRACTION: 0.25 + INTERACTION_POTENTIAL: SHORTRANGE + PBE0: + FRACTION: 0.25 + INTERACTION_POTENTIAL: TRUNCATED + RSH: + INTERACTION_POTENTIAL: TRUNCATED_MIX_CL \ No newline at end of file diff --git a/emmet-core/emmet/core/cp2k/calc_types/utils.py b/emmet-core/emmet/core/cp2k/calc_types/utils.py new file mode 100644 index 0000000000..99a1112264 --- /dev/null +++ b/emmet-core/emmet/core/cp2k/calc_types/utils.py @@ -0,0 +1,133 @@ +""" Module to define various calculation types as Enums for CP2K """ +from pathlib import Path +from typing import Dict +from monty.serialization import loadfn +from typing import Iterable + +from pymatgen.io.cp2k.inputs import Cp2kInput + +from emmet.core.cp2k.calc_types import RunType, TaskType, CalcType + +_RUN_TYPE_DATA = loadfn(str(Path(__file__).parent.joinpath("run_types.yaml").resolve())) + + +def run_type(dft: Dict) -> RunType: + """ + Determines the run_type from the CP2K input dict + This is adapted from pymatgen to be far less unstable + + Args: + dft: dictionary of dft parameters (standard from task doc) + """ + + def _variant_equal(v1, v2) -> bool: + """ + helper function to deal with strings + """ + if isinstance(v1, str) and isinstance(v2, str): + return v1.strip().upper() == v2.strip().upper() + elif isinstance(v1, Iterable) and isinstance(v2, Iterable): + return set(v1) == set(v2) + else: + return v1 == v2 + + is_hubbard = '+U' if dft.get('dft_plus_u') else '' + + parameters = { + 'FUNCTIONAL': dft.get('functional'), + 'INTERACTION_POTENTIAL': dft.get('hfx', {}).get('Interaction_Potential'), + 'FRACTION': dft.get('hfx', {}).get('FRACTION', 0) + } + + # Standard calc will only have one functional. If there are multiple functionals + # used this is either a hybrid calc or a non-generic mixed calculation. + if len(parameters['FUNCTIONAL']) == 1: + parameters['FUNCTIONAL'] = parameters['FUNCTIONAL'][0] + + # If all parameters in for the functional_class.special_type located in + # run_types.yaml are met, then that is the run type. + for functional_class in _RUN_TYPE_DATA: + for special_type, params in _RUN_TYPE_DATA[functional_class].items(): + if all( + [ + _variant_equal(parameters.get(param, True), value) + for param, value in params.items() + ] + ): + return RunType(f"{functional_class}{is_hubbard}") + + # TODO elegant way to handle this? + # This is a hack to get the non-standard hybrids to work + if parameters.get('FRACTION'): + return RunType(f"HYBRID{is_hubbard}") + +def task_type( + inputs: Dict +) -> TaskType: + """ + Determines the task type + + Args: + inputs + """ + + calc_type = [] + cp2k_run_type = inputs.get('cp2k_global').get('Run_type', '') + + if cp2k_run_type.upper() in ['ENERGY', 'ENERGY_FORCE', 'WAVEFUNCTION_OPTIMIZATION', 'WFN_OPT']: + calc_type.append('Static') + + elif cp2k_run_type.upper() in ['GEO_OPT', 'GEOMETRY_OPTIMIZATION', 'CELL_OPT']: + calc_type.append('Structure Optimization') + + elif cp2k_run_type.upper() in ['BAND']: + calc_type.append('Band') + + elif cp2k_run_type.upper() in ['MOLECULAR_DYNAMICS', 'MD']: + calc_type.append('Molecular Dynamics') + + elif cp2k_run_type.upper() in ['MONTE_CARLO', 'MC', 'TMC', 'TAMC']: + calc_type.append('Monte Carlo') + + elif cp2k_run_type.upper() in ['LINEAR_RESPONSE', 'LR']: + calc_type.append('Linear Response') + + elif cp2k_run_type.upper() in ['VIBRATIONAL_ANALYSIS', 'NORMAL_MODES']: + calc_type.append('Vibrational Analysis') + + elif cp2k_run_type.upper() in ['ELECTRONIC_SPECTRA', 'SPECTRA']: + calc_type.append('Electronic Spectra') + + elif cp2k_run_type.upper() in ['NEGF']: + calc_type.append('Non-equilibrium Green\'s Function') + + elif cp2k_run_type.upper() in ['PINT', 'DRIVER']: + calc_type.append('Path Integral') + + elif cp2k_run_type.upper() in ['RT_PROPAGATION', 'EHRENFEST_DYN']: + calc_type.append('Real-time propagation') + + elif cp2k_run_type.upper() in ['BSSE']: + calc_type.append('Base set superposition error') + + elif cp2k_run_type.upper() in ['DEBUG']: + calc_type.append('Debug analysis') + + elif cp2k_run_type.upper() in ['NONE']: + calc_type.append('None') + + return TaskType(" ".join(calc_type)) + +def calc_type( + inputs: Dict, +) -> CalcType: + """ + Determines the calc type + + Args: + inputs: dict from InputSummary containing necessary data for determining + calc type + """ + rt = run_type(inputs.get("dft")).value + tt = task_type(inputs).value + return CalcType(f"{rt} {tt}") diff --git a/emmet-core/emmet/core/cp2k/material.py b/emmet-core/emmet/core/cp2k/material.py new file mode 100644 index 0000000000..46f633e14c --- /dev/null +++ b/emmet-core/emmet/core/cp2k/material.py @@ -0,0 +1,235 @@ +""" Core definition of a Materials Document """ +from typing import List, Mapping, Sequence, Tuple + +from pydantic import Field +from pymatgen.analysis.structure_matcher import StructureMatcher +from pymatgen.entries.computed_entries import ComputedStructureEntry + +from emmet.core.settings import EmmetSettings +from emmet.core.material import MaterialsDoc as CoreMaterialsDoc +from emmet.core.material import PropertyOrigin as PropertyOrigin +from emmet.core.structure import StructureMetadata +from emmet.core.cp2k.calc_types import CalcType, RunType, TaskType +from emmet.core.cp2k.task import TaskDocument +from emmet.builders.cp2k.utils import get_mpid + +from pymatgen.symmetry.analyzer import SpacegroupAnalyzer + +SETTINGS = EmmetSettings() + +class MaterialsDoc(CoreMaterialsDoc, StructureMetadata): + + calc_types: Mapping[str, CalcType] = Field( # type: ignore + None, + description="Calculation types for all the calculations that make up this material", + ) + task_types: Mapping[str, TaskType] = Field( + None, + description="Task types for all the calculations that make up this material", + ) + run_types: Mapping[str, RunType] = Field( + None, + description="Run types for all the calculations that make up this material", + ) + + origins: Sequence[PropertyOrigin] = Field( + None, description="Mappingionary for tracking the provenance of properties" + ) + + entries: Mapping[RunType, ComputedStructureEntry] = Field( + None, description="Dictionary for tracking entries for CP2K calculations" + ) + + @classmethod + def from_tasks( + cls, + task_group: List[TaskDocument], + quality_scores=SETTINGS.CP2K_QUALITY_SCORES, + special_tags=SETTINGS.CP2K_SPECIAL_TAGS, + ) -> "MaterialsDoc": + """ + Converts a group of tasks into one material + """ + + # Metadata + last_updated = max(task.last_updated for task in task_group) + created_at = min(task.completed_at for task in task_group) + task_ids = list({task.task_id for task in task_group}) + + deprecated_tasks = list( + {task.task_id for task in task_group if not task.is_valid} + ) + run_types = {task.task_id: task.run_type for task in task_group} + task_types = {task.task_id: task.task_type for task in task_group} + calc_types = {task.task_id: task.calc_type for task in task_group} + + # TODO: Fix the type checking by hardcoding the Enums? + structure_optimizations = [ + task + for task in task_group + if task.task_type == TaskType.Structure_Optimization # type: ignore + ] + statics = [task for task in task_group if task.task_type == TaskType.Static] # type: ignore + + # Material ID + possible_mat_ids = [task.task_id for task in structure_optimizations + statics] + material_id = min(possible_mat_ids) + + def _structure_eval(task: TaskDocument): + """ + Helper function to order structures optimization and statics calcs by + - Functional Type + - Spin polarization + - Special Tags + - Energy + """ + + task_run_type = task.run_type + + is_valid = task.task_id in deprecated_tasks + + return ( + -1 * is_valid, + -1 * quality_scores.get(task_run_type.value, 0), + -1 * (2 if task.input.dft.get('UKS') else 1), + -1 * task.nsites # TODO Make k-point density + -1 * sum(task.input.get(tag, False) for tag in special_tags), # TODO what is this? + task.output.energy_per_atom, + ) + + structure_calcs = structure_optimizations + statics + best_structure_calc = sorted(structure_calcs, key=_structure_eval)[0] + structure = best_structure_calc.output.structure + + # Initial Structures + initial_structures = [task.input.structure for task in task_group] + sm = StructureMatcher( + ltol=0.1, stol=0.1, angle_tol=0.1, scale=False, attempt_supercell=False + ) + initial_structures = [ + group[0] for group in sm.group_structures(initial_structures) + ] + + # Deprecated + deprecated = all( + task.task_id in deprecated_tasks for task in structure_optimizations + ) + + # Origins + origins = [ + PropertyOrigin( + name="structure", + task_id=best_structure_calc.task_id, + last_updated=best_structure_calc.last_updated, + ) + ] + + # entries + entries = {} + all_run_types = set(run_types.values()) + for rt in all_run_types: + relevant_calcs = sorted( + [doc for doc in structure_calcs if doc.run_type == rt], + key=_structure_eval, + ) + if len(relevant_calcs) > 0: + best_task_doc = relevant_calcs[0] + entry = best_task_doc.entry + entry.data["task_id"] = entry.entry_id + entry.entry_id = material_id + + # TODO Standard material doc doesn't have this + best_task_doc.output.structure.remove_oxidation_states() + entries[rt] = ComputedStructureEntry( + structure=best_task_doc.output.structure, + energy=entry.energy, + correction=entry.correction, + energy_adjustments=entry.energy_adjustments, + parameters=entry.parameters, + data=entry.data, + entry_id=entry.entry_id + ) + + # Warnings + # TODO: What warning should we process? + + return cls.from_structure( + structure=structure, + material_id=material_id, + last_updated=last_updated, + created_at=created_at, + task_ids=task_ids, + calc_types=calc_types, + run_types=run_types, + task_types=task_types, + initial_structures=initial_structures, + deprecated=deprecated, + deprecated_tasks=deprecated_tasks, + origins=origins, + entries=entries, + ) + + @classmethod + def construct_deprecated_material( + cls, task_group: List[TaskDocument], + ) -> "MaterialsDoc": + """ + Converts a group of tasks into a deprecated material + + Args: + task_group: List of task document + """ + if len(task_group) == 0: + raise Exception("Must have more than one task in the group.") + + # Metadata + last_updated = max(task.last_updated for task in task_group) + created_at = min(task.completed_at for task in task_group) + task_ids = list({task.task_id for task in task_group}) + + deprecated_tasks = {task.task_id for task in task_group} + run_types = {task.task_id: task.run_type for task in task_group} + task_types = {task.task_id: task.task_type for task in task_group} + calc_types = {task.task_id: task.calc_type for task in task_group} + + # Material ID + material_id = min([task.task_id for task in task_group]) + + # Choose any random structure for metadata + structure = SpacegroupAnalyzer( + task_group[0].output.structure, symprec=0.1 + ).get_conventional_standard_structure() + + # Deprecated + deprecated = True + + return cls.from_structure( + structure=structure, + material_id=material_id, + last_updated=last_updated, + created_at=created_at, + task_ids=task_ids, + calc_types=calc_types, + run_types=run_types, + task_types=task_types, + deprecated=deprecated, + deprecated_tasks=deprecated_tasks, + ) + + + +def ID_to_int(s_id: str) -> Tuple[str, int]: + """ + Converts a string id to tuple + falls back to assuming ID is an Int if it can't process + Assumes string IDs are of form "[chars]-[int]" such as mp-234 + """ + if isinstance(s_id, str): + return (s_id.split("-")[0], int(str(s_id).split("-")[-1])) + elif isinstance(s_id, (int, float)): + return ("", s_id) + else: + return None + + + diff --git a/emmet-core/emmet/core/cp2k/task.py b/emmet-core/emmet/core/cp2k/task.py new file mode 100644 index 0000000000..9d569fbac0 --- /dev/null +++ b/emmet-core/emmet/core/cp2k/task.py @@ -0,0 +1,219 @@ +""" Core definition of a CP2K Task Document """ +from datetime import datetime +from typing import Dict, List + +from pydantic import BaseModel, Field, validator +from pymatgen.analysis.structure_analyzer import oxide_type +from pymatgen.core import Structure +from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry + +from emmet.core.base import EmmetBaseModel +from emmet.core.structure import StructureMetadata +from emmet.core.utils import ValueEnum +from emmet.core.cp2k.calc_types import calc_type, run_type, task_type, CalcType, RunType, TaskType +from emmet.core.math import Matrix3D, Vector3D +from emmet.core.mpid import MPID + + +class BaseTaskDocument(EmmetBaseModel): + """ + Definition of base Task Document + """ + + calc_code: str = Field(description="The calculation code used to compute this task") + version: str = Field(None, description="The version of the calculation code") + dir_name: str = Field(None, description="The directory for this task") + task_id: MPID = Field(None, description="the Task ID For this document") + + completed: bool = Field(False, description="Whether this calcuation completed") + completed_at: datetime = Field( + None, description="Timestamp for when this task was completed" + ) + last_updated: datetime = Field( + default_factory=datetime.utcnow, + description="Timestamp for this task document was last updated", + ) + + tags: List[str] = Field([], description="Metadata tags for this task document") + + warnings: List[str] = Field( + None, description="Any warnings related to this property" + ) + + +class Status(ValueEnum): + """ + CP2K Calculation State + """ + + SUCESS = "successful" + FAILED = "failed" + + +class InputSummary(BaseModel): + """ + Summary of inputs for a CP2K calculation + """ + + structure: Structure = Field(None, description="The input structure object") + + atomic_kind_info: Dict = Field(None, description="Description of parameters used for each atomic kind") + + cp2k_input: Dict = Field(None, description="The cp2k input used for this task") + + dft: Dict = Field(None, description="Description of the DFT parameters used in the last calc of this task") + + cp2k_global: Dict = Field(None, description="CP2K global parameters used in the last calc of this task") + + @validator('atomic_kind_info') + def remove_unnecessary(cls, atomic_kind_info): + for k in atomic_kind_info: + if 'total_pseudopotential_energy' in atomic_kind_info[k]: + del atomic_kind_info[k]['total_pseudopotential_energy'] + return atomic_kind_info + + @validator('dft') + def cleanup_dft(cls, dft): + if any(v.upper() == 'UKS' for v in dft.values()): + dft['UKS'] = True + return dft + + +class OutputSummary(BaseModel): + """ + Summary of the outputs for a CP2K calculation + """ + + structure: Structure = Field(None, description="The output structure object") + energy: float = Field( + None, description="The final total DFT energy for the last calculation" + ) + energy_per_atom: float = Field( + None, description="The final DFT energy per atom for the last calculation" + ) + bandgap: float = Field(None, description="The DFT band gap for the last calculation") + forces: List[Vector3D] = Field( + None, description="Forces on atoms from the last calculation" + ) + stress: Matrix3D = Field( + None, description="Stress on the unit cell from the last calculation" + ) + + +class RunStatistics(BaseModel): + """ + Summary of the Run statistics for a CP2K calculation + """ + + average_memory: float = Field(None, description="The average memory used in kb") + max_memory: float = Field(None, description="The maximum memory used in kb") + elapsed_time: float = Field(None, description="The real time elapsed in seconds") + system_time: float = Field(None, description="The system CPU time in seconds") + user_time: float = Field( + None, description="The user CPU time spent by CP2K in seconds" + ) + total_time: float = Field( + None, description="The total CPU time for this calculation" + ) + cores: int = Field(None, description="The number of cores used by CP2K") + + +class TaskDocument(BaseTaskDocument, StructureMetadata): + """ + Definition of CP2K Task Document + """ + + calc_code = "CP2K" + dir_name: str = Field(None, description="The directory for this CP2K task") + run_stats: RunStatistics = Field( + None, + description="Summary of runtime statistics for each calculation in this task", + ) + completed_at: datetime = Field( + None, description="Timestamp for when this task was completed" + ) + last_updated: datetime = Field( + None, description="Timestamp for this task document was last updated" + ) + + is_valid: bool = Field( + True, description="Whether this task document passed validation or not" + ) + + input: InputSummary = Field(None) + + output: OutputSummary = Field(None) + + state: Status = Field(None, description="State of this calculation") + + orig_inputs: Dict = Field( + None, description="Summary of the original CP2K inputs" + ) + task_id: int = Field(None, description="the Task ID For this document") + tags: List[str] = Field([], description="Metadata tags for this task document") + + run_type: RunType = Field(None) + task_type: TaskType = Field(None) + calc_type: CalcType = Field(None) + + @validator('input', pre=True, always=True) + def rename_global(cls, input): + if 'cp2k_global' not in input: + input['cp2k_global'] = input.pop('global') + return input + + @validator('run_type', pre=True, always=True) + def find_run_type(cls, v, values): + return run_type(values.get('input').dft) + + @validator('task_type', pre=True, always=True) + def find_task_type(cls, v, values): + return task_type(values.get('input').dict()) + + @validator('calc_type', pre=True, always=True) + def find_calc_type(cls, v, values): + return calc_type(values.get("input").dict()) + + @property + def entry(self): + """ Turns a Task Doc into a ComputedEntry""" + entry_dict = { + "correction": 0.0, + "entry_id": self.task_id, + "composition": self.output.structure.composition, + "energy": self.output.energy, + "parameters": { + "atomic_kind_info": self.input.atomic_kind_info, + # This is done to be compatible with MontyEncoder for the ComputedEntry + "run_type": str(self.run_type), + }, + "data": { + "oxide_type": oxide_type(self.output.structure), + "last_updated": self.last_updated, + }, + } + + return ComputedEntry.from_dict(entry_dict) + + @property + def structure_entry(self) -> ComputedStructureEntry: + """Turns a Task Doc into a ComputedStructureEntry""" + entry_dict = { + "correction": 0.0, + "entry_id": self.task_id, + "composition": self.output.structure.composition, + "energy": self.output.energy, + "parameters": { + "atomic_kind_info": self.input.atomic_kind_info, + # This is done to be compatible with MontyEncoder for the ComputedEntry + "run_type": str(self.run_type), + }, + "data": { + "oxide_type": oxide_type(self.output.structure), + "last_updated": self.last_updated, + }, + "structure": self.output.structure, + } + + return ComputedStructureEntry.from_dict(entry_dict) + diff --git a/emmet-core/emmet/core/cp2k/validation.py b/emmet-core/emmet/core/cp2k/validation.py new file mode 100644 index 0000000000..f727158cee --- /dev/null +++ b/emmet-core/emmet/core/cp2k/validation.py @@ -0,0 +1,76 @@ +from datetime import datetime +from typing import Dict, List, Union + +import numpy as np +from pydantic import Field, PyObject + +from emmet.core.settings import EmmetSettings +from emmet.core.base import EmmetBaseModel +from emmet.core.mpid import MPID +from emmet.core.utils import DocEnum +from emmet.core.cp2k.task import TaskDocument + +SETTINGS = EmmetSettings() + + +class DeprecationMessage(DocEnum): + MANUAL = "M", "manual deprecation" + FORCES = "C001", "Forces too large" + CONVERGENCE = "E001", "Calculation did not converge" + +class ValidationDoc(EmmetBaseModel): + """ + Validation document for a VASP calculation + """ + calc_code = "cp2k" + + task_id: MPID = Field(..., description="The task_id for this validation document") + valid: bool = Field(False, description="Whether this task is valid or not") + last_updated: datetime = Field( + description="Last updated date for this document", + default_factory=datetime.utcnow, + ) + reasons: List[Union[DeprecationMessage, str]] = Field( + None, description="List of deprecation tags detailing why this task isn't valid" + ) + warnings: List[str] = Field( + [], description="List of potential warnings about this calculation" + ) + data: Dict = Field( + description="Dictioary of data used to perform validation." + " Useful for post-mortem analysis" + ) + + class Config: + extra = "allow" + + @classmethod + def from_task_doc( + cls, + task_doc: TaskDocument, + ) -> "ValidationDoc": + """ + + """ + + calc_type = task_doc.calc_type + + # Force check + forces = np.array(task_doc.output.forces) > 0.05 + if np.any(forces): + reasons = [DeprecationMessage.FORCES] + + reasons = [] + data = {} + warnings = [] + + doc = ValidationDoc( + task_id=task_doc.task_id, + calc_type=calc_type, + run_type=task_doc.run_type, + valid=len(reasons) == 0, + reasons=reasons, + data=data, + warnings=warnings, + ) + return doc diff --git a/emmet-core/emmet/core/defect.py b/emmet-core/emmet/core/defect.py new file mode 100644 index 0000000000..d82e19959e --- /dev/null +++ b/emmet-core/emmet/core/defect.py @@ -0,0 +1,539 @@ +""" Core definition for Defect property Document """ +from __future__ import annotations + +from datetime import datetime +from tokenize import group +from typing import ClassVar, Dict, Tuple, Mapping, List +from pydantic import BaseModel, Field +from pydantic import validator + +from monty.json import MontyDecoder +from monty.tempfile import ScratchDir +from itertools import groupby + +from pymatgen.core import Structure, Composition, Element +from pymatgen.analysis.defects.core import DefectEntry, Defect +from pymatgen.analysis.defects.defect_compatibility import DefectCompatibility +from pymatgen.analysis.defects.thermodynamics import DefectPhaseDiagram, BrouwerDiagram +from pymatgen.electronic_structure.dos import CompleteDos +from pymatgen.symmetry.analyzer import SpacegroupAnalyzer +from pymatgen.entries.computed_entries import CompositionEnergyAdjustment +from pymatgen.entries.compatibility import MaterialsProject2020Compatibility +from pymatgen.ext.matproj import MPRester + +from emmet.core.structure import StructureMetadata +from emmet.core.mpid import MPID +from emmet.core.thermo import ThermoDoc +from emmet.core.cp2k.task import TaskDocument +from emmet.core.cp2k.calc_types.enums import CalcType, TaskType, RunType +from emmet.core.cp2k.material import MaterialsDoc +from emmet.core.cp2k.calc_types.utils import run_type, task_type +from emmet.builders.cp2k.utils import matcher + +from pymatgen.analysis.defects.corrections import FreysoldtCorrection2d +from pymatgen.io.vasp.outputs import VolumetricData + +# TODO Update DefectDoc on defect entry level so you don't re-do uncessary corrections +class DefectDoc(StructureMetadata): + """ + A document used to represent a single defect. e.g. a O vacancy with a -2 charge. + + This document can contain an arbitrary number of defect entries, originating from + pairs (defect and bulk) of calculations. This document provides access to the "best" + calculation of each run_type. + """ + + # TODO VASP MatDocs dont need this, but I get error requiring arbitrary type + class Config: + arbitrary_types_allowed = True + + property_name: ClassVar[str] = "defect" + + defect: Defect = Field(None, description="Pymatgen defect object for this defect doc") + + name: str = Field(None, description="Name of this defect as generated by the defect object") + + material_id: MPID = Field(None, description="Unique material ID for the bulk material") + + task_ids: List[int] = Field( + None, description="All task ids used in creating this defect doc." + ) + + calc_types: Mapping[int, CalcType] = Field( # type: ignore + None, + description="Calculation types for all the calculations that make up this material", + ) + task_types: Mapping[int, TaskType] = Field( + None, + description="Task types for all the calculations that make up this material", + ) + run_types: Mapping[int, RunType] = Field( + None, + description="Run types for all the calculations that make up this material", + ) + + tasks: Mapping[RunType, Tuple[TaskDocument, TaskDocument]] = Field( + None, description="Task documents (defect task, bulk task) for the defect entry of RunType" + ) + + entries: Mapping[RunType, DefectEntry] = Field( + None, description="Dictionary for tracking entries for CP2K calculations" + ) + + last_updated: datetime = Field( + description="Timestamp for when this document was last updated", + default_factory=datetime.utcnow, + ) + + created_at: datetime = Field( + description="Timestamp for when this material document was first created", + default_factory=datetime.utcnow, + ) + + metadata: Dict = Field(description="Metadata for this defect") + + # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this + @validator("entries", pre=True) + def decode(cls, entries): + for e in entries: + if isinstance(entries[e], dict): + entries[e] = MontyDecoder().process_decoded({k: v for k, v in entries[e].items()}) + return entries + + def update(self, defect_task, bulk_task, dielectric, query='defect'): + + defect_task_doc = TaskDocument(**defect_task) + bulk_task_doc = TaskDocument(**bulk_task) + + rt = defect_task_doc.run_type + tt = defect_task_doc.task_type + ct = defect_task_doc.calc_type + + # Metadata + last_updated = max(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now() + created_at = min(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now() + + if defect_task_doc.task_id in self.task_ids: + return + else: + self.last_updated = last_updated + self.created_at = created_at + self.task_ids.append(defect_task_doc.task_id) + #self['deprecated_tasks'].update(defect_task.task_id) + + def _run_type(x): + return run_type(x[0]['input']['dft']).value + + def _compare(new, old): + # TODO return kpoint density + return new['nsites'] > old.nsites + + if defect_task_doc.run_type not in self.tasks or _compare(defect_task, self.tasks[rt][0]): + self.run_types.update({defect_task_doc.task_id: rt}) + self.task_types.update({defect_task_doc.task_id: tt}) + self.calc_types.update({defect_task_doc.task_id: ct}) + entry = self.__class__.get_defect_entry_from_tasks( + defect_task=defect_task, + bulk_task=bulk_task, + dielectric=dielectric, + query=query + ) + self.entries[rt] = entry + self.tasks[rt] = (defect_task_doc, bulk_task_doc) + + def update_all(self, tasks, query='defect'): + for defect_task, bulk_task, dielectric in tasks: + self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query) + + @classmethod + def from_tasks(cls, tasks: List, query='defect', material_id=None) -> "DefectDoc": + """ + The standard way to create this document. + + Args: + tasks: A list of defect,bulk task pairs which will be used to construct a + series of DefectEntry objects. + query: How to retrieve the defect object stored in the task. + """ + task_group = [TaskDocument(**defect_task) for defect_task, bulk_task, dielectric in tasks] + + # Metadata + last_updated = datetime.now() or max(task.last_updated for task in task_group) + created_at = datetime.now() or min(task.completed_at for task in task_group) + task_ids = {task.task_id for task in task_group} + + deprecated_tasks = list( + {task.task_id for task in task_group if not task.is_valid} + ) + + run_types = {task.task_id: task.run_type for task in task_group} + task_types = {task.task_id: task.task_type for task in task_group} + calc_types = {task.task_id: task.calc_type for task in task_group} + + def _run_type(x): + return run_type(x[0]['input']['dft']).value + + def _task_type(x): + return task_type(x[0]['input']['dft']).value + + def _sort(x): + # TODO return kpoint density, currently just does supercell size + return -x[0]['nsites'], x[0]['output']['energy'] + + entries = {} + final_tasks = {} + metadata = {} + for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type): + sorted_tasks = sorted(tasks_for_runtype, key=_sort) + ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks] + best_entry = ents[0] + best_defect_task, best_bulk_task, dielectric = sorted_tasks[0] + metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]} + best_defect_task, best_bulk_task = TaskDocument(**best_defect_task), TaskDocument(**best_bulk_task) + entries[best_defect_task.run_type] = best_entry + final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task) + + data = { + 'entries': entries, + 'run_types': run_types, + 'task_types': task_types, + 'calc_types': calc_types, + 'last_updated': last_updated, + 'created_at': created_at, + 'task_ids': task_ids, + 'deprecated_tasks': deprecated_tasks, + 'tasks': final_tasks, + 'material_id': material_id if material_id else best_entry.parameters['material_id'], + 'entry_ids': {rt: entries[rt].entry_id for rt in entries}, + 'defect': best_entry.defect, + 'name': best_entry.defect.name, + 'metadata': metadata, + } + prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure() + data.update(StructureMetadata.from_structure(prim).dict()) + return cls(**data) + + @classmethod + def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'): + """ + Extract a defect entry from a single pair (defect and bulk) of tasks. + + Args: + defect_task: task dict for the defect calculation + bulk_task: task dict for the bulk calculation + dielectric: Dielectric doc if the defect is charged. If not present, no dielectric + corrections will be performed, even if the defect is charged. + query: Mongo-style query to retrieve the defect object from the defect task + """ + parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task) + if dielectric: + parameters['dielectric'] = dielectric + + defect_entry = DefectEntry( + cls.get_defect_from_task(query=query, task=defect_task), + uncorrected_energy=parameters['defect_energy'] - parameters['bulk_energy'], + parameters=parameters, + entry_id=parameters['entry_id'] + ) + DefectCompatibility().process_entry(defect_entry, perform_corrections=True) + defect_entry_as_dict = defect_entry.as_dict() + defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id'] # this seemed necessary for legacy db + + return defect_entry + + @classmethod + def get_defect_from_task(cls, query, task): + """ + Unpack a Mongo-style query and retrieve a defect object from a task. + """ + defect = unpack(query.split('.'), task) + needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name'] + return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys}) + + @classmethod + def get_parameters_from_tasks(cls, defect_task, bulk_task): + """ + Get parameters necessary to create a defect entry from defect and bulk task dicts + + Args: + defect_task: task dict for the defect calculation + bulk_task: task dict for the bulk calculation + """ + + def get_init(x): + """ + Helper function. If transformations were applied, get the structure post defect + transformation + """ + if x.get('transformations', {}).get('history'): + for i, y in enumerate(x['transformations']['history']): + if y['@class'] == 'DefectTransformation': + if len(x['transformations']['history']) == 1: + return Structure.from_dict(x['input']['structure']) + else: + return Structure.from_dict(x['transformations']['history'][i+1]['input_structure']) + return Structure.from_dict(x['transformations']['history'][0]['input_structure']) + return Structure.from_dict(x['input']['structure']) + + init_defect_structure = get_init(defect_task) + init_bulk_structure = get_init(bulk_task) # use init to avoid site_properties in matching + + final_defect_structure = Structure.from_dict(defect_task['output']['structure']) + final_bulk_structure = Structure.from_dict(bulk_task['output']['structure']) + + dfi, site_matching_indices = matcher( + init_bulk_structure, init_defect_structure, + final_bulk_struc=final_bulk_structure, final_defect_struc=final_defect_structure + ) + defect_frac_sc_coords = final_defect_structure[dfi].frac_coords + + parameters = { + 'defect_energy': defect_task['output']['energy'], + 'bulk_energy': bulk_task['output']['energy'], + 'initial_defect_structure': init_defect_structure, + 'final_defect_structure': final_defect_structure, + 'vbm': bulk_task['output']['vbm'], + 'cbm': bulk_task['output']['cbm'], + 'defect_frac_sc_coords': defect_frac_sc_coords, + 'entry_id': defect_task.get('task_id') + } + + # TODO Should probably get these even if not needed for corrections + if init_defect_structure.charge != 0: + axis_grid = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_grid']] if bulk_task['output'].get('v_hartree_grid') else None + bulk_planar_averages = [[float(x) for x in _] for _ in bulk_task['output']['v_hartree_planar']] if bulk_task['output'].get('v_hartree_planar') else None + defect_planar_averages = [[float(x) for x in _] for _ in defect_task['output']['v_hartree_planar']] if defect_task['output'].get('v_hartree_planar') else None + bulk_site_averages = [float(x) for x in bulk_task['output']['v_hartree_sites']] if bulk_task['output'].get('v_hartree_sites') else None + defect_site_averages = [float(x) for x in defect_task['output']['v_hartree_sites']] if defect_task['output'].get('v_hartree_sites') else None + + parameters['axis_grid'] = axis_grid + parameters['bulk_planar_averages'] = bulk_planar_averages + parameters['defect_planar_averages'] = defect_planar_averages + parameters['defect_frac_sc_coords'] = defect_frac_sc_coords + parameters['site_matching_indices'] = site_matching_indices + parameters['bulk_site_averages'] = bulk_site_averages + parameters['defect_site_averages'] = defect_site_averages + + return parameters + + +# TODO Some of this should be done by DefectCompatibility, +# but it's not clear how to do that since 2d materials +# are not tagged in any particular way to allow defect compatibility +# to decide which correction to apply +class DefectDoc2d(DefectDoc): + """ + DefectDoc subclass for 2D defects + """ + + @classmethod + def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'): + """ + Get defect entry from defect and bulk tasks. + + Args: + defect_task: task dict for the defect calculation + bulk_task: task dict for the bulk calculation + dielectric: dielectric tensor for the defect calculation + query: query string for defect entry + """ + parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task) + if dielectric: + eps_parallel = (dielectric[0][0] + dielectric[1][1]) / 2 + eps_perp = dielectric[2][2] + parameters['dielectric'] = (eps_parallel - 1) / (1 - 1/eps_perp) + + defect_entry = DefectEntry( + cls.get_defect_from_task(query=query, task=defect_task), + uncorrected_energy=parameters.pop('defect_energy') - parameters.pop('bulk_energy'), + parameters=parameters, + entry_id=parameters.pop('entry_id') + ) + + DefectCompatibility().process_entry(defect_entry, perform_corrections=False) + with ScratchDir('.'): + fc = FreysoldtCorrection2d( + defect_entry.parameters.get('dielectric'), + "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2 + ) + lref = VolumetricData( + structure=Structure.from_dict(bulk_task['input']['structure']), + data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])} + ) + ldef = VolumetricData( + structure=Structure.from_dict(defect_task['input']['structure']), + data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])} + ) + lref.write_file("LOCPOT.ref") + ldef.write_file("LOCPOT.def") + ecorr = fc.get_correction(defect_entry) + defect_entry.corrections.update(ecorr) + defect_entry.parameters['freysoldt2d_meta'] = fc.metadata + + defect_entry_as_dict = defect_entry.as_dict() + defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id'] # this seemed necessary for legacy db + + return defect_entry + +class DefectThermoDoc(BaseModel): + + class Config: + arbitrary_types_allowed = True + + property_name: ClassVar[str] = "Defect Thermo" + + material_id: str = Field(None, description="Unique material ID for the host material") + + task_ids: Dict = Field( + None, description="All task ids used in creating these phase diagrams" + ) + + #TODO not by run type... in principle shouldn't be this way, but DOS is almost always GGA + bulk_dos: CompleteDos = Field(None, description="Complete Density of States for the bulk Structure") + + defect_phase_diagrams: Mapping[RunType, DefectPhaseDiagram] = Field( + None, description="Defect phase diagrams for each run type" + ) + + #brouwer_diagrams: Mapping[RunType, BrouwerDiagram] = Field( + # None, description="Brouwer diagrams" + #) + + ''' + # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this + @validator("brouwer_diagrams", pre=True) + def decode(cls, brouwer_diagrams): + for e in brouwer_diagrams: + if isinstance(brouwer_diagrams[e], dict): + brouwer_diagrams[e] = MontyDecoder().process_decoded( + {k: v for k, v in brouwer_diagrams[e].items()} + ) + return brouwer_diagrams + ''' + @classmethod + def from_docs(cls, defects: List[DefectDoc], thermos: List[ThermoDoc], electronic_structure: CompleteDos) -> "DefectThermoDoc": + + DEFAULT_RT = RunType('GGA') # TODO NEED A procedure for getting all GGA or GGA+U keys + DEFAULT_RT_U = RunType('GGA+U') + + mpid = defects[0].material_id + + #chempots = { + # m.structure.composition.elements[0]: + # {rt: m.entries[rt].energy_per_atom for rt in m.entries} + # for m in materials if m.structure.composition.is_element + #} + + chempots = { + td.composition.elements[0]: td.energy_per_atom + for td in thermos if td.composition.is_element + } + + defect_entries = {} + defect_phase_diagram = {} + vbms = {} + band_gaps = {} + brouwer_diagrams = {} + task_ids = {} + + dos = CompleteDos.from_dict(electronic_structure) + bg = dos.get_gap() + + """ + for m in materials: + for rt, ent in m.entries.items(): + __found_chempots__ = True + + # Chempot shift + for el, amt in ent.composition.element_composition.items(): + if Element(el) not in chempots: + __found_chempots__ = False + break + _rt = DEFAULT_RT if rt not in chempots[Element(el)] else rt + adj = CompositionEnergyAdjustment(-chempots[Element(el)][_rt], + n_atoms=amt, + name=f"Elemental shift {el} to formation energy space" + ) + ent.energy_adjustments.append(adj) + + if not __found_chempots__: + continue + + # Other stuff + band_gaps[rt] = bg + ent.parameters['software'] = 'cp2k' + ent.structure.remove_spin() + ent.structure.remove_oxidation_states() + MaterialsProject2020Compatibility().process_entry(ent) + """ + for d in defects: + for rt, ent in d.entries.items(): + # Chempot shift + __found_chempots__ = True + comp = Composition(ent.defect.defect_composition, allow_negative=True) - \ + ent.defect.bulk_structure.composition + for el, amt in comp.items(): + if Element(el) not in chempots: + __found_chempots__ = False + break + ent.corrections[f"Elemental shift {el} to formation energy space"] = -amt * chempots[Element(el)] + + if not __found_chempots__: + continue + + # Other stuff + if rt not in defect_entries: + defect_entries[rt] = [] + if rt not in task_ids: + task_ids[rt] = set() + defect_entries[rt].append(d.entries[rt]) + vbms[rt] = d.entries[rt].parameters['vbm'] # TODO Need to find best vbm + task_ids[rt].update(d.task_ids) + + for run_type in defect_entries: + # TODO MUST FILTER COMPATIBLE AT SOME POINT + defect_phase_diagram[run_type] = DefectPhaseDiagram( + entries=defect_entries[run_type], + vbm=vbms[run_type], + band_gap=bg, #band_gaps[run_type], # TODO Need to do rt dependent band gap + filter_compatible=False + ) + """ + brouwer_diagrams[run_type] = BrouwerDiagram( + defect_phase_diagram=defect_phase_diagram[run_type], + bulk_dos=dos, + entries=[ + m.entries[run_type] + if run_type in m.entries else m.entries[DEFAULT_RT_U] + if DEFAULT_RT_U in m.entries else m.entries[DEFAULT_RT] + for m in materials + ] + ) + """ + + data = { + 'material_id': mpid, + 'task_ids': task_ids, + 'bulk_dos': dos, + 'defect_phase_diagrams': defect_phase_diagram, + } + + return cls(**{k: v for k, v in data.items()}) + + +def get_dos(mpid): + with MPRester() as mp: + return mp.get_dos_by_material_id(mpid) + + +def get_entries(chemsys): + with MPRester() as mp: + return mp.get_entries_in_chemsys(chemsys) + + +def unpack(query, d): + if not query: + return d + if isinstance(d, List): + return unpack(query[1:], d.__getitem__(int(query.pop(0)))) + return unpack(query[1:], d.__getitem__(query.pop(0))) diff --git a/emmet-core/emmet/core/mpid.py b/emmet-core/emmet/core/mpid.py index 8a74b8fb91..69e22cfed2 100644 --- a/emmet-core/emmet/core/mpid.py +++ b/emmet-core/emmet/core/mpid.py @@ -32,7 +32,7 @@ def __init__(self, val: Union["MPID", int, str]): elif isinstance(val, str): parts = val.split("-") - parts[1] = int(parts[1]) # type: ignore + parts[-1] = int(parts[-1]) # type: ignore self.parts = tuple(parts) self.string = val diff --git a/emmet-core/emmet/core/settings.py b/emmet-core/emmet/core/settings.py index 9f6e606f13..a732f4f792 100644 --- a/emmet-core/emmet/core/settings.py +++ b/emmet-core/emmet/core/settings.py @@ -30,6 +30,24 @@ class EmmetSettings(BaseSettings): DEFAULT_CONFIG_FILE_PATH, description="File to load alternative defaults from" ) + CP2K_SPECIAL_TAGS: List[str] = Field( + [], description="Special tags to prioritize for CP2K task documents" + ) + + CP2K_QUALITY_SCORES: Dict[str, int] = Field( + {"HYBRID": 4, "SCAN": 3, "GGA+U": 2, "GGA": 1}, + description="Dictionary Mapping CP2K calculation run types to rung level for CP2K materials builders", + ) + + CP2K_DEFAULT_INPUT_SETS: Dict = Field( + { + "Static DFT": "pymatgen.io.cp2k.sets.StaticSet", + "GGA Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet", + "GGA+U Structure Optimization": "pymatgen.io.cp2k.sets.RelaxSet", + }, + description="Default input sets for task validation", + ) + LTOL: float = Field( 0.2, description="Fractional length tolerance for structure matching" ) diff --git a/emmet-core/emmet/core/vasp/calc_types/enums.py b/emmet-core/emmet/core/vasp/calc_types/enums.py index b946992d1c..6f300a1a33 100644 --- a/emmet-core/emmet/core/vasp/calc_types/enums.py +++ b/emmet-core/emmet/core/vasp/calc_types/enums.py @@ -21,7 +21,7 @@ class RunType(ValueEnum): HF = "HF" HSE03 = "HSE03" HSE06 = "HSE06" - PB0 = "PB0" + PBE0 = "PBE0" M06L = "M06L" MBJL = "MBJL" MS0 = "MS0" @@ -52,7 +52,7 @@ class RunType(ValueEnum): HF_U = "HF+U" HSE03_U = "HSE03+U" HSE06_U = "HSE06+U" - PB0_U = "PB0+U" + PBE0_U = "PBE0+U" M06L_U = "M06L+U" MBJL_U = "MBJL+U" MS0_U = "MS0+U" @@ -237,17 +237,17 @@ class CalcType(ValueEnum): HSE06_Structure_Optimization = "HSE06 Structure Optimization" HSE06_Deformation = "HSE06 Deformation" HSE06_Unrecognized = "HSE06 Unrecognized" - PB0_NSCF_Line = "PB0 NSCF Line" - PB0_NSCF_Uniform = "PB0 NSCF Uniform" - PB0_Dielectric = "PB0 Dielectric" - PB0_DFPT = "PB0 DFPT" - PB0_DFPT_Dielectric = "PB0 DFPT Dielectric" - PB0_NMR_Nuclear_Shielding = "PB0 NMR Nuclear Shielding" - PB0_NMR_Electric_Field_Gradient = "PB0 NMR Electric Field Gradient" - PB0_Static = "PB0 Static" - PB0_Structure_Optimization = "PB0 Structure Optimization" - PB0_Deformation = "PB0 Deformation" - PB0_Unrecognized = "PB0 Unrecognized" + PBE0_NSCF_Line = "PBE0 NSCF Line" + PBE0_NSCF_Uniform = "PBE0 NSCF Uniform" + PBE0_Dielectric = "PBE0 Dielectric" + PBE0_DFPT = "PBE0 DFPT" + PBE0_DFPT_Dielectric = "PBE0 DFPT Dielectric" + PBE0_NMR_Nuclear_Shielding = "PBE0 NMR Nuclear Shielding" + PBE0_NMR_Electric_Field_Gradient = "PBE0 NMR Electric Field Gradient" + PBE0_Static = "PBE0 Static" + PBE0_Structure_Optimization = "PBE0 Structure Optimization" + PBE0_Deformation = "PBE0 Deformation" + PBE0_Unrecognized = "PBE0 Unrecognized" M06L_NSCF_Line = "M06L NSCF Line" M06L_NSCF_Uniform = "M06L NSCF Uniform" M06L_Dielectric = "M06L Dielectric" @@ -582,17 +582,17 @@ class CalcType(ValueEnum): HSE06_U_Structure_Optimization = "HSE06+U Structure Optimization" HSE06_U_Deformation = "HSE06+U Deformation" HSE06_U_Unrecognized = "HSE06+U Unrecognized" - PB0_U_NSCF_Line = "PB0+U NSCF Line" - PB0_U_NSCF_Uniform = "PB0+U NSCF Uniform" - PB0_U_Dielectric = "PB0+U Dielectric" - PB0_U_DFPT = "PB0+U DFPT" - PB0_U_DFPT_Dielectric = "PB0+U DFPT Dielectric" - PB0_U_NMR_Nuclear_Shielding = "PB0+U NMR Nuclear Shielding" - PB0_U_NMR_Electric_Field_Gradient = "PB0+U NMR Electric Field Gradient" - PB0_U_Static = "PB0+U Static" - PB0_U_Structure_Optimization = "PB0+U Structure Optimization" - PB0_U_Deformation = "PB0+U Deformation" - PB0_U_Unrecognized = "PB0+U Unrecognized" + PBE0_U_NSCF_Line = "PBE0+U NSCF Line" + PBE0_U_NSCF_Uniform = "PBE0+U NSCF Uniform" + PBE0_U_Dielectric = "PBE0+U Dielectric" + PBE0_U_DFPT = "PBE0+U DFPT" + PBE0_U_DFPT_Dielectric = "PBE0+U DFPT Dielectric" + PBE0_U_NMR_Nuclear_Shielding = "PBE0+U NMR Nuclear Shielding" + PBE0_U_NMR_Electric_Field_Gradient = "PBE0+U NMR Electric Field Gradient" + PBE0_U_Static = "PBE0+U Static" + PBE0_U_Structure_Optimization = "PBE0+U Structure Optimization" + PBE0_U_Deformation = "PBE0+U Deformation" + PBE0_U_Unrecognized = "PBE0+U Unrecognized" M06L_U_NSCF_Line = "M06L+U NSCF Line" M06L_U_NSCF_Uniform = "M06L+U NSCF Uniform" M06L_U_Dielectric = "M06L+U Dielectric" diff --git a/emmet-core/emmet/core/vasp/validation.py b/emmet-core/emmet/core/vasp/validation.py index 817d315ed1..b3dc6af633 100644 --- a/emmet-core/emmet/core/vasp/validation.py +++ b/emmet-core/emmet/core/vasp/validation.py @@ -43,6 +43,8 @@ class ValidationDoc(EmmetBaseModel): Validation document for a VASP calculation """ + calc_code = "vasp" + task_id: MPID = Field(..., description="The task_id for this validation document") valid: bool = Field(False, description="Whether this task is valid or not") last_updated: datetime = Field(