diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..7781a1d6 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +concurrency = multiprocessing \ No newline at end of file diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index cb055073..9c22af6a 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -24,8 +24,7 @@ jobs: run: | python -m pip install --upgrade pip pip install mypy pydocstyle pylint black flake8 pyproject-flake8==6.0.0 - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . + pip install -e .[featurizer] pip install types-setuptools - name: mypy run: | diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 4b8d54a3..b6530963 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,8 +30,7 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest pytest-mock pytest-split pytest-cov python -m pip install types-setuptools - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . + pip install -e .[featurizer] - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names @@ -48,7 +47,7 @@ jobs: with: name: coverage-${{ matrix.split }} path: .coverage - + coverage: needs: build runs-on: ubuntu-latest diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 97134b4f..e8c8477e 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -29,8 +29,7 @@ jobs: python -m pip install flake8 pytest pytest-mock python -m pip install types-setuptools python -m pip install coverage - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . 
+ pip install -e .[featurizer] pip install build - name: Build package run: python -m build diff --git a/docs/source/conf.py b/docs/source/conf.py index bae17433..20a984c2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,14 +12,15 @@ # import os import sys + sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = 'Lobsterpy' -copyright = '2022, Janine George' -author = 'Janine George' +project = "Lobsterpy" +copyright = "2022, Janine George" +author = "Janine George" # -- General configuration --------------------------------------------------- @@ -43,12 +44,18 @@ source_suffix = [".rst", ".md"] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["../../lobsterpy/test","../../lobsterpy/cohp/test","../../lobsterpy/plotting/test","../../lobsterpy/TestData","Thumbs.db", ".DS_Store",] +exclude_patterns = ["../../lobsterpy/test", + "../../lobsterpy/cohp/test", + "../../lobsterpy/plotting/test", + "../../lobsterpy/TestData", + "Thumbs.db", + ".DS_Store", +] def run_apidoc(_): import subprocess @@ -60,7 +67,15 @@ def run_apidoc(_): excludes2 = glob.glob(os.path.join(output_path, "../../lobsterpy/plotting/test")) module = os.path.join(output_path, "../../lobsterpy") cmd_path = "sphinx-apidoc" - command = [cmd_path, "-e", "-o", output_path, module, " ".join(excludes)," ".join(excludes1)," ".join(excludes2), "--force"] + command = [cmd_path, + "-e", "-o", + output_path, + module, + " ".join(excludes), + " ".join(excludes1), + " ".join(excludes2), + "--force" + ] subprocess.check_call(command) @@ -73,9 +88,9 @@ def setup(app): # The theme to use for HTML and HTML Help pages. 
See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_book_theme' +html_theme = "sphinx_book_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] diff --git a/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-1000.json.gz b/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-1000.json.gz new file mode 100755 index 00000000..aea9ab34 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-1000.json.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-2176.json.gz b/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-2176.json.gz new file mode 100755 index 00000000..43ae871f Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-2176.json.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-463.json.gz b/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-463.json.gz new file mode 100755 index 00000000..36618971 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/JSONS/mp-463.json.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/CHARGE.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/CHARGE.lobster.gz new file mode 100755 index 00000000..2fbf5f2a Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/CHARGE.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COBICAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COBICAR.lobster.gz new file mode 100755 index 00000000..0d4cc072 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COBICAR.lobster.gz differ diff --git 
a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COHPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COHPCAR.lobster.gz new file mode 100755 index 00000000..83b5e584 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COHPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COOPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COOPCAR.lobster.gz new file mode 100755 index 00000000..9ec6bf0e Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/COOPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOBILIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOBILIST.lobster.gz new file mode 100755 index 00000000..a29fc55d Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOBILIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOHPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOHPLIST.lobster.gz new file mode 100755 index 00000000..cb448536 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOHPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOOPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOOPLIST.lobster.gz new file mode 100755 index 00000000..dbced562 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/ICOOPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/MadelungEnergies.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/MadelungEnergies.lobster.gz new file mode 100755 index 00000000..76507fb3 Binary files /dev/null and 
b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/MadelungEnergies.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/POSCAR.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/POSCAR.gz new file mode 100755 index 00000000..e83fec4b Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-1000/POSCAR.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/CHARGE.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/CHARGE.lobster.gz new file mode 100755 index 00000000..d53030f3 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/CHARGE.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COBICAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COBICAR.lobster.gz new file mode 100755 index 00000000..c22ebf8a Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COBICAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COHPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COHPCAR.lobster.gz new file mode 100755 index 00000000..5d094331 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COHPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COOPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COOPCAR.lobster.gz new file mode 100755 index 00000000..325d826c Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/COOPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOBILIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOBILIST.lobster.gz new file mode 100755 index 00000000..bd6701ef 
Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOBILIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOHPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOHPLIST.lobster.gz new file mode 100755 index 00000000..caf735f0 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOHPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOOPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOOPLIST.lobster.gz new file mode 100755 index 00000000..0a41274f Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/ICOOPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/MadelungEnergies.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/MadelungEnergies.lobster.gz new file mode 100755 index 00000000..191eee05 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/MadelungEnergies.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/POSCAR.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/POSCAR.gz new file mode 100755 index 00000000..73690d39 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-2176/POSCAR.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/CHARGE.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/CHARGE.lobster.gz new file mode 100755 index 00000000..66fbebab Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/CHARGE.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COBICAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COBICAR.lobster.gz 
new file mode 100755 index 00000000..8a22f4a0 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COBICAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COHPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COHPCAR.lobster.gz new file mode 100755 index 00000000..129ad9d0 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COHPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COOPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COOPCAR.lobster.gz new file mode 100755 index 00000000..1d9df23e Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/COOPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOBILIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOBILIST.lobster.gz new file mode 100755 index 00000000..c033b13b Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOBILIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOHPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOHPLIST.lobster.gz new file mode 100755 index 00000000..c9c70763 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOHPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOOPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOOPLIST.lobster.gz new file mode 100755 index 00000000..c5b22bf9 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/ICOOPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/MadelungEnergies.lobster.gz 
b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/MadelungEnergies.lobster.gz new file mode 100755 index 00000000..61381fa5 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/MadelungEnergies.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/POSCAR.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/POSCAR.gz new file mode 100755 index 00000000..dd7ab942 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs/mp-463/POSCAR.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/CHARGE.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/CHARGE.lobster.gz new file mode 100755 index 00000000..d53030f3 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/CHARGE.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/COHPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/COHPCAR.lobster.gz new file mode 100755 index 00000000..5d094331 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/COHPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/COOPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/COOPCAR.lobster.gz new file mode 100755 index 00000000..325d826c Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/COOPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/ICOHPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/ICOHPLIST.lobster.gz new file mode 100755 index 00000000..caf735f0 Binary files /dev/null and 
b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/ICOHPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/ICOOPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/ICOOPLIST.lobster.gz new file mode 100755 index 00000000..0a41274f Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/ICOOPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/MadelungEnergies.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/MadelungEnergies.lobster.gz new file mode 100755 index 00000000..191eee05 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/MadelungEnergies.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/POSCAR.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/POSCAR.gz new file mode 100755 index 00000000..73690d39 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/mp-2176/POSCAR.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/CHARGE.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/CHARGE.lobster.gz new file mode 100755 index 00000000..2fbf5f2a Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/CHARGE.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/COBICAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/COBICAR.lobster.gz new file mode 100755 index 00000000..0d4cc072 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/COBICAR.lobster.gz 
differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/COHPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/COHPCAR.lobster.gz new file mode 100755 index 00000000..83b5e584 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/COHPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/ICOBILIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/ICOBILIST.lobster.gz new file mode 100755 index 00000000..a29fc55d Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/ICOBILIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/ICOHPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/ICOHPLIST.lobster.gz new file mode 100755 index 00000000..cb448536 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/ICOHPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/MadelungEnergies.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/MadelungEnergies.lobster.gz new file mode 100755 index 00000000..76507fb3 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/MadelungEnergies.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/POSCAR.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/POSCAR.gz new file mode 100755 index 00000000..e83fec4b Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/mp-1000/POSCAR.gz differ diff --git 
a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COBICAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COBICAR.lobster.gz new file mode 100755 index 00000000..8a22f4a0 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COBICAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COHPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COHPCAR.lobster.gz new file mode 100755 index 00000000..129ad9d0 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COHPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COOPCAR.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COOPCAR.lobster.gz new file mode 100755 index 00000000..1d9df23e Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/COOPCAR.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOBILIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOBILIST.lobster.gz new file mode 100755 index 00000000..c033b13b Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOBILIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOHPLIST.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOHPLIST.lobster.gz new file mode 100755 index 00000000..c9c70763 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOHPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOOPLIST.lobster.gz 
b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOOPLIST.lobster.gz new file mode 100755 index 00000000..c5b22bf9 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/ICOOPLIST.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/MadelungEnergies.lobster.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/MadelungEnergies.lobster.gz new file mode 100755 index 00000000..61381fa5 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/MadelungEnergies.lobster.gz differ diff --git a/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/POSCAR.gz b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/POSCAR.gz new file mode 100755 index 00000000..dd7ab942 Binary files /dev/null and b/lobsterpy/TestData/Featurizer_test_data/Lobster_calcs_exceptions/3/mp-463/POSCAR.gz differ diff --git a/lobsterpy/TestData/JSONS/mp-1249.json.gz b/lobsterpy/TestData/JSONS/mp-1249.json.gz new file mode 100755 index 00000000..0b3801c2 Binary files /dev/null and b/lobsterpy/TestData/JSONS/mp-1249.json.gz differ diff --git a/lobsterpy/TestData/JSONS/mp-14652.json.gz b/lobsterpy/TestData/JSONS/mp-14652.json.gz new file mode 100755 index 00000000..04f3deea Binary files /dev/null and b/lobsterpy/TestData/JSONS/mp-14652.json.gz differ diff --git a/lobsterpy/TestData/JSONS/mp-1958.json.gz b/lobsterpy/TestData/JSONS/mp-1958.json.gz new file mode 100755 index 00000000..b0b65bf5 Binary files /dev/null and b/lobsterpy/TestData/JSONS/mp-1958.json.gz differ diff --git a/lobsterpy/featurize/__init__.py b/lobsterpy/featurize/__init__.py new file mode 100644 index 00000000..67a30e86 --- /dev/null +++ b/lobsterpy/featurize/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) lobsterpy development team +# Distributed under the terms of a BSD 3-Clause "New" or "Revised" 
class BatchSummaryFeaturizer:
    """
    Batch featurizer that generates summary features from LOBSTER data.

    Args:
        path_to_lobster_calcs: path to root directory consisting of all lobster calc
        path_to_jsons: path to root directory consisting of all lobster lightweight jsons
        feature_type: set the feature type for moment features.
            Possible options are "bonding", "antibonding" or "overall"
        charge_type: set charge type used for computing ionicity. Possible options are
            "mulliken", "loewdin" or "both"
        bonds: bond selection forwarded unchanged to FeaturizeLobsterpy (default "all").
            NOTE(review): this docstring used to list "all_bonds" / "cation_anion_bonds",
            which does not match the default -- confirm the accepted values.
        include_cobi_data: bool stating to include COBICAR.lobster features
        include_coop_data: bool stating to include COOPCAR.lobster features
        e_range: range of energy relative to fermi for which moment features needs to be computed
        n_jobs: parallel processes to run

    Methods:
        get_df: returns a pandas dataframe with summary features
    """

    def __init__(
        self,
        path_to_lobster_calcs: str,
        path_to_jsons: str | None = None,
        feature_type: str = "antibonding",
        charge_type: str = "both",
        bonds: str = "all",
        include_cobi_data: bool = False,
        include_coop_data: bool = False,
        e_range: List[float] = [-5.0, 0.0],
        n_jobs: int = 4,
    ):
        self.path_to_lobster_calcs = path_to_lobster_calcs
        self.path_to_jsons = path_to_jsons
        self.feature_type = feature_type
        self.charge_type = charge_type
        self.bonds = bonds
        self.include_cobi_data = include_cobi_data
        self.include_coop_data = include_coop_data
        # Copy so that instances never share (or mutate) the default list object.
        self.e_range = list(e_range)
        self.n_jobs = n_jobs

    @staticmethod
    def _resolve_file(dir_name: Path, file_name: str) -> Path:
        """Return ``dir_name / file_name``, or its ``.gz`` sibling when only that one exists."""
        plain = dir_name / file_name
        if not plain.exists():
            gzipped = plain.with_name(plain.name + ".gz")
            if gzipped.exists():
                return gzipped
        return plain

    @staticmethod
    def _list_entries(root: str, want_dirs: bool) -> List[str]:
        """
        List the entries of ``root`` as joined paths.

        Entries whose name starts with "t" or "." are skipped; only directories are kept
        when ``want_dirs`` is True, only non-directories otherwise.
        """
        entries = []
        for name in os.listdir(root):
            # NOTE(review): the "t" prefix filter presumably skips test/temp artefacts -- confirm.
            if name.startswith(("t", ".")):
                continue
            full_path = os.path.join(root, name)
            # The directory test must use the joined path; the bare name would be
            # resolved against the current working directory instead of ``root``.
            if os.path.isdir(full_path) == want_dirs:
                entries.append(full_path)
        return entries

    def _run_parallel(self, worker, items: List[str], desc: str) -> pd.DataFrame:
        """Map ``worker`` over ``items`` in a process pool and stack the resulting frames by index."""
        with mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool:
            # Consume inside the pool so the progress bar advances as results arrive.
            rows = list(
                tqdm(
                    pool.imap_unordered(worker, items, chunksize=1),
                    total=len(items),
                    desc=desc,
                )
            )
            # close()/join() let the workers exit normally; leaving the ``with``
            # block alone would terminate() them.
            pool.close()
            pool.join()

        df = pd.concat(rows)
        # imap_unordered yields in completion order, so restore a stable ordering.
        df.sort_index(inplace=True)
        return df

    def _featurizelobsterpy(self, file_name_or_path) -> pd.DataFrame:
        """
        Wrapper method to featurize Lobsterpy condensed bonding analysis data.

        A path pointing at a file is passed to FeaturizeLobsterpy as ``path_to_json``;
        anything else is passed as ``path_to_lobster_calc``.

        Returns:
            The pandas dataframe produced by ``FeaturizeLobsterpy.get_df()``

        """
        if Path(file_name_or_path).is_file():
            featurize_lobsterpy = FeaturizeLobsterpy(
                path_to_json=file_name_or_path,
                bonds=self.bonds,
            )
        else:
            featurize_lobsterpy = FeaturizeLobsterpy(
                path_to_lobster_calc=file_name_or_path,
                bonds=self.bonds,
            )

        return featurize_lobsterpy.get_df()

    def _summarize_coxx(
        self,
        dir_name: Path,
        structure_path: Path,
        kind: str,
        check_structure: bool = False,
        **kind_flags,
    ) -> pd.DataFrame:
        """
        Summarize one ``<kind>CAR.lobster`` / ``I<kind>LIST.lobster`` pair with FeaturizeCOXX.

        ``kind`` is "COHP", "COBI" or "COOP"; ``kind_flags`` carries the matching
        ``are_cobis`` / ``are_coops`` switch.

        Raises:
            FileNotFoundError: if a required file (plain or gzipped) is missing
        """
        coxxcar_path = self._resolve_file(dir_name, f"{kind}CAR.lobster")
        icoxxlist_path = self._resolve_file(dir_name, f"I{kind}LIST.lobster")

        found = coxxcar_path.exists() and icoxxlist_path.exists()
        if check_structure:
            found = found and structure_path.exists()
        if not found:
            structure_hint = "POSCAR or " if check_structure else ""
            raise FileNotFoundError(
                f"{kind}CAR.lobster or {structure_hint}I{kind}LIST.lobster file "
                f"not found in {dir_name.name}"
            )

        coxx = FeaturizeCOXX(
            path_to_coxxcar=str(coxxcar_path),
            path_to_icoxxlist=str(icoxxlist_path),
            path_to_structure=str(structure_path),
            feature_type=self.feature_type,
            e_range=self.e_range,
            **kind_flags,
        )
        return coxx.get_summarized_coxx_df()

    def _featurizecoxx(self, path_to_lobster_calc) -> pd.DataFrame:
        """
        Wrapper method to featurize COHP/COBI/COOPCAR data that uses FeaturizeCOXX under the hood.

        COHP features are always computed; COBI and COOP features are appended as extra
        columns when ``include_cobi_data`` / ``include_coop_data`` are set.

        Returns:
            A pandas dataframe with the summarized COXX features

        Raises:
            FileNotFoundError: if a required LOBSTER file or the POSCAR is missing

        """
        dir_name = Path(path_to_lobster_calc)
        structure_path = self._resolve_file(dir_name, "POSCAR")

        frames = [
            self._summarize_coxx(
                dir_name, structure_path, "COHP", check_structure=True
            )
        ]
        if self.include_cobi_data:
            frames.append(
                self._summarize_coxx(dir_name, structure_path, "COBI", are_cobis=True)
            )
        if self.include_coop_data:
            frames.append(
                self._summarize_coxx(dir_name, structure_path, "COOP", are_coops=True)
            )

        return frames[0] if len(frames) == 1 else pd.concat(frames, axis=1)

    def _featurizecharges(self, path_to_lobster_calc) -> pd.DataFrame:
        """
        Wrapper method to featurize CHARGE.lobster data that uses FeaturizeCharges under the hood.

        Returns:
            The dataframe from ``FeaturizeCharges.get_df()``; for ``charge_type="both"``
            the Mulliken and Loewdin frames are concatenated column-wise

        Raises:
            FileNotFoundError: if CHARGE.lobster or POSCAR (plain or gzipped) is missing
            ValueError: if ``charge_type`` is not "mulliken", "loewdin" or "both"

        """
        dir_name = Path(path_to_lobster_calc)
        charge_path = self._resolve_file(dir_name, "CHARGE.lobster")
        structure_path = self._resolve_file(dir_name, "POSCAR")

        if not (charge_path.exists() and structure_path.exists()):
            raise FileNotFoundError(
                f"CHARGE.lobster or POSCAR not found in {dir_name.name}"
            )

        if self.charge_type == "both":
            charge_types = ("mulliken", "loewdin")
        elif self.charge_type in ("mulliken", "loewdin"):
            charge_types = (self.charge_type,)
        else:
            # Previously an unknown value fell through to an unbound local variable.
            raise ValueError(
                f"Unknown charge_type {self.charge_type!r}; "
                'expected "mulliken", "loewdin" or "both"'
            )

        frames = [
            FeaturizeCharges(
                path_to_charge=str(charge_path),
                path_to_structure=str(structure_path),
                charge_type=charge_type,
            ).get_df()
            for charge_type in charge_types
        ]
        return frames[0] if len(frames) == 1 else pd.concat(frames, axis=1)

    def get_df(self) -> pd.DataFrame:
        """
        Return a pandas dataframe with summary features extracted from LOBSTER files
        as columns. Uses multiprocessing to speed up the process.

        LobsterPy features are read from the files in ``path_to_jsons`` when it is set,
        otherwise from the calculation directories in ``path_to_lobster_calcs``; COXX and
        charge features always come from the calculation directories.

        Returns:
            A pandas dataframe joining the three feature sets column-wise

        """
        calc_dirs = self._list_entries(self.path_to_lobster_calcs, want_dirs=True)
        if self.path_to_jsons:
            lobsterpy_inputs = self._list_entries(self.path_to_jsons, want_dirs=False)
        else:
            lobsterpy_inputs = calc_dirs

        df_lobsterpy = self._run_parallel(
            self._featurizelobsterpy,
            lobsterpy_inputs,
            "Generating LobsterPy summary stats",
        )
        df_coxx = self._run_parallel(
            self._featurizecoxx,
            calc_dirs,
            "Generating COHP/COOP/COBI summary stats",
        )
        df_charges = self._run_parallel(
            self._featurizecharges,
            calc_dirs,
            "Generating charge based features",
        )

        return pd.concat([df_lobsterpy, df_coxx, df_charges], axis=1)
pd.DataFrame: + """ + This function will compute pairwise similarity index for each fingerprint object in input dataframe + + Returns: + A Pandas dataframe + """ + matrix = np.full( + (self.fingerprint_df.shape[0], self.fingerprint_df.shape[0]), np.nan + ) + for i, (row, col) in enumerate(self.fingerprint_df.iterrows()): + for j, (row1, col1) in enumerate(self.fingerprint_df.iterrows()): + if self.tanimoto: + simi = self._get_fp_similarity( + col["COXX_FP"], + col1["COXX_FP"], + tanimoto=self.tanimoto, + normalize=False, + ) + else: + simi = self._get_fp_similarity( + col["COXX_FP"], + col1["COXX_FP"], + tanimoto=False, + normalize=True, + ) + matrix[i][j] = simi + + df = pd.DataFrame( + matrix, + index=list(self.fingerprint_df.index), + columns=list(self.fingerprint_df.index), + ) + + return df + + @staticmethod + def _fp_to_dict(fp) -> dict: + """ + Converts a fingerprint into a dictionary + + Args: + fp: The fingerprint to be converted into a dictionary + + Returns: + dict: A dict of the fingerprint Keys=type, Values=np.ndarray(energies, cohp) + """ + fp_dict = {} + fp_dict[fp[2]] = np.array([fp[0], fp[1]], dtype="object").T + + return fp_dict + + @staticmethod + def _get_fp_similarity( + fp1: NamedTuple, + fp2: NamedTuple, + col: int = 1, + pt: int | str = "All", + normalize: bool = False, + tanimoto: bool = True, + ) -> float: + """ + Calculates the similarity index (dot product) of two fingerprints + + Args: + fp1 (NamedTuple): The 1st dos fingerprint object + fp2 (NamedTuple): The 2nd dos fingerprint object + col (int): The item in the fingerprints (0:energies,1: coxxs) to take the dot product of (default is 1) + pt (int or str) : The index of the point that the dot product is to be taken (default is All) + normalize (bool): If True normalize the scalar product to 1 (default is False) + tanimoto (bool): If True will compute Tanimoto index (default is False) + + Raises: + ValueError: If both tanimoto and normalize are set to True. 
+ + Returns: + Similarity index (float): The value of dot product + + """ + fp1_dict = ( + BatchCoxxFingerprint._fp_to_dict(fp1) if not isinstance(fp1, dict) else fp1 + ) + + fp2_dict = ( + BatchCoxxFingerprint._fp_to_dict(fp2) if not isinstance(fp2, dict) else fp2 + ) + + if pt == "All": + vec1 = np.array([pt[col] for pt in fp1_dict.values()]).flatten() + vec2 = np.array([pt[col] for pt in fp2_dict.values()]).flatten() + else: + vec1 = fp1_dict[fp1[2][pt]][col] + vec2 = fp2_dict[fp2[2][pt]][col] + + if not normalize and tanimoto: + rescale = ( + np.linalg.norm(vec1) ** 2 + + np.linalg.norm(vec2) ** 2 + - np.dot(vec1, vec2) + ) + return np.dot(vec1, vec2) / rescale + + elif not tanimoto and normalize: + rescale = np.linalg.norm(vec1) * np.linalg.norm(vec2) + return np.dot(vec1, vec2) / rescale + + elif not tanimoto and not normalize: + rescale = 1.0 + return np.dot(vec1, vec2) / rescale + + else: + raise ValueError( + "Cannot compute similarity index. Please set either normalize=True or tanimoto=True or both to False." + ) + + def _fingerprint_df(self, path_to_lobster_calc) -> pd.DataFrame: + """ + Wrapper method to get fingerprint object dataframe using FeaturizeCOXX.get_coxx_fingerprint_df method. 
+ Also helps switching the data used for fingerprint generation + + Returns: + A pandas dataframe with COXX fingerprint object + + """ + dir_name = Path(path_to_lobster_calc) + + if self.fingerprint_for.upper() == "COBI": + req_files = { + "coxxcar_path": "COBICAR.lobster", + "icoxxlist_path": "ICOBILIST.lobster", + } + for file, default_value in req_files.items(): + file_path = dir_name / default_value + req_files[file] = file_path # type: ignore + if not file_path.exists(): + gz_file_path = file_path.with_name(file_path.name + ".gz") + if gz_file_path.exists(): + req_files[file] = gz_file_path # type: ignore + + coxxcar_path = req_files.get("coxxcar_path") + icoxxlist_path = req_files.get("icoxxlist_path") + are_cobis = True + are_coops = False + + elif self.fingerprint_for.upper() == "COOP": + req_files = { + "coxxcar_path": "COOPCAR.lobster", + "icoxxlist_path": "ICOOPLIST.lobster", + } + for file, default_value in req_files.items(): + file_path = dir_name / default_value + req_files[file] = file_path # type: ignore + if not file_path.exists(): + gz_file_path = file_path.with_name(file_path.name + ".gz") + if gz_file_path.exists(): + req_files[file] = gz_file_path # type: ignore + + coxxcar_path = req_files.get("coxxcar_path") + icoxxlist_path = req_files.get("icoxxlist_path") + are_cobis = False + are_coops = True + + else: + req_files = { + "coxxcar_path": "COHPCAR.lobster", + "icoxxlist_path": "ICOHPLIST.lobster", + } + for file, default_value in req_files.items(): + file_path = dir_name / default_value + req_files[file] = file_path # type: ignore + if not file_path.exists(): + gz_file_path = file_path.with_name(file_path.name + ".gz") + if gz_file_path.exists(): + req_files[file] = gz_file_path # type: ignore + + coxxcar_path = req_files.get("coxxcar_path") + icoxxlist_path = req_files.get("icoxxlist_path") + are_cobis = False + are_coops = False + + structure_path = dir_name / "POSCAR" + if not structure_path.exists(): + gz_file_path = 
structure_path.with_name(structure_path.name + ".gz") + if gz_file_path.exists(): + structure_path = gz_file_path + + coxx = FeaturizeCOXX( + path_to_coxxcar=str(coxxcar_path), + path_to_icoxxlist=str(icoxxlist_path), + path_to_structure=str(structure_path), + feature_type=self.feature_type, + e_range=self.e_range, + are_coops=are_coops, + are_cobis=are_cobis, + ) + + df_fp = coxx.get_coxx_fingerprint_df( + spin_type=self.spin_type, + n_bins=self.n_bins, + normalize=self.normalize, + label_list=self.label_list, + ) + + return df_fp + + def _get_fingerprints_df(self) -> pd.DataFrame: + """ + Batch wrapper method to get fingerprint objects dataframe using + BatchCoxxFingerprint._fingerprint_df method. + + Returns: + A pandas dataframe with COXX fingerprint objects + + """ + paths = [ + os.path.join(self.path_to_lobster_calcs, f) + for f in os.listdir(self.path_to_lobster_calcs) + if not f.startswith("t") + and not f.startswith(".") + and os.path.isdir(os.path.join(self.path_to_lobster_calcs, f)) + ] + + with mp.Pool(processes=self.n_jobs, maxtasksperchild=1) as pool: + results = tqdm( + pool.imap_unordered(self._fingerprint_df, paths, chunksize=1), + total=len(paths), + desc="Generating {} fingerprints".format(self.fingerprint_for.upper()), + ) + pool.close() + pool.join() + row = [] + for result in results: + row.append(result) + + df = pd.concat(row) + df.sort_index(inplace=True) + + return df diff --git a/lobsterpy/featurize/core.py b/lobsterpy/featurize/core.py new file mode 100644 index 00000000..d9e68bc1 --- /dev/null +++ b/lobsterpy/featurize/core.py @@ -0,0 +1,896 @@ +# Copyright (c) lobsterpy development team +# Distributed under the terms of a BSD 3-Clause "New" or "Revised" License + +""" +This module defines classes to featurize Lobster data ready to be used for ML studies +""" + +from __future__ import annotations +import gzip +import json +import os +import warnings +from pathlib import Path +from typing import List, Tuple +from collections import 
namedtuple +import numpy as np +import numpy.typing as npt +import pandas as pd +from mendeleev import element +from pymatgen.core.structure import Structure +from pymatgen.io.lobster import Charge, Icohplist, MadelungEnergies +from pymatgen.electronic_structure.cohp import CompleteCohp +from pymatgen.electronic_structure.core import Spin +from scipy.integrate import trapezoid +from lobsterpy.cohp.analyze import Analysis + +warnings.filterwarnings("ignore") + + +class FeaturizeLobsterpy: + """ + class to featurize lobsterpy data + + Args: + path_to_lobster_calc: path containing lobster calc outputs + path_to_json: path to lobster lightweight json + bonds: "all" or "cation-anion" bonds + Attributes: + get_df: returns a pandas dataframe with relevant icohp statistical data as columns from + lobsterpy automatic bonding analysis + + """ + + def __init__( + self, + path_to_lobster_calc: str | None = None, + path_to_json: str | None = None, + bonds: str = "all", + ): + self.path_to_json = path_to_json + self.path_to_lobster_calc = path_to_lobster_calc + self.bonds = bonds + + def get_df(self, ids: str | None = None) -> pd.DataFrame: + """ + This function featurizes LobsterPy condensed bonding analysis data from + lobster lightweight json.gz files + + Returns: + Returns a pandas dataframe with lobsterpy icohp statistics + + """ + if self.path_to_json and not self.path_to_lobster_calc: + # read the lightweight lobster json files using read_lobster_lightweight_json method + data = FeaturizeLobsterpy.read_lobster_lightweight_json( + path_to_json=self.path_to_json + ) + if not ids: + ids = Path(self.path_to_json).name.split(".")[0] + + elif self.path_to_lobster_calc and not self.path_to_json: + # get lobsterpy condensed bonding analysis data using get_lobsterpy_cba_dict method + data = FeaturizeLobsterpy.get_lobsterpy_cba_dict( + path_to_lobster_calc=self.path_to_lobster_calc, bonds=self.bonds + ) + + if not ids: + ids = Path(self.path_to_lobster_calc).name + + else: + raise 
ValueError( + "Please provide either path to lightweight lobster jsons or path to lobster calc" + ) + # define a pandas dataframe + df = pd.DataFrame(index=[ids]) + + icohp_mean = [] + icohp_sum = [] + bond = [] + antibond = [] + # extract lobsterpy icohp related data for bond type specified + # Results will differ for "all" and "cation-anion" mode. + # In "all" bonds mode, the bonds will come up twice, also + # cation-cation, anion-anion bonds will also be considered + + if self.bonds == "all": + bond_type = "all_bonds" + elif self.bonds == "cation-anion": + bond_type = "cation_anion_bonds" + + if ( + not data[bond_type]["lobsterpy_data"] + or not data[bond_type]["lobsterpy_data"]["sites"] + ): + raise Exception( + "No {} bonds detected for {} structure. " + "Please switch to ´all´ bonds mode".format(self.bonds, ids) + ) + + for k, v in data[bond_type]["lobsterpy_data"]["sites"].items(): + if v["bonds"]: + for k1, v1 in v["bonds"].items(): + icohp_mean.append(float(v1["ICOHP_mean"])) + icohp_sum.append(float(v1["ICOHP_sum"])) + bond.append(v1["bonding"]["perc"]) + antibond.append(v1["antibonding"]["perc"]) + + # add ICOHP stats data (mean, min, max, standard deviation) as columns to the dataframe + df.loc[ids, "Icohp_mean_avg"] = np.mean(icohp_mean) + df.loc[ids, "Icohp_mean_max"] = np.max(icohp_mean) + df.loc[ids, "Icohp_mean_min"] = np.min(icohp_mean) + df.loc[ids, "Icohp_mean_std"] = np.std(icohp_mean) + + df.loc[ids, "Icohp_sum_avg"] = np.mean(icohp_sum) + df.loc[ids, "Icohp_sum_max"] = np.max(icohp_sum) + df.loc[ids, "Icohp_sum_min"] = np.min(icohp_sum) + df.loc[ids, "Icohp_sum_std"] = np.std(icohp_sum) + + df.loc[ids, "bonding_perc_avg"] = np.mean(bond) + df.loc[ids, "bonding_perc_max"] = np.max(bond) + df.loc[ids, "bonding_perc_min"] = np.min(bond) + df.loc[ids, "bonding_perc_std"] = np.std(bond) + + df.loc[ids, "antibonding_perc_avg"] = np.mean(antibond) + df.loc[ids, "antibonding_perc_min"] = np.min(antibond) + df.loc[ids, "antibonding_perc_max"] = 
np.max(antibond) + df.loc[ids, "antibonding_perc_std"] = np.std(antibond) + + # add madelung energies for the structure + df.loc[ids, "Madelung_Mull"] = data["madelung_energies"]["Mulliken"] + df.loc[ids, "Madelung_Loew"] = data["madelung_energies"]["Loewdin"] + + return df + + @staticmethod + def read_lobster_lightweight_json(path_to_json: str) -> dict: + """ + This method reads loads the lightweight json.gz files and returns a python dictionary object + with lobster summmarized bonding analysis data. + + Args: + path_to_json: path to lobsterpy lightweight json file + + Returns: + Returns a dictionary with lobster summmarized bonding analysis data + + """ + with gzip.open(str(path_to_json), "rb") as f: + data = json.loads(f.read().decode("utf-8")) + + lobster_data = {} + for item in data: + lobster_data.update(item) + + return lobster_data + + @staticmethod + def get_lobsterpy_cba_dict(path_to_lobster_calc: str, bonds: str) -> dict: + """ + This function uses lobsterpy.cohp.analyze.Analysis class to generate a python dictionary object + with lobster summmarized bonding analysis data. 
+ + Args: + path_to_lobster_calc: path to lobsterpy lightweight json file + bonds: "all" or "cation-anion" bonds + + Returns: + Returns a dictionary with lobster summmarized bonding analysis data + + """ + dir_name = Path(str(path_to_lobster_calc)) + + # check if files are compressed (.gz) and update file paths + req_files_lobsterpy = { + "structure_path": "POSCAR", + "cohpcar_path": "COHPCAR.lobster", + "icohplist_path": "ICOHPLIST.lobster", + "charge_path": "CHARGE.lobster", + } + + for file, default_value in req_files_lobsterpy.items(): + file_path = dir_name / default_value + req_files_lobsterpy[file] = file_path # type: ignore + if not file_path.exists(): + gz_file_path = file_path.with_name(file_path.name + ".gz") + if gz_file_path.exists(): + req_files_lobsterpy[file] = gz_file_path # type: ignore + else: + raise Exception( + "Path provided for Lobster calc directory seems incorrect." + "It does not contain COHPCAR.lobster, ICOHPLIST.lobster, POSCAR and " + "CHARGE.lobster files needed for automatic analysis using LobsterPy" + ) + + cohpcar_path = req_files_lobsterpy.get("cohpcar_path") + charge_path = req_files_lobsterpy.get("charge_path") + structure_path = req_files_lobsterpy.get("structure_path") + icohplist_path = req_files_lobsterpy.get("icohplist_path") + + which_bonds = bonds + + if which_bonds == "all": + bond_type = "all_bonds" + elif which_bonds == "cation-anion": + bond_type = "cation_anion_bonds" + + try: + analyse = Analysis( + path_to_poscar=str(structure_path), + path_to_icohplist=str(icohplist_path), + path_to_cohpcar=str(cohpcar_path), + path_to_charge=str(charge_path), + summed_spins=False, # we will always use spin polarization here + cutoff_icohp=0.10, + whichbonds=which_bonds, + ) + + data = {bond_type: {"lobsterpy_data": analyse.condensed_bonding_analysis}} + except ValueError: + data = {bond_type: {"lobsterpy_data": {}}} + + madelung_energies_path = dir_name / "MadelungEnergies.lobster" + # check if .gz file exists and update Madelung 
Energies path + if not madelung_energies_path.exists(): + gz_file_path = madelung_energies_path.with_name( + madelung_energies_path.name + ".gz" + ) + if gz_file_path.exists(): + madelung_energies_path = gz_file_path + + if madelung_energies_path.exists(): + madelung_obj = MadelungEnergies(filename=str(madelung_energies_path)) + + madelung_energies = { + "Mulliken": madelung_obj.madelungenergies_Mulliken, + "Loewdin": madelung_obj.madelungenergies_Loewdin, + "Ewald_splitting": madelung_obj.ewald_splitting, + } + data["madelung_energies"] = madelung_energies + + else: + warnings.warn( + "MadelungEnergies.lobster file not found in Lobster calc directory provided" + " Will set Madelung Engeries for crystal structure values to NaN" + ) + madelung_energies = { + "Mulliken": np.nan, + "Loewdin": np.nan, + "Ewald_splitting": np.nan, + } + + data["madelung_energies"] = madelung_energies + + return data + + +coxx_fingerprint = namedtuple( + "coxx_fingerprint", "energies coxx fp_type spin_type n_bins bin_width" +) + + +class FeaturizeCOXX: + """ + class to generate features from COHPCAR/COBICAR/COOPCAR data + + Args: + path_to_coxxcar: path to COXXCAR.lobster (e.g., "COXXCAR.lobster") + path_to_icoxxlist : path to ICOXXLIST.lobster (e.g., "ICOXXLIST.lobster") + path_to_structure : path to structure file (e.g., "POSCAR") + feature_type: set the feature type for moment features and fingerprints. 
+ Possible options are "bonding", "antibonding" or "overall" + are_cobis : bool indicating if file contains COBI/ICOBI data + are_coops : bool indicating if file contains COOP/ICOOP data + e_range : range of energy relative to fermi for which moment features needs to be computed + + Attributes: + get_df: pandas dataframe + get_coxx_fingerprint_df: pandas dataframe + + """ + + def __init__( + self, + path_to_coxxcar: str, + path_to_icoxxlist: str, + path_to_structure: str, + feature_type: str, + e_range: List[float] = [-10.0, 0.0], + are_cobis: bool = False, + are_coops: bool = False, + ): + self.path_to_coxxcar = path_to_coxxcar + self.path_to_icoxxlist = path_to_icoxxlist + self.path_to_structure = path_to_structure + self.feature_type = feature_type + self.e_range = e_range + self.are_cobis = are_cobis + self.are_coops = are_coops + self.icoxxlist = Icohplist( + filename=self.path_to_icoxxlist, + are_cobis=self.are_cobis, + are_coops=self.are_coops, + ) + self.completecoxx = CompleteCohp.from_file( + fmt="LOBSTER", + filename=self.path_to_coxxcar, + structure_file=self.path_to_structure, + are_cobis=self.are_cobis, + are_coops=self.are_coops, + ) + + def get_coxx_fingerprint_df( + self, + ids: str | None = None, + label_list: List[str] | None = None, + per_bond: bool = True, + spin_type: str = "summed", + binning: bool = True, + n_bins: int = 56, + normalize: bool = True, + ) -> pd.DataFrame: + """ + Generates the COXX fingerprint + + Args: + ids: sets index of pandas dataframe + spin_type: Specify spin type. 
Can accept '{summed/up/down}' + (default is summed) + binning: If true coxxs will be binned + n_bins: Number of bins to be used in the fingerprint (default is 256) + normalize: If true, normalizes the area under fp to equal to 1 (default is True) + label_list: Specify bond lables as a list for which cohp fingerprints are needed + per_bond: Will scale cohp values by number of bonds i.e length of label_list arg + (Only affects when label_list is not None) + + + Raises: + Exception: If spin_type is not one of the accepted values {summed/up/down}. + ValueError: If feature_type is not one of the accepted values {bonding/antibonding/overall}. + + Returns: + A pandas dataframe with the COXX fingerprint + of format (energies, coxx, fp_type, spin_type, n_bins, bin_width) + + """ + coxxcar_obj = self.completecoxx + + energies = coxxcar_obj.energies - coxxcar_obj.efermi + + min_e = self.e_range[0] + max_e = self.e_range[-1] + + if max_e is None: + max_e = np.max(energies) + + if min_e is None: + min_e = np.min(energies) + + if label_list: + divisor = len(label_list) if per_bond else 1 + coxxcar_obj = coxxcar_obj.get_summed_cohp_by_label_list( + label_list, divisor=divisor + ).get_cohp() + else: + coxxcar_obj = coxxcar_obj.get_cohp() + + if spin_type == "up": + coxx_all = coxxcar_obj[Spin.up] + elif spin_type == "down": + if Spin.down in coxxcar_obj: + coxx_all = coxxcar_obj[Spin.down] + else: + raise ValueError( + "LOBSTER calculation is non-spin polarized. Please switch spin_type to `up`" + ) + elif spin_type == "summed": + if Spin.down in coxxcar_obj: + coxx_all = coxxcar_obj[Spin.up] + coxxcar_obj[Spin.down] + else: + coxx_all = coxxcar_obj[Spin.up] + else: + raise Exception( + "Check the spin_type argument." 
"Possible options are summed/up/down" + ) + + coxx_dict = {} + tol = 1e-6 + if not self.are_cobis and not self.are_coops: + coxx_dict["bonding"] = np.array( + [scohp if scohp <= tol else 0 for scohp in coxx_all] + ) + coxx_dict["antibonding"] = np.array( + [scohp if scohp >= tol else 0 for scohp in coxx_all] + ) + else: + coxx_dict["antibonding"] = np.array( + [scohp if scohp <= tol else 0 for scohp in coxx_all] + ) + coxx_dict["bonding"] = np.array( + [scohp if scohp >= tol else 0 for scohp in coxx_all] + ) + + coxx_dict["overall"] = coxx_all + + try: + if ids: + df = pd.DataFrame(index=[ids], columns=["COXX_FP"]) + else: + ids = Path(self.path_to_coxxcar).parent.name + df = pd.DataFrame(index=[ids], columns=["COXX_FP"]) + + coxxs = coxx_dict[self.feature_type] + if len(energies) < n_bins: + inds = np.where((energies >= min_e - tol) & (energies <= max_e + tol)) + fp = coxx_fingerprint( + energies[inds], + coxxs[inds], + self.feature_type, + spin_type, + len(energies), + np.diff(energies)[0], + ) + + df.at[ids, "COXX_FP"] = fp + + return df + + if binning: + ener_bounds = np.linspace(min_e, max_e, n_bins + 1) + ener = ener_bounds[:-1] + (ener_bounds[1] - ener_bounds[0]) / 2.0 + bin_width = np.diff(ener)[0] + else: + ener_bounds = np.array(energies) + ener = np.append(energies, [energies[-1] + np.abs(energies[-1]) / 10]) + n_bins = len(energies) + bin_width = np.diff(energies)[0] + + coxx_rebin = np.zeros(ener.shape) + + for ii, e1, e2 in zip(range(len(ener)), ener_bounds[0:-1], ener_bounds[1:]): + inds = np.where((energies >= e1) & (energies < e2)) + coxx_rebin[ii] = np.sum(coxxs[inds]) + if normalize: # scale DOS bins to make area under histogram equal 1 + area = np.sum([abs(coxx) * bin_width for coxx in coxx_rebin]) + coxx_rebin_sc = coxx_rebin / area + else: + coxx_rebin_sc = coxx_rebin + + fp = coxx_fingerprint( + np.array([ener]), + coxx_rebin_sc, + self.feature_type, + spin_type, + n_bins, + bin_width, + ) + + df.at[ids, "COXX_FP"] = fp + + return df + + 
except KeyError: + raise ValueError( + "Please recheck fp_type requested argument.Possible options are bonding/antibonding/overall" + ) + + def _calculate_wicoxx_ein(self) -> Tuple[float, float, float, float]: + """ + Method to calculate weighted icoxx (xx ==hp,op,bi) and ein of crystal structure based on work by Peter Müller + + {:[ bar(COHP)(E)=sum _(i)(w_(i)*COHP_(i)(E))],[w_(i)=(ICOHP_(i))/(ICOHP_("total "))]:} + + w_(i)=(ICOHP_(i))/(ICOHP_("total ")) + + bar(ICOHP)=int_(-oo)^(epsi_(F)) bar(COHP)(E)dE + + Returns: + Percent bonding, Percent anti-bonding, weighted icoxx, effective interaction number + + """ + list_labels = list(self.icoxxlist.icohplist.keys()) + # Compute sum of icohps + icoxx_total = self.icoxxlist.icohpcollection.get_summed_icohp_by_label_list( + list_labels + ) + + summed_weighted_coxx = [] + + for lab in list_labels: + for k, v in self.completecoxx.get_cohp_by_label( + "{}".format(lab), summed_spin_channels=True + ).cohp.items(): + coxx = v + icoxx = self.icoxxlist.icohpcollection.get_icohp_by_label( + lab, summed_spin_channels=True + ) + weight = ( + icoxx / icoxx_total + ) # calculate the weights based on icohp contri to total icohp of the structure + weighted_coxx = weight * coxx + summed_weighted_coxx.append(weighted_coxx) + + summed = np.sum(summed_weighted_coxx, axis=0) + # below fermi + indices = self.completecoxx.energies <= self.completecoxx.efermi + en_bf = self.completecoxx.energies[indices] + coxx_bf = summed[indices] + + w_icoxx = trapezoid(coxx_bf, en_bf) + + ein = (icoxx_total / w_icoxx) * ( + 2 / self.completecoxx.structure.num_sites + ) # calc effective interaction number + + # percent bonding-anitbonding + tol = 1e-6 + if not self.icoxxlist.are_cobis and not self.icoxxlist.are_coops: + bonding_indices = coxx_bf <= tol + antibonding_indices = coxx_bf >= tol + bnd = abs(trapezoid(en_bf[bonding_indices], coxx_bf[bonding_indices])) + antibnd = abs( + trapezoid(en_bf[antibonding_indices], coxx_bf[antibonding_indices]) + ) + 
per_bnd = (bnd / (bnd + antibnd)) * 100 + per_antibnd = (antibnd / (bnd + antibnd)) * 100 + + elif self.icoxxlist.are_cobis or self.icoxxlist.are_coops: + bonding_indices = coxx_bf >= tol + antibonding_indices = coxx_bf <= tol + bnd = abs(trapezoid(coxx_bf[bonding_indices], en_bf[bonding_indices])) + antibnd = abs( + trapezoid(coxx_bf[antibonding_indices], en_bf[antibonding_indices]) + ) + per_bnd = (bnd / (bnd + antibnd)) * 100 + per_antibnd = (antibnd / (bnd + antibnd)) * 100 + + return per_bnd, per_antibnd, w_icoxx, ein + + def _calc_moment_features( + self, label_list: List[str] | None = None, per_bond=True + ) -> Tuple[float, float, float, float]: + """ + Wrapper method to calculate band center,width, skewness, and kurtosis of the COXX + Args: + label_list: List of bond labels + per_bond: Will scale cohp values by number of bonds i.e length of label_list arg + (Only affects when label_list is not None) + + Returns: + coxx center,width, skewness, and kurtosis in eV + """ + if label_list: + divisor = len(label_list) if per_bond else 1 + coxxcar = self.completecoxx.get_summed_cohp_by_label_list( + label_list, divisor=divisor + ).get_cohp() + + else: + coxxcar = self.completecoxx.get_cohp() + + if Spin.down in coxxcar: + coxx_all = coxxcar[Spin.up] + coxxcar[Spin.down] + else: + coxx_all = coxxcar[Spin.up] + + energies = self.completecoxx.energies - self.completecoxx.efermi + + coxx_dict = {} + tol = 1e-6 + if not self.are_cobis and not self.are_coops: + coxx_dict["bonding"] = np.array( + [scohp if scohp <= tol else 0 for scohp in coxx_all] + ) + coxx_dict["antibonding"] = np.array( + [scohp if scohp >= tol else 0 for scohp in coxx_all] + ) + coxx_dict["overall"] = coxx_all + else: + coxx_dict["antibonding"] = np.array( + [scohp if scohp <= tol else 0 for scohp in coxx_all] + ) + coxx_dict["bonding"] = np.array( + [scohp if scohp >= tol else 0 for scohp in coxx_all] + ) + coxx_dict["overall"] = coxx_all + + try: + coxx_center = self._get_coxx_center( + 
coxx=coxx_dict[self.feature_type], + energies=energies, + e_range=self.e_range, + ) + coxx_width = np.sqrt( + self._get_n_moment( + n=2, + coxx=coxx_dict[self.feature_type], + energies=energies, + e_range=self.e_range, + ) + ) + coxx_skew = self._get_n_moment( + n=3, + coxx=coxx_dict[self.feature_type], + energies=energies, + e_range=self.e_range, + ) / self._get_n_moment( + n=2, + coxx=coxx_dict[self.feature_type], + energies=energies, + e_range=self.e_range, + ) ** ( + 3 / 2 + ) + + coxx_kurt = ( + self._get_n_moment( + n=4, + coxx=coxx_dict[self.feature_type], + energies=energies, + e_range=self.e_range, + ) + / self._get_n_moment( + n=2, + coxx=coxx_dict[self.feature_type], + energies=energies, + e_range=self.e_range, + ) + ** 2 + ) + except KeyError: + raise ValueError( + "Please recheck fp_type requested argument.Possible options are bonding/antibonding/overall" + ) + + return coxx_center, coxx_width, coxx_skew, coxx_kurt + + def _get_coxx_center( + self, + coxx: npt.NDArray[np.floating], + energies: npt.NDArray[np.floating], + e_range: List[float], + ) -> float: + """ + Get the band width, defined as the first moment of the COXX + Args: + coxx: COXX array + energies: energies corresponding COXX + e_range: range of energy to compute coxx center + + Returns: + coxx center in eV + """ + coxx_center = self._get_n_moment( + n=1, coxx=coxx, energies=energies, e_range=e_range, center=False + ) + + return coxx_center + + def _get_n_moment( + self, + n: float, + coxx: npt.NDArray[np.floating], + energies: npt.NDArray[np.floating], + e_range: list[float] | None, + center: bool = True, + ) -> float: + """ + + Get the nth moment of COXX + + Args: + n: The order for the moment + coxx: COXX array + energies: energies array + center: Take moments with respect to the COXX center + + Returns: + COXX nth moment in eV + """ + if e_range: + min_e = self.e_range[0] + max_e = self.e_range[1] + if min_e is None: + min_e = min(energies) + if max_e is None: + max_e = max(energies) + 
else: + min_e = min(energies) + max_e = max(energies) + + tol = 1e-6 + + mask = (energies >= min_e - tol) & (energies <= max_e + tol) + + coxx = coxx[mask] + energies = energies[mask] + + if center: + coxx_center = self._get_coxx_center( + coxx=coxx, energies=energies, e_range=[min_e, max_e] + ) + p = energies - coxx_center + else: + p = energies + + nth_moment = np.trapz(p**n * coxx, x=energies) / np.trapz(coxx, x=energies) # type: ignore + + return nth_moment + + def get_summarized_coxx_df( + self, + ids: str | None = None, + label_list: List[str] | None = None, + per_bond=True, + ) -> pd.DataFrame: + """ + This function returns a pandas dataframe with weighted ICOXX, effective interaction number + and moment features (center, width, skewness and kurtosis) of COXX in selected energy range + + Returns: + Returns a pandas dataframe with cohp/cobi/coop related features as per input file + + """ + if ids: + df = pd.DataFrame(index=[ids]) + else: + ids = Path(self.path_to_coxxcar).parent.name + df = pd.DataFrame(index=[ids]) + + ( + per_bnd_xx, + per_antibnd_xx, + w_icoxx, + ein_xx, + ) = self._calculate_wicoxx_ein() + + if self.are_coops: + cc, cw, cs, ck = self._calc_moment_features( + label_list=label_list, per_bond=per_bond + ) + df.loc[ids, "bnd_wICOOP"] = per_bnd_xx + df.loc[ids, "antibnd_wICOOP"] = per_antibnd_xx + df.loc[ids, "w_ICOOP"] = w_icoxx + df.loc[ids, "EIN_ICOOP"] = ein_xx + df.loc[ids, "center_COOP"] = cc + df.loc[ids, "width_COOP"] = cw + df.loc[ids, "skewness_COOP"] = cs + df.loc[ids, "kurtosis_COOP"] = ck + elif self.are_cobis: + cc, cw, cs, ck = self._calc_moment_features( + label_list=label_list, per_bond=per_bond + ) + df.loc[ids, "bnd_wICOBI"] = per_bnd_xx + df.loc[ids, "antibnd_wICOBI"] = per_antibnd_xx + df.loc[ids, "w_ICOBI"] = w_icoxx + df.loc[ids, "EIN_ICOBI"] = ein_xx + df.loc[ids, "center_COBI"] = cc + df.loc[ids, "width_COBI"] = cw + df.loc[ids, "skewness_COBI"] = cs + df.loc[ids, "kurtosis_COBI"] = ck + else: + cc, cw, cs, ck = 
self._calc_moment_features( + label_list=label_list, per_bond=per_bond + ) + df.loc[ids, "bnd_wICOHP"] = per_bnd_xx + df.loc[ids, "antibnd_wICOHP"] = per_antibnd_xx + df.loc[ids, "w_ICOHP"] = w_icoxx + df.loc[ids, "EIN_ICOHP"] = ein_xx + df.loc[ids, "center_COHP"] = cc + df.loc[ids, "width_COHP"] = cw + df.loc[ids, "skewness_COHP"] = cs + df.loc[ids, "kurtosis_COHP"] = ck + + return df + + +class FeaturizeCharges: + """ + class to compute ionicity from CHARGE.lobster data + + Args: + path_to_strucutre: path to POSCAR + path_to_charge : path to CHARGE.lobster (e.g., "CHARGE.lobster") + charge_type : set charge type used for computing ionicity. Possible options are "Mulliken" or "Loewdin" + + Attributes: + get_df: pandas dataframe + + """ + + def __init__( + self, + path_to_structure: str, + path_to_charge: str, + charge_type: str, + ): + self.path_to_structure = path_to_structure + self.path_to_charge = path_to_charge + self.charge_type = charge_type + + def _calc_ionicity(self) -> float: + """ + Method to calculate ionicity of crystal structure based on quantum chemical charges + + I_("Charges ")=(1)/(N_("Atoms "))sum _(i)^(N_("Atoms "))((q_(i))/(v_("eff ",i))) + + Returns: + Ionicity of the structure + + """ + chargeobj = Charge(filename=self.path_to_charge) + structure = Structure.from_file(self.path_to_structure) + + if self.charge_type.lower() not in ["mulliken", "loewdin"]: + raise ValueError( + "Please check the requested charge_type. 
" + "Possible options are `Mulliken` or `Loewdin`" + ) + + ch_veff = [] + tol = 1e-6 + for i, j in enumerate(getattr(chargeobj, self.charge_type.capitalize())): + if ( + j > tol + and not structure.species[i].is_transition_metal + and ( + not structure.species[i].is_actinoid + and not structure.species[i].is_lanthanoid + ) + ): + valence_elec = element(structure.species[i].value) + val = j / (valence_elec.nvalence() - 0) + ch_veff.append(val) + + elif ( + j < tol + and not structure.species[i].is_transition_metal + and ( + not structure.species[i].is_actinoid + and not structure.species[i].is_lanthanoid + ) + ): + valence_elec = element(structure.species[i].value) + val = j / (valence_elec.nvalence() - 8) + ch_veff.append(val) + + elif j > tol and ( + structure.species[i].is_transition_metal + or structure.species[i].is_actinoid + or structure.species[i].is_lanthanoid + ): + val = j / (structure.species[i].max_oxidation_state - 0) + ch_veff.append(val) + + elif j < tol and ( + structure.species[i].is_transition_metal + or structure.species[i].is_actinoid + or structure.species[i].is_lanthanoid + ): + val = j / (structure.species[i].min_oxidation_state - 8) + ch_veff.append(val) + + ionicity = sum(ch_veff) / structure.num_sites + + return ionicity + + def get_df(self, ids: str | None = None) -> pd.DataFrame: + """ + This function returns a pandas dataframe with computed ionicity as column + + Returns: + Returns a pandas dataframe with ionicity + + """ + if ids: + df = pd.DataFrame(index=[ids]) + else: + ids = Path(self.path_to_charge).parent.name + df = pd.DataFrame(index=[ids]) + + if self.charge_type.lower() == "mulliken": + df.loc[ids, "Ionicity_Mull"] = self._calc_ionicity() + else: + df.loc[ids, "Ionicity_Loew"] = self._calc_ionicity() + + return df diff --git a/lobsterpy/featurize/test/__init__.py b/lobsterpy/featurize/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/lobsterpy/featurize/test/test_batch.py 
b/lobsterpy/featurize/test/test_batch.py new file mode 100644 index 00000000..2e3c4fb5 --- /dev/null +++ b/lobsterpy/featurize/test/test_batch.py @@ -0,0 +1,466 @@ +import unittest +import pandas as pd +from pathlib import Path +from lobsterpy.featurize.batch import BatchSummaryFeaturizer, BatchCoxxFingerprint + +CurrentDir = Path(__file__).absolute().parent +TestDir = CurrentDir / "../../" + + +class TestBatchSummaryFeaturizer(unittest.TestCase): + def setUp(self): + self.summary_featurize_with_json = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + bonds="all", + path_to_jsons=TestDir / "TestData/Featurizer_test_data/JSONS", + feature_type="antibonding", + include_cobi_data=False, + include_coop_data=False, + e_range=[-15, 0], + n_jobs=3, + ) + + self.summary_featurize_without_json = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + bonds="all", + include_cobi_data=False, + include_coop_data=False, + e_range=[-15, 0], + n_jobs=3, + ) + + self.summary_featurize_with_json_overall = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + bonds="all", + path_to_jsons=TestDir / "TestData/Featurizer_test_data/JSONS", + feature_type="overall", + include_cobi_data=True, + include_coop_data=True, + e_range=[-15, 0], + n_jobs=3, + ) + + self.summary_featurize_with_json_bonding = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + bonds="all", + path_to_jsons=TestDir / "TestData/Featurizer_test_data/JSONS", + feature_type="bonding", + include_cobi_data=False, + include_coop_data=False, + e_range=[-15, 0], + charge_type="mulliken", + n_jobs=3, + ) + + self.summary_featurize_with_json_antibonding = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + bonds="cation-anion", + path_to_jsons=TestDir 
/ "TestData/Featurizer_test_data/JSONS", + feature_type="antibonding", + include_cobi_data=False, + include_coop_data=False, + e_range=[-15, 0], + charge_type="loewdin", + n_jobs=3, + ) + + def test_summary_featurize_with_json(self): + df = self.summary_featurize_with_json.get_df() + + self.assertIsInstance(df, pd.DataFrame) + + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + "Ionicity_Mull", + "Ionicity_Loew", + ] + + self.assertEqual(list(df.columns), expected_cols) + + expected_index = ["mp-1000", "mp-2176", "mp-463"] + + self.assertEqual(list(df.index), expected_index) + + def test_summary_featurize_without_json(self): + df = self.summary_featurize_without_json.get_df() + + self.assertIsInstance(df, pd.DataFrame) + + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + "Ionicity_Mull", + "Ionicity_Loew", + ] + + self.assertEqual(list(df.columns), expected_cols) + + expected_index = ["mp-1000", "mp-2176", "mp-463"] + + self.assertEqual(list(df.index), expected_index) + + def 
test_summary_featurize_with_json_overall(self): + df = self.summary_featurize_with_json_overall.get_df() + + self.assertIsInstance(df, pd.DataFrame) + + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + "bnd_wICOBI", + "antibnd_wICOBI", + "w_ICOBI", + "EIN_ICOBI", + "center_COBI", + "width_COBI", + "skewness_COBI", + "kurtosis_COBI", + "bnd_wICOOP", + "antibnd_wICOOP", + "w_ICOOP", + "EIN_ICOOP", + "center_COOP", + "width_COOP", + "skewness_COOP", + "kurtosis_COOP", + "Ionicity_Mull", + "Ionicity_Loew", + ] + + self.assertEqual(list(df.columns), expected_cols) + + expected_index = ["mp-1000", "mp-2176", "mp-463"] + + self.assertEqual(list(df.index), expected_index) + + def test_summary_featurize_with_json_bonding(self): + df = self.summary_featurize_with_json_bonding.get_df() + + self.assertIsInstance(df, pd.DataFrame) + + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + "Ionicity_Mull", + ] + + self.assertEqual(list(df.columns), expected_cols) + + def 
test_summary_featurize_with_json_antibonding(self): + df = self.summary_featurize_with_json_antibonding.get_df() + + self.assertIsInstance(df, pd.DataFrame) + + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + "Ionicity_Loew", + ] + + self.assertEqual(list(df.columns), expected_cols) + + +class TestBatchCoxxFingerprint(unittest.TestCase): + def setUp(self): + self.fp_cohp_overall = BatchCoxxFingerprint( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + e_range=[-15, 0], + feature_type="overall", + normalize=True, + tanimoto=True, + n_jobs=3, + ) + + self.fp_cohp_bonding = BatchCoxxFingerprint( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + e_range=[-15, 0], + feature_type="bonding", + normalize=False, + tanimoto=True, + n_jobs=3, + ) + + self.fp_cobi = BatchCoxxFingerprint( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + e_range=[-15, 0], + feature_type="antibonding", + normalize=True, + tanimoto=True, + fingerprint_for="cobi", + n_jobs=3, + ) + + self.fp_coop = BatchCoxxFingerprint( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + e_range=[-15, 0], + feature_type="bonding", + normalize=True, + tanimoto=False, + fingerprint_for="coop", + n_jobs=3, + ) + + def test_fp_cohp_overall(self): + df = self.fp_cohp_overall.get_similarity_matrix_df() + + self.assertAlmostEqual(df.loc["mp-463", "mp-1000"], -0.033251, places=5) + 
self.assertAlmostEqual(df.loc["mp-463", "mp-2176"], -0.013751, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-463"], 1, places=5) + self.assertAlmostEqual(df.loc["mp-1000", "mp-2176"], 0.046889, places=5) + + def test_fp_cohp_bonding(self): + fp_df = self.fp_cohp_bonding.fingerprint_df + df = self.fp_cohp_bonding.get_similarity_matrix_df() + + self.assertAlmostEqual(df.loc["mp-463", "mp-1000"], 0.000017, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-2176"], 0.000000, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-463"], 1, places=5) + self.assertAlmostEqual(df.loc["mp-1000", "mp-2176"], 0.001532, places=5) + + def test_fp_cobi(self): + fp_df = self.fp_cobi.fingerprint_df + df = self.fp_cobi.get_similarity_matrix_df() + + self.assertAlmostEqual(df.loc["mp-463", "mp-1000"], 0, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-2176"], 0, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-463"], 1, places=5) + self.assertAlmostEqual(df.loc["mp-1000", "mp-2176"], 0, places=5) + + def test_fp_coop(self): + fp_df = self.fp_coop.fingerprint_df + df = self.fp_coop.get_similarity_matrix_df() + + self.assertAlmostEqual(df.loc["mp-463", "mp-1000"], 0, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-2176"], 0, places=5) + self.assertAlmostEqual(df.loc["mp-463", "mp-463"], 1, places=5) + self.assertAlmostEqual(df.loc["mp-1000", "mp-2176"], 0, places=5) + + +class TestExceptions(unittest.TestCase): + def test_batch_summary_featurizer_exception(self): + with self.assertRaises(Exception) as err1: + self.summary_featurize_with_json = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs_exceptions/1/", + bonds="all", + feature_type="antibonding", + include_cobi_data=True, + include_coop_data=True, + e_range=[-15, 0], + ) + + _ = self.summary_featurize_with_json.get_df() + + self.assertEqual( + err1.exception.__str__(), + "COBICAR.lobster or ICOBILIST.lobster file not found in mp-2176", 
+ ) + + with self.assertRaises(Exception) as err2: + self.summary_featurize_with_json = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs_exceptions/2/", + bonds="all", + feature_type="antibonding", + include_cobi_data=True, + include_coop_data=True, + e_range=[-15, 0], + ) + + _ = self.summary_featurize_with_json.get_df() + + self.assertEqual( + err2.exception.__str__(), + "COOPCAR.lobster or ICOOPLIST.lobster file not found in mp-1000", + ) + + # COXX exception + with self.assertRaises(Exception) as err3: + self.raise_coxx_exception = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir / "TestData/JSONS/" + ) + + _ = self.raise_coxx_exception._featurizecoxx( + path_to_lobster_calc=self.raise_coxx_exception.path_to_lobster_calcs + ) + + self.assertEqual( + err3.exception.__str__(), + "COHPCAR.lobster or POSCAR or ICOHPLIST.lobster file not found in JSONS", + ) + + # Charges exception + with self.assertRaises(Exception) as err4: + self.raise_ch_exception = BatchSummaryFeaturizer( + path_to_lobster_calcs=TestDir / "TestData/JSONS/" + ) + + _ = self.raise_ch_exception._featurizecharges( + path_to_lobster_calc=self.raise_ch_exception.path_to_lobster_calcs + ) + + self.assertEqual( + err4.exception.__str__(), + "CHARGE.lobster or POSCAR not found in JSONS", + ) + + # Fingerprint similarity exception + with self.assertRaises(Exception) as err8: + fp_cohp_bonding = BatchCoxxFingerprint( + path_to_lobster_calcs=TestDir + / "TestData/Featurizer_test_data/Lobster_calcs", + e_range=[-15, 0], + feature_type="bonding", + normalize=True, + tanimoto=True, + n_jobs=3, + ) + + fp_df = fp_cohp_bonding.fingerprint_df + + _ = fp_cohp_bonding._get_fp_similarity( + fp_df.loc["mp-1000", "COXX_FP"], + fp_df.loc["mp-2176", "COXX_FP"], + tanimoto=True, + normalize=True, + ) + + self.assertEqual( + err8.exception.__str__(), + "Cannot compute similarity index. 
Please set either normalize=True or " + "tanimoto=True or both to False.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/lobsterpy/featurize/test/test_core.py b/lobsterpy/featurize/test/test_core.py new file mode 100644 index 00000000..9d7fd9a6 --- /dev/null +++ b/lobsterpy/featurize/test/test_core.py @@ -0,0 +1,616 @@ +import unittest +import pandas as pd +import numpy as np +from pathlib import Path +from lobsterpy.featurize.core import FeaturizeLobsterpy, FeaturizeCharges, FeaturizeCOXX + +CurrentDir = Path(__file__).absolute().parent +TestDir = CurrentDir / "../../" + + +class TestFeaturizeLobsterpy(unittest.TestCase): + def setUp(self): + self.featurize_mp1249_json = FeaturizeLobsterpy( + path_to_json=TestDir / "TestData/JSONS/mp-1249.json.gz", bonds="all" + ) + + self.featurize_mp1249_json_ca = FeaturizeLobsterpy( + path_to_json=TestDir / "TestData/JSONS/mp-1249.json.gz", + bonds="cation-anion", + ) + self.featurize_mp1958_json = FeaturizeLobsterpy( + path_to_json=TestDir / "TestData/JSONS/mp-1958.json.gz", bonds="all" + ) + self.featurize_mp14652_json = FeaturizeLobsterpy( + path_to_json=TestDir / "TestData/JSONS/mp-14652.json.gz", bonds="all" + ) + + self.featurize_CsH_madelung = FeaturizeLobsterpy( + path_to_lobster_calc=TestDir / "TestData/CsH/", bonds="all" + ) + + def test_featurize_mp1249_json(self): + df = self.featurize_mp1249_json.get_df(ids="mp-1249") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + ] + 
self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "mp-1249") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["mp-1249", "Icohp_mean_avg"], -1.020000, places=5) + self.assertAlmostEqual(df.loc["mp-1249", "Icohp_mean_max"], -1.020000, places=5) + self.assertAlmostEqual(df.loc["mp-1249", "Icohp_mean_min"], -1.020000, places=5) + self.assertAlmostEqual(df.loc["mp-1249", "Icohp_mean_std"], 0.000000, places=5) + self.assertAlmostEqual(df.loc["mp-1249", "Madelung_Mull"], -52.000000, places=5) + self.assertAlmostEqual( + df.loc["mp-1249", "bonding_perc_avg"], 0.978985, places=5 + ) + + def test_featurize_mp1249_json_ca(self): + df = self.featurize_mp1249_json_ca.get_df(ids="mp-1249") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "mp-1249") + + def test_featurize_mp1958_json(self): + df = self.featurize_mp1958_json.get_df() + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + 
"bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "mp-1958") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["mp-1958", "Icohp_sum_avg"], -2.96000, places=5) + self.assertAlmostEqual(df.loc["mp-1958", "Icohp_sum_max"], -2.96000, places=5) + self.assertAlmostEqual(df.loc["mp-1958", "Icohp_sum_min"], -2.96000, places=5) + self.assertAlmostEqual(df.loc["mp-1958", "Icohp_sum_std"], 0.000000, places=5) + self.assertAlmostEqual(df.loc["mp-1958", "Madelung_Loew"], -16.68000, places=5) + self.assertAlmostEqual( + df.loc["mp-1958", "antibonding_perc_avg"], 0.14528, places=5 + ) + + def test_featurize_mp14652_json(self): + df = self.featurize_mp14652_json.get_df(ids="mp-14652") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Icohp_mean_avg", + "Icohp_mean_max", + "Icohp_mean_min", + "Icohp_mean_std", + "Icohp_sum_avg", + "Icohp_sum_max", + "Icohp_sum_min", + "Icohp_sum_std", + "bonding_perc_avg", + "bonding_perc_max", + "bonding_perc_min", + "bonding_perc_std", + "antibonding_perc_avg", + "antibonding_perc_min", + "antibonding_perc_max", + "antibonding_perc_std", + "Madelung_Mull", + "Madelung_Loew", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "mp-14652") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["mp-14652", "Icohp_mean_std"], 2.335070, places=5) + self.assertAlmostEqual( + df.loc["mp-14652", "bonding_perc_max"], 0.889620, places=5 + ) + self.assertAlmostEqual( + df.loc["mp-14652", "bonding_perc_min"], 
0.873420, places=5 + ) + self.assertAlmostEqual( + df.loc["mp-14652", "bonding_perc_std"], 0.006339, places=5 + ) + self.assertAlmostEqual( + df.loc["mp-14652", "antibonding_perc_min"], 0.110380, places=5 + ) + self.assertAlmostEqual( + df.loc["mp-14652", "antibonding_perc_max"], 0.126580, places=5 + ) + self.assertAlmostEqual( + df.loc["mp-14652", "antibonding_perc_std"], 0.006339, places=5 + ) + + def test_featurize_CsH_madelung(self): + df = self.featurize_CsH_madelung.get_df() + + self.assertTrue(np.isnan(df.loc["CsH", "Madelung_Mull"])) + self.assertTrue(np.isnan(df.loc["CsH", "Madelung_Loew"])) + + +class TestFeaturizeCOXX(unittest.TestCase): + def setUp(self): + self.featurize_NaCl_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/NaCl/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/NaCl/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/NaCl/POSCAR", + feature_type="overall", + e_range=[-5, 0], + ) + self.featurize_CdF_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/CdF/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/CdF/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/CdF/POSCAR", + feature_type="bonding", + e_range=[-5, 0], + ) + self.featurize_K3Sb_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/K3Sb/COHPCAR.lobster.gz", + path_to_icoxxlist=TestDir / "TestData/K3Sb/ICOHPLIST.lobster.gz", + path_to_structure=TestDir / "TestData/K3Sb/POSCAR.gz", + feature_type="antibonding", + e_range=[-5, 0], + ) + + def test_featurize_NaCl_COXX(self): + df = self.featurize_NaCl_COXX.get_summarized_coxx_df(ids="NaCl") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the 
DataFrame has the expected index + self.assertEqual(df.index[0], "NaCl") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["NaCl", "bnd_wICOHP"], 97.233878, places=5) + self.assertAlmostEqual(df.loc["NaCl", "antibnd_wICOHP"], 2.766122, places=5) + self.assertAlmostEqual(df.loc["NaCl", "w_ICOHP"], -0.150558, places=5) + + self.assertAlmostEqual(df.loc["NaCl", "EIN_ICOHP"], 27.843536, places=5) + self.assertAlmostEqual(df.loc["NaCl", "center_COHP"], -4.96241, places=5) + self.assertAlmostEqual(df.loc["NaCl", "width_COHP"], 8.881784e-16, places=5) + self.assertAlmostEqual(df.loc["NaCl", "skewness_COHP"], 1, places=5) + self.assertAlmostEqual(df.loc["NaCl", "kurtosis_COHP"], 1, places=5) + + # test summary features using label list + df1 = self.featurize_NaCl_COXX.get_summarized_coxx_df(label_list=["2", "3"]) + self.assertNotEqual( + df.loc["NaCl", "center_COHP"], + df1.loc["NaCl", "center_COHP"], + ) + + def test_featurize_NaCl_COXX_fingerprint(self): + df = self.featurize_NaCl_COXX.get_coxx_fingerprint_df(n_bins=20000) + + fingerprint = df.loc["NaCl", "COXX_FP"] + + self.assertNotEqual(fingerprint.n_bins, 20000) + + df1 = self.featurize_NaCl_COXX.get_coxx_fingerprint_df(binning=False) + + fingerprint = df1.loc["NaCl", "COXX_FP"] + + self.assertEqual(fingerprint.n_bins, 401) + + df2 = self.featurize_NaCl_COXX.get_coxx_fingerprint_df(label_list=["3", "5"]) + + fingerprint_label = df2.loc["NaCl", "COXX_FP"] + + self.assertNotEqual(fingerprint.__str__(), fingerprint_label.__str__()) + + def test_featurize_CdF_COXX_fingerprint(self): + df = self.featurize_CdF_COXX.get_coxx_fingerprint_df(n_bins=20000) + + fingerprint = df.loc["CdF", "COXX_FP"] + + self.assertNotEqual(fingerprint.n_bins, 20000) + + df1 = self.featurize_CdF_COXX.get_coxx_fingerprint_df(binning=False) + + fingerprint = df1.loc["CdF", "COXX_FP"] + + self.assertEqual(fingerprint.n_bins, 401) + + df2 = self.featurize_CdF_COXX.get_coxx_fingerprint_df(label_list=["3", "5"]) + + 
fingerprint_label = df2.loc["CdF", "COXX_FP"] + + self.assertNotEqual(fingerprint.__str__(), fingerprint_label.__str__()) + + def test_featurize_CdF_COXX(self): + df = self.featurize_CdF_COXX.get_summarized_coxx_df() + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "CdF") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["CdF", "bnd_wICOHP"], 81.112732, places=5) + self.assertAlmostEqual(df.loc["CdF", "antibnd_wICOHP"], 18.887268, places=5) + self.assertAlmostEqual(df.loc["CdF", "w_ICOHP"], -0.198235, places=5) + + self.assertAlmostEqual(df.loc["CdF", "EIN_ICOHP"], 18.76634, places=5) + self.assertAlmostEqual(df.loc["CdF", "center_COHP"], -4.748383, places=5) + self.assertAlmostEqual(df.loc["CdF", "width_COHP"], 0.157761, places=5) + self.assertAlmostEqual(df.loc["CdF", "skewness_COHP"], 0.910094, places=5) + self.assertAlmostEqual(df.loc["CdF", "kurtosis_COHP"], 2.866611, places=5) + + # test using label list + df1 = self.featurize_CdF_COXX.get_summarized_coxx_df( + label_list=["2", "3", "30"] + ) + self.assertNotEqual( + df.loc["CdF", "center_COHP"], + df1.loc["CdF", "center_COHP"], + ) + + def test_featurize_K3Sb_COXX(self): + df = self.featurize_K3Sb_COXX.get_summarized_coxx_df(ids="K3Sb") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "bnd_wICOHP", + "antibnd_wICOHP", + "w_ICOHP", + "EIN_ICOHP", + "center_COHP", + "width_COHP", + "skewness_COHP", + "kurtosis_COHP", + ] + 
self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "K3Sb") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["K3Sb", "bnd_wICOHP"], 97.019044, places=5) + self.assertAlmostEqual(df.loc["K3Sb", "antibnd_wICOHP"], 2.980956, places=5) + self.assertAlmostEqual(df.loc["K3Sb", "w_ICOHP"], -0.318218, places=5) + + self.assertAlmostEqual(df.loc["K3Sb", "EIN_ICOHP"], 11.597595, places=5) + self.assertAlmostEqual(df.loc["K3Sb", "center_COHP"], -0.198211, places=5) + self.assertAlmostEqual(df.loc["K3Sb", "width_COHP"], 0.233826, places=5) + self.assertAlmostEqual(df.loc["K3Sb", "skewness_COHP"], -1.626643, places=5) + self.assertAlmostEqual(df.loc["K3Sb", "kurtosis_COHP"], 3.771873, places=5) + + +class TestFeaturizeCharges(unittest.TestCase): + def setUp(self): + self.featurize_C_Charge = FeaturizeCharges( + path_to_structure=TestDir / "TestData/C/POSCAR", + path_to_charge=TestDir / "TestData/C/CHARGE.lobster", + charge_type="mulliken", + ) + self.featurize_CdF_Charge = FeaturizeCharges( + path_to_structure=TestDir / "TestData/CdF/POSCAR", + path_to_charge=TestDir / "TestData/CdF/CHARGE.lobster", + charge_type="mulliken", + ) + self.featurize_K3Sb_Charge = FeaturizeCharges( + path_to_structure=TestDir / "TestData/K3Sb/POSCAR.gz", + path_to_charge=TestDir / "TestData/K3Sb/CHARGE.lobster.gz", + charge_type="loewdin", + ) + + def test_featurize_C_Charge(self): + df = self.featurize_C_Charge.get_df(ids="C") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Ionicity_Mull", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "C") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["C", "Ionicity_Mull"], 0.0, places=5) + 
+ def test_featurize_CdF_Charge(self): + df = self.featurize_CdF_Charge.get_df(ids="CdF") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Ionicity_Mull", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "CdF") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["CdF", "Ionicity_Mull"], 0.788333, places=5) + + def test_featurize_K3Sb_Charge(self): + df = self.featurize_K3Sb_Charge.get_df(ids="K3Sb") + + # Test that the function returns a pandas DataFrame + self.assertIsInstance(df, pd.DataFrame) + + # Test that the DataFrame has the expected columns + expected_cols = [ + "Ionicity_Loew", + ] + self.assertCountEqual(list(df.columns), expected_cols) + + # Test that the DataFrame has the expected index + self.assertEqual(df.index[0], "K3Sb") + + # Test that all the values in the DataFrame + self.assertAlmostEqual(df.loc["K3Sb", "Ionicity_Loew"], 0.563333, places=5) + + +class TestExceptions(unittest.TestCase): + def test_lobsterpy_featurize_exception(self): + with self.assertRaises(Exception) as err: + self.featurize_mp1249_json = FeaturizeLobsterpy( + path_to_json=None, path_to_lobster_calc=None, bonds="all" + ) + + _ = self.featurize_mp1249_json.get_df() + + self.assertEqual( + err.exception.__str__(), + "Please provide either path to lightweight lobster jsons or path to lobster calc", + ) + + with self.assertRaises(Exception) as err: + self.featurize_mp1249_json = FeaturizeLobsterpy( + path_to_json=None, path_to_lobster_calc=TestDir, bonds="all" + ) + + _ = self.featurize_mp1249_json.get_df() + + self.assertEqual( + err.exception.__str__(), + "Path provided for Lobster calc directory seems incorrect." 
+ "It does not contain COHPCAR.lobster, ICOHPLIST.lobster, POSCAR and " + "CHARGE.lobster files needed for automatic analysis using LobsterPy", + ) + + with self.assertRaises(Exception) as err: + self.featurize_CsH_cation_anion = FeaturizeLobsterpy( + path_to_lobster_calc=TestDir / "TestData/CsH/", bonds="cation-anion" + ) + + _ = self.featurize_CsH_cation_anion.get_df() + + self.assertEqual( + err.exception.__str__(), + "No cation-anion bonds detected for CsH structure. " + "Please switch to ´all´ bonds mode", + ) + + with self.assertRaises(Exception) as err: + self.featurize_C_cation_anion = FeaturizeLobsterpy( + path_to_lobster_calc=TestDir / "TestData/C/", bonds="cation-anion" + ) + + _ = self.featurize_C_cation_anion.get_df() + + self.assertEqual( + err.exception.__str__(), + "No cation-anion bonds detected for C structure. " + "Please switch to ´all´ bonds mode", + ) + + def test_featurize_charges(self): + with self.assertRaises(Exception) as err: + self.featurize_CdF_Charge = FeaturizeCharges( + path_to_structure=TestDir / "TestData/CdF/POSCAR", + path_to_charge=TestDir / "TestData/CdF/CHARGE.lobster", + charge_type="Mull", + ) + + _ = self.featurize_CdF_Charge.get_df() + + self.assertEqual( + err.exception.__str__(), + "Please check the requested charge_type. 
" + "Possible options are `Mulliken` or `Loewdin`", + ) + + def test_featurize_coxx(self): + with self.assertRaises(Exception) as err: + self.featurize_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/NaCl/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/NaCl/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/NaCl/POSCAR", + feature_type="summed", + e_range=[-5, 0], + ) + + _ = self.featurize_COXX.get_summarized_coxx_df() + + self.assertEqual( + err.exception.__str__(), + "Please recheck fp_type requested argument.Possible options are bonding/antibonding/overall", + ) + + with self.assertRaises(Exception) as err2: + self.featurize_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/NaCl/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/NaCl/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/NaCl/POSCAR", + feature_type="bonding", + e_range=[-5, 0], + ) + + _ = self.featurize_COXX.get_coxx_fingerprint_df(spin_type="-1") + + self.assertEqual( + err2.exception.__str__(), + "Check the spin_type argument." 
"Possible options are summed/up/down", + ) + + with self.assertRaises(Exception) as err3: + self.featurize_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/NaSi/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/NaSi/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/NaSi/POSCAR", + feature_type="bonding", + e_range=[-5, 0], + are_cobis=True, + are_coops=True, + ) + + _ = self.featurize_COXX.get_coxx_fingerprint_df() + + self.assertEqual( + err3.exception.__str__(), + "You cannot have info about COOPs and COBIs in the same file.", + ) + + with self.assertRaises(Exception) as err: + self.featurize_NaCl_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/NaCl/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/NaCl/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/NaCl/POSCAR", + feature_type="antibond", + e_range=[-5, 0], + ) + + _ = self.featurize_NaCl_COXX.get_summarized_coxx_df() + + self.assertEqual( + err.exception.__str__(), + "Please recheck fp_type requested argument." + "Possible options are bonding/antibonding/overall", + ) + + with self.assertRaises(Exception) as err: + self.featurize_NaCl_COXX = FeaturizeCOXX( + path_to_coxxcar=TestDir / "TestData/NaCl/COHPCAR.lobster", + path_to_icoxxlist=TestDir / "TestData/NaCl/ICOHPLIST.lobster", + path_to_structure=TestDir / "TestData/NaCl/POSCAR", + feature_type="antibonding", + e_range=[-5, 0], + ) + + _ = self.featurize_NaCl_COXX.get_coxx_fingerprint_df(spin_type="down") + + self.assertEqual( + err.exception.__str__(), + "LOBSTER calculation is non-spin polarized. " + "Please switch spin_type to `up`", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/pylintrc b/pylintrc index 99aefe96..f97b537b 100644 --- a/pylintrc +++ b/pylintrc @@ -60,15 +60,9 @@ confidence= # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". 
-disable=print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, +disable=long-suffix, old-ne-operator, old-octal-literal, - import-star-module-level, non-ascii-bytes-literal, raw-checker-failed, bad-inline-option, @@ -78,67 +72,6 @@ disable=print-statement, useless-suppression, deprecated-pragma, use-symbolic-message-instead, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, C0103, C0201, C0209, @@ -153,13 +86,13 @@ disable=print-statement, R0916, R1702, R0903, + R1705, W, C1801, E0611, E1121, R0901, C0415, - C0330, C0302, E1136, R0801, @@ -355,8 +288,8 @@ max-module-lines=5000 # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 
# `trailing-comma` allows a space between comma and closing bracket: (a, ). # `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator +# no-space-check=trailing-comma, +# dict-separator # Allow the body of a class to be on the same line as the declaration if body # contains single statement. @@ -591,5 +524,5 @@ min-public-methods=2 # Exceptions that will emit a warning when being caught. Defaults to # "BaseException, Exception". -overgeneral-exceptions=BaseException, - Exception +overgeneral-exceptions=builtins.BaseException, + builtins.Exception diff --git a/pyproject.toml b/pyproject.toml index 1df4f6bb..233595b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,9 @@ repository = "https://github.com/JaGeo/LobsterPy" documentation = "https://lobsterpy.readthedocs.io/en/latest/?badge=latest" changelog = "https://lobsterpy.readthedocs.io/en/latest/changelog_link.html" +[project.optional-dependencies] +featurizer = ["mendeleev==0.12.1"] + [project.scripts] lobsterpy = "lobsterpy.cli:main"