Skip to content

Commit

Permalink
Merge pull request #84 from digital-science/grid-affiliation-improv
Browse files Browse the repository at this point in the history
v 1.3
  • Loading branch information
lambdamusic authored Jun 21, 2024
2 parents a12fbd0 + 9e1625c commit a18836f
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## v 1.3

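* Function `extract_affiliations` accepts a new optional argument `include_input` (default `False`). When enabled for unstructured data, the returned dataframe includes an extra column, `input_affiliation`, containing the original input string.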

## v 1.2

* Added DSL grammar for DSL V2.8
Expand Down
2 changes: 1 addition & 1 deletion dimcli/VERSION.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# !/usr/bin/env python
# -*- coding: UTF-8 -*-

__version__ = "1.2" # LATEST? => https://pypi.org/project/dimcli/
__version__ = "1.3" # LATEST? => https://pypi.org/project/dimcli/
__copyright__ = "CopyRight (C) 2018-2024 by Digital Science"
__license__ = "MIT"
__author__ = "Michele Pasin"
Expand Down
15 changes: 13 additions & 2 deletions dimcli/core/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def extract_classification(title, abstract, system="", verbose=True):



def extract_affiliations(affiliations, as_json=False):
def extract_affiliations(affiliations, as_json=False, include_input=False):
"""Python wrapper for the DSL function `extract_affiliations`.
This function returns GRID affiliations either using structured or unstructured input. Up to 200 input objects are allowed per request. See also: https://docs.dimensions.ai/dsl/functions.html#function-extract-affiliations
Expand Down Expand Up @@ -204,6 +204,8 @@ def extract_affiliations(affiliations, as_json=False):
The raw affiliation data to process.
as_json : bool, optional
Return raw JSON encoded as a Python dict (instead of a pandas dataframe, by default).
include_input: bool, optional, False
For unstructured affiliation matching, return also a column `input_affiliation` with the original input string.
Returns
-------
Expand Down Expand Up @@ -254,7 +256,16 @@ def extract_affiliations(affiliations, as_json=False):
if affiliation_type == "STRUCTURED":
temp = pd.json_normalize(output.json['results'], errors='ignore')
if affiliation_type == "UNSTRUCTURED":
temp = pd.json_normalize(output.json['results'], 'matches', errors='ignore')
if include_input:
temp = pd.json_normalize(output.json['results'], 'matches', ["input"], errors='ignore')
temp["input_affiliation"] = temp["input"].apply(lambda x : x["affiliation"])
temp.drop(columns=["input"], inplace=True)
# move input col first
col_input = temp['input_affiliation']
temp.drop(labels=['input_affiliation'], axis=1,inplace = True)
temp.insert(0, 'input_affiliation', col_input)
else:
temp = pd.json_normalize(output.json['results'], 'matches', errors='ignore')
temp = temp.explode("institutes")
temp = temp.explode("geo.countries")
temp = temp.explode("geo.states")
Expand Down
15 changes: 13 additions & 2 deletions docs/_modules/dimcli/core/functions.html
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ <h1>Source code for dimcli.core.functions</h1><div class="highlight"><pre>



<div class="viewcode-block" id="extract_affiliations"><a class="viewcode-back" href="../../../modules.html#dimcli.core.functions.extract_affiliations">[docs]</a><span class="k">def</span> <span class="nf">extract_affiliations</span><span class="p">(</span><span class="n">affiliations</span><span class="p">,</span> <span class="n">as_json</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<div class="viewcode-block" id="extract_affiliations"><a class="viewcode-back" href="../../../modules.html#dimcli.core.functions.extract_affiliations">[docs]</a><span class="k">def</span> <span class="nf">extract_affiliations</span><span class="p">(</span><span class="n">affiliations</span><span class="p">,</span> <span class="n">as_json</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">include_input</span><span class="o">=</span><span class="kc">False</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;Python wrapper for the DSL function `extract_affiliations`. </span>

<span class="sd"> This function returns GRID affiliations either using structured or unstructured input. Up to 200 input objects are allowed per request. See also: https://docs.dimensions.ai/dsl/functions.html#function-extract-affiliations</span>
Expand Down Expand Up @@ -271,6 +271,8 @@ <h1>Source code for dimcli.core.functions</h1><div class="highlight"><pre>
<span class="sd"> The raw affiliation data to process. </span>
<span class="sd"> as_json : bool, optional</span>
<span class="sd"> Return raw JSON encoded as a Python dict (instead of a pandas dataframe, by default). </span>
<span class="sd"> include_input: bool, optional, False</span>
<span class="sd"> For unstructured affiliation matching, return also a column `input_affiliation` with the original input string.</span>

<span class="sd"> Returns</span>
<span class="sd"> -------</span>
Expand Down Expand Up @@ -321,7 +323,16 @@ <h1>Source code for dimcli.core.functions</h1><div class="highlight"><pre>
<span class="k">if</span> <span class="n">affiliation_type</span> <span class="o">==</span> <span class="s2">&quot;STRUCTURED&quot;</span><span class="p">:</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">json_normalize</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">json</span><span class="p">[</span><span class="s1">&#39;results&#39;</span><span class="p">],</span> <span class="n">errors</span><span class="o">=</span><span class="s1">&#39;ignore&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">affiliation_type</span> <span class="o">==</span> <span class="s2">&quot;UNSTRUCTURED&quot;</span><span class="p">:</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">json_normalize</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">json</span><span class="p">[</span><span class="s1">&#39;results&#39;</span><span class="p">],</span> <span class="s1">&#39;matches&#39;</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s1">&#39;ignore&#39;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">include_input</span><span class="p">:</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">json_normalize</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">json</span><span class="p">[</span><span class="s1">&#39;results&#39;</span><span class="p">],</span> <span class="s1">&#39;matches&#39;</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;input&quot;</span><span class="p">],</span> <span class="n">errors</span><span class="o">=</span><span class="s1">&#39;ignore&#39;</span><span class="p">)</span>
<span class="n">temp</span><span class="p">[</span><span class="s2">&quot;input_affiliation&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">temp</span><span class="p">[</span><span class="s2">&quot;input&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span> <span class="p">:</span> <span class="n">x</span><span class="p">[</span><span class="s2">&quot;affiliation&quot;</span><span class="p">])</span>
<span class="n">temp</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;input&quot;</span><span class="p">],</span> <span class="n">inplace</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># move input col first</span>
<span class="n">col_input</span> <span class="o">=</span> <span class="n">temp</span><span class="p">[</span><span class="s1">&#39;input_affiliation&#39;</span><span class="p">]</span>
<span class="n">temp</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">labels</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;input_affiliation&#39;</span><span class="p">],</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span><span class="n">inplace</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span>
<span class="n">temp</span><span class="o">.</span><span class="n">insert</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s1">&#39;input_affiliation&#39;</span><span class="p">,</span> <span class="n">col_input</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">json_normalize</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">json</span><span class="p">[</span><span class="s1">&#39;results&#39;</span><span class="p">],</span> <span class="s1">&#39;matches&#39;</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s1">&#39;ignore&#39;</span><span class="p">)</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">temp</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&quot;institutes&quot;</span><span class="p">)</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">temp</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&quot;geo.countries&quot;</span><span class="p">)</span>
<span class="n">temp</span> <span class="o">=</span> <span class="n">temp</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s2">&quot;geo.states&quot;</span><span class="p">)</span>
Expand Down
3 changes: 2 additions & 1 deletion docs/modules.html
Original file line number Diff line number Diff line change
Expand Up @@ -891,7 +891,7 @@ <h1>Modules Reference<a class="headerlink" href="#modules-reference" title="Perm

<dl class="py function">
<dt class="sig sig-object py" id="dimcli.core.functions.extract_affiliations">
<span class="sig-prename descclassname"><span class="pre">dimcli.core.functions.</span></span><span class="sig-name descname"><span class="pre">extract_affiliations</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">affiliations</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">as_json</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/dimcli/core/functions.html#extract_affiliations"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#dimcli.core.functions.extract_affiliations" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">dimcli.core.functions.</span></span><span class="sig-name descname"><span class="pre">extract_affiliations</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">affiliations</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">as_json</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">include_input</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/dimcli/core/functions.html#extract_affiliations"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#dimcli.core.functions.extract_affiliations" title="Permalink to this definition"></a></dt>
<dd><p>Python wrapper for the DSL function <cite>extract_affiliations</cite>.</p>
<p>This function returns GRID affiliations either using structured or unstructured input. Up to 200 input objects are allowed per request. See also: <a class="reference external" href="https://docs.dimensions.ai/dsl/functions.html#function-extract-affiliations">https://docs.dimensions.ai/dsl/functions.html#function-extract-affiliations</a></p>
<p>The input argument <code class="docutils literal notranslate"><span class="pre">affiliations</span></code> can be one of the following:</p>
Expand Down Expand Up @@ -932,6 +932,7 @@ <h1>Modules Reference<a class="headerlink" href="#modules-reference" title="Perm
<dd class="field-odd"><ul class="simple">
<li><p><strong>affiliations</strong> (<em>str</em><em> or </em><em>list</em><em> or </em><em>dict</em>) – The raw affiliation data to process.</p></li>
<li><p><strong>as_json</strong> (<em>bool</em><em>, </em><em>optional</em>) – Return raw JSON encoded as a Python dict (instead of a pandas dataframe, by default).</p></li>
<li><p><strong>include_input</strong> (<em>bool</em><em>, </em><em>optional</em><em>, </em><em>False</em>) – For unstructured affiliation matching, return also a column <cite>input_affiliation</cite> with the original input string.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
Expand Down
Loading

0 comments on commit a18836f

Please sign in to comment.