documentation and some (backward-compatible) renaming
seanmacavaney committed May 30, 2021
1 parent 7814f1f commit b35c9d2
Showing 36 changed files with 274 additions and 125 deletions.
16 changes: 8 additions & 8 deletions README.md
@@ -78,10 +78,10 @@ qrels = pd.DataFrame([

# any iterable of namedtuples (e.g., list, generator, etc)
qrels = [
ir_measures.GenericQrel("Q0", "D0", 0),
ir_measures.GenericQrel("Q0", "D1", 1),
ir_measures.GenericQrel("Q1", "D0", 0),
ir_measures.GenericQrel("Q1", "D3", 2),
ir_measures.Qrel("Q0", "D0", 0),
ir_measures.Qrel("Q0", "D1", 1),
ir_measures.Qrel("Q1", "D0", 0),
ir_measures.Qrel("Q1", "D3", 2),
]

# TREC-formatted qrels file
@@ -118,10 +118,10 @@ run = pd.DataFrame([

# any iterable of namedtuples (e.g., list, generator, etc)
run = [
ir_measures.GenericScoredDoc("Q0", "D0", 1.2),
ir_measures.GenericScoredDoc("Q0", "D1", 1.0),
ir_measures.GenericScoredDoc("Q1", "D0", 2.4),
ir_measures.GenericScoredDoc("Q1", "D3", 3.6),
ir_measures.ScoredDoc("Q0", "D0", 1.2),
ir_measures.ScoredDoc("Q0", "D1", 1.0),
ir_measures.ScoredDoc("Q1", "D0", 2.4),
ir_measures.ScoredDoc("Q1", "D3", 3.6),
]
```
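
For orientation, here is a minimal sketch (not part of the original README) of how the in-memory `qrels` and `run` above can be scored; `calc_aggregate` returns a dictionary keyed by the measure objects:

```python
import ir_measures
from ir_measures import AP, nDCG, P  # measure objects

qrels = [
    ir_measures.Qrel("Q0", "D0", 0),
    ir_measures.Qrel("Q0", "D1", 1),
    ir_measures.Qrel("Q1", "D0", 0),
    ir_measures.Qrel("Q1", "D3", 2),
]
run = [
    ir_measures.ScoredDoc("Q0", "D0", 1.2),
    ir_measures.ScoredDoc("Q0", "D1", 1.0),
    ir_measures.ScoredDoc("Q1", "D0", 2.4),
    ir_measures.ScoredDoc("Q1", "D3", 3.6),
]

# Aggregate results over all queries, keyed by measure object
print(ir_measures.calc_aggregate([AP, nDCG@10, P@2], qrels, run))
```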

35 changes: 35 additions & 0 deletions docs/api.rst
@@ -0,0 +1,35 @@
API Reference
===========================================

Metric Calculation
-------------------------------------------

.. autofunction:: ir_measures.iter_calc

.. autofunction:: ir_measures.calc_aggregate

.. autofunction:: ir_measures.evaluator

.. autoclass:: ir_measures.providers.Provider
:members:

.. autoclass:: ir_measures.providers.Evaluator
:members:

Parsing
-------------------------------------------

.. autofunction:: ir_measures.parse_measure

.. autofunction:: ir_measures.parse_trec_measure

.. autofunction:: ir_measures.read_trec_qrels

.. autofunction:: ir_measures.read_trec_run

Data Classes
-------------------------------------------

.. autoclass:: ir_measures.Metric
.. autoclass:: ir_measures.Qrel
.. autoclass:: ir_measures.ScoredDoc
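
For orientation, the functions and classes above compose roughly as follows (an
illustrative sketch added for this summary, not part of the committed file; the
file paths are hypothetical)::

    import ir_measures
    from ir_measures import nDCG

    qrels = ir_measures.read_trec_qrels('path/to/qrels')
    run = ir_measures.read_trec_run('path/to/run')
    measure = ir_measures.parse_measure('P(rel=2)@10')

    # iter_calc yields one Metric(query_id, measure, value) per query and measure
    for metric in ir_measures.iter_calc([measure, nDCG@10], qrels, run):
        print(metric)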
3 changes: 1 addition & 2 deletions docs/conf.py
@@ -27,8 +27,7 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
]
extensions = ['sphinx.ext.autodoc']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
91 changes: 80 additions & 11 deletions docs/getting-started.rst
@@ -110,14 +110,14 @@ map to (integer) relevance scores::
}
}

**namedtuple iterable**: Any iterable of named tuples. You can use ``ir_measures.GenericQrel``,
**namedtuple iterable**: Any iterable of named tuples. You can use ``ir_measures.Qrel``,
or any other NamedTuple with the fields ``query_id``, ``doc_id``, and ``relevance``::

qrels = [
ir_measures.GenericQrel("Q0", "D0", 0),
ir_measures.GenericQrel("Q0", "D1", 1),
ir_measures.GenericQrel("Q1", "D0", 0),
ir_measures.GenericQrel("Q1", "D3", 2),
ir_measures.Qrel("Q0", "D0", 0),
ir_measures.Qrel("Q0", "D1", 1),
ir_measures.Qrel("Q1", "D0", 0),
ir_measures.Qrel("Q1", "D3", 2),
]

Note that if the results are an iterator (such as the result of a generator), ``ir_measures`` will consume
@@ -182,14 +182,14 @@ map to (float) ranking scores::
}
}

**namedtuple iterable**: Any iterable of named tuples. You can use ``ir_measures.GenericScoredDoc``,
**namedtuple iterable**: Any iterable of named tuples. You can use ``ir_measures.ScoredDoc``,
or any other NamedTuple with the fields ``query_id``, ``doc_id``, and ``score``::

run = [
ir_measures.GenericScoredDoc("Q0", "D0", 1.2),
ir_measures.GenericScoredDoc("Q0", "D1", 1.0),
ir_measures.GenericScoredDoc("Q1", "D0", 2.4),
ir_measures.GenericScoredDoc("Q1", "D3", 3.6),
ir_measures.ScoredDoc("Q0", "D0", 1.2),
ir_measures.ScoredDoc("Q0", "D1", 1.0),
ir_measures.ScoredDoc("Q1", "D0", 2.4),
ir_measures.ScoredDoc("Q1", "D3", 3.6),
]

Note that if the results are an iterator (such as the result of a generator), ``ir_measures`` will consume
@@ -227,6 +227,76 @@ easily make a qrels dataframe that is compatible with ir-measures like so::
Note that ``read_trec_run`` returns a generator. If you need to use the run multiple times,
wrap it in the ``list`` constructor to read the entire run into memory.
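
For example (a short sketch; the path is hypothetical)::

    run = list(ir_measures.read_trec_run('path/to/run'))
    # the materialized list can now be iterated over as many times as needed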

Measure Objects
---------------------------------------

Measure objects specify the measure you want to calculate, along with any
parameters it may have. There are several ways to create them. The
easiest is to specify them directly in code:

>>> from ir_measures import * # imports all measure names
>>> AP
AP
>>> AP(rel=2)
AP(rel=2)
>>> nDCG@20
nDCG@20
>>> P(rel=2)@10
P(rel=2)@10

Notice that measures can include parameters. For instance, ``AP(rel=2)`` is the
average precision measure with a minimum relevance level of 2 (i.e., documents
need a relevance score of at least 2 to count as relevant). Similarly, ``nDCG@20``
specifies a ranking cutoff threshold of 20. See each measure's documentation for
full details of the available parameters.

If you need to get a measure object from a string (e.g., if specified by the user
as a command line argument), use the ``ir_measures.parse_measure`` function:

>>> ir_measures.parse_measure('AP')
AP
>>> ir_measures.parse_measure('AP(rel=2)')
AP(rel=2)
>>> ir_measures.parse_measure('nDCG@20')
nDCG@20
>>> ir_measures.parse_measure('P(rel=2)@10')
P(rel=2)@10

If you are familiar with the measure and family names from ``trec_eval``, you can
map them to measure objects using ``ir_measures.parse_trec_measure()``:

>>> ir_measures.parse_trec_measure('map')
[AP]
>>> ir_measures.parse_trec_measure('P') # expands to multiple levels
[P@5, P@10, P@15, P@20, P@30, P@100, P@200, P@500, P@1000]
>>> ir_measures.parse_trec_measure('P_3,8') # or 'P.3,8'
[P@3, P@8]
>>> ir_measures.parse_trec_measure('ndcg')
[nDCG]
>>> ir_measures.parse_trec_measure('ndcg_cut_10')
[nDCG@10]
>>> ir_measures.parse_trec_measure('official')
[P@5, P@10, P@15, P@20, P@30, P@100, P@200, P@500, P@1000, Rprec, Bpref, IPrec@0.0, IPrec@0.1, IPrec@0.2, IPrec@0.3, IPrec@0.4, IPrec@0.5, IPrec@0.6, IPrec@0.7, IPrec@0.8, IPrec@0.9, IPrec@1.0, AP, NumQ, NumRel, NumRet(rel=1), NumRet, RR]

Note that a single ``trec_eval`` measure name can map to multiple measures,
so measures are returned as a list.

Measures can be passed into methods like ``ir_measures.calc_aggregate``, ``ir_measures.iter_calc``,
and ``ir_measures.evaluator``. You can also calculate values from the measure object itself:

>>> AP.calc_aggregate(qrels, run)
0.2842120439595336
>>> (nDCG@10).calc_aggregate(qrels, run) # parens needed when @cutoff is used
0.6250748053944134
>>> for metric in (P(rel=2)@10).iter_calc(qrels, run):
... print(metric)
Metric(query_id='1', measure=P(rel=2)@10, value=0.5)
Metric(query_id='2', measure=P(rel=2)@10, value=0.8)
...
Metric(query_id='35', measure=P(rel=2)@10, value=0.9)
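
Several measures can also be computed in a single call, in which case the result is a
dictionary keyed by the measure objects themselves (a brief sketch, assuming the same
``qrels`` and ``run`` as above; see the evaluator example in the next section for the
shape of the output)::

    agg = ir_measures.calc_aggregate([AP, nDCG@10, P(rel=2)@10], qrels, run)
    agg[nDCG@10]  # look up one measure's aggregate value by its measure object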



Scoring multiple runs
---------------------------------------

@@ -244,7 +314,6 @@ An evaluator object has ``calc_aggregate(run)`` and ``iter_calc(run)`` methods.
{nDCG@10: 0.5286, P@5: 0.6228, P(rel=2)@5: 0.4628, Judged@10: 0.8485}



.. [1] In the examples, ``P@5`` and ``nDCG@10`` are returned first, as they are both calculated
in one invocation of ``pytrec_eval``. Then, results for ``P(rel=2)@5`` are returned (as a
second invocation of ``pytrec_eval`` because it only supports one relevance level at a time).
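
To make the reuse pattern concrete, a minimal sketch of scoring two hypothetical runs
(``run_a`` and ``run_b``) with a single evaluator::

    evaluator = ir_measures.evaluator([nDCG@10, P@5, P(rel=2)@5, Judged@10], qrels)
    evaluator.calc_aggregate(run_a)  # the qrels only need to be processed once,
    evaluator.calc_aggregate(run_b)  # when the evaluator is constructed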
1 change: 1 addition & 0 deletions docs/index.rst
@@ -52,3 +52,4 @@ Table of Contents
getting-started
measures
providers
api
12 changes: 10 additions & 2 deletions ir_measures/__init__.py
@@ -1,6 +1,14 @@
__version__ = "0.1.4"
from . import util
from .util import parse_measure, convert_trec_name, read_trec_qrels, read_trec_run, GenericQrel, GenericScoredDoc
from .util import (parse_measure, parse_trec_measure,
read_trec_qrels, read_trec_run,
Qrel, ScoredDoc, Metric,
GenericQrel, # deprecated; replaced with Qrel
GenericScoredDoc, # deprecated; replaced with ScoredDoc
convert_trec_name, # deprecated; replaced with parse_trec_measure
parse_trec_qrels, # deprecated; replaced with read_trec_qrels
parse_trec_run, # deprecated; replaced with read_trec_run
)
from . import measures
from .measures import *
from . import providers
@@ -20,7 +28,7 @@
gdeval, # doesn't work when installed from package #9
])
evaluator = DefaultPipeline.evaluator
calc_ctxt = DefaultPipeline.calc_ctxt # deprecated
calc_ctxt = DefaultPipeline.calc_ctxt # deprecated; replaced with evaluator
iter_calc = DefaultPipeline.iter_calc
calc_aggregate = DefaultPipeline.calc_aggregate
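
Since the renaming is backward compatible, code written against the old names keeps working; a brief migration sketch (assuming, as the comments above indicate, that each deprecated name simply points at its replacement):

```python
import ir_measures

# Preferred names after this commit
qrel = ir_measures.Qrel("Q0", "D0", 1)
sdoc = ir_measures.ScoredDoc("Q0", "D0", 1.2)

# Deprecated aliases kept for compatibility
old_qrel = ir_measures.GenericQrel("Q0", "D0", 1)         # prefer Qrel
old_sdoc = ir_measures.GenericScoredDoc("Q0", "D0", 1.2)  # prefer ScoredDoc
```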

8 changes: 3 additions & 5 deletions ir_measures/__main__.py
@@ -2,7 +2,7 @@
import sys
import argparse
import ir_measures
from ir_measures.util import GenericScoredDoc, GenericQrel
from ir_measures.util import ScoredDoc, Qrel


def main_cli():
@@ -15,10 +15,8 @@ def main_cli():
parser.add_argument('--no_summary', '-n', action='store_true')
parser.add_argument('--provider', choices=ir_measures.providers.registry.keys())
args = parser.parse_args()
run = (l.split() for l in open(args.run))
run = (GenericScoredDoc(cols[0], cols[2], float(cols[4])) for cols in run)
qrels = (l.split() for l in open(args.qrels))
qrels = (GenericQrel(cols[0], cols[2], int(cols[3])) for cols in qrels)
run = ir_measures.read_trec_run(args.run)
qrels = ir_measures.read_trec_qrels(args.qrels)
measures, errors = [], []
for mstr in args.measures:
for m in mstr.split():
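
The same simplification applies outside the CLI; a before/after sketch (the `run.txt` path is hypothetical):

```python
import ir_measures

# Before this commit: parse a TREC-format run file by hand
run = (line.split() for line in open('run.txt'))
run = (ir_measures.ScoredDoc(cols[0], cols[2], float(cols[4])) for cols in run)

# After: the bundled reader yields the same kind of ScoredDoc tuples lazily
run = ir_measures.read_trec_run('run.txt')
```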
2 changes: 1 addition & 1 deletion ir_measures/measures/__init__.py
@@ -9,7 +9,7 @@ def register(measure, aliases=[], name=None):
registry[alias] = measure
return registry

from .base import BaseMeasure, ParamInfo, MultiMeasures, MeanAgg, SumAgg
from .base import Measure, ParamInfo, MultiMeasures, MeanAgg, SumAgg
from .ap import AP, MAP, _AP
from .bpref import Bpref, BPref, _Bpref
from .err import ERR, _ERR
4 changes: 2 additions & 2 deletions ir_measures/measures/ap.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _AP(measures.BaseMeasure):
class _AP(measures.Measure):
"""
The [Mean] Average Precision ([M]AP). The average precision of a single query is the mean
of the precision scores at each relevant item returned in a search results list.
Expand Down
4 changes: 2 additions & 2 deletions ir_measures/measures/base.py
@@ -2,7 +2,7 @@
import ir_measures


class BaseMeasure:
class Measure:
NAME = None
AT_PARAM = 'cutoff' # allows measures to configure which param measure@X updates (default is cutoff)
SUPPORTED_PARAMS = {}
@@ -69,7 +69,7 @@ def __repr__(self):
return result

def __eq__(self, other):
if isinstance(other, BaseMeasure):
if isinstance(other, Measure):
return repr(self) == repr(other)
return False

4 changes: 2 additions & 2 deletions ir_measures/measures/bpref.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _Bpref(measures.BaseMeasure):
class _Bpref(measures.Measure):
"""
Binary Preference (Bpref).
This measure examines the relative ranks of judged relevant and non-relevant documents. Non-judged documents are not considered.
4 changes: 2 additions & 2 deletions ir_measures/measures/err.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _ERR(measures.BaseMeasure):
class _ERR(measures.Measure):
"""
The Expected Reciprocal Rank (ERR) is a precision-focused measure.
In essence, an extension of reciprocal rank that encapsulates both graded relevance and
4 changes: 2 additions & 2 deletions ir_measures/measures/infap.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _infAP(measures.BaseMeasure):
class _infAP(measures.Measure):
"""
Inferred AP. AP implementation that accounts for pooled-but-unjudged documents by assuming
that they are relevant at the same proportion as other judged documents. Essentially, skips
4 changes: 2 additions & 2 deletions ir_measures/measures/iprec.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _IPrec(measures.BaseMeasure):
class _IPrec(measures.Measure):
"""
Interpolated Precision at a given recall cutoff. Used for building precision-recall graphs.
Unlike most measures, where @ indicates an absolute cutoff threshold, here @ sets the recall
4 changes: 2 additions & 2 deletions ir_measures/measures/judged.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _Judged(measures.BaseMeasure):
class _Judged(measures.Measure):
"""
Percentage of results in the top k (cutoff) results that have relevance judgments. Equivalent to P@k with
a rel lower than any judgment.
4 changes: 2 additions & 2 deletions ir_measures/measures/ndcg.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo
from .base import Measure, ParamInfo


class _nDCG(measures.BaseMeasure):
class _nDCG(measures.Measure):
"""
The normalized Discounted Cumulative Gain (nDCG).
Uses graded labels - systems that put the highest graded documents at the top of the ranking.
4 changes: 2 additions & 2 deletions ir_measures/measures/numq.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo, SumAgg
from .base import Measure, ParamInfo, SumAgg


class _NumQ(measures.BaseMeasure):
class _NumQ(measures.Measure):
"""
The total number of queries.
"""
4 changes: 2 additions & 2 deletions ir_measures/measures/numrel.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo, SumAgg
from .base import Measure, ParamInfo, SumAgg


class _NumRel(measures.BaseMeasure):
class _NumRel(measures.Measure):
"""
The number of relevant documents the query has (independent of what the system retrieved).
"""
4 changes: 2 additions & 2 deletions ir_measures/measures/numret.py
@@ -1,8 +1,8 @@
from ir_measures import measures
from .base import BaseMeasure, ParamInfo, SumAgg
from .base import Measure, ParamInfo, SumAgg


class _NumRet(measures.BaseMeasure):
class _NumRet(measures.Measure):
"""
The number of results returned. When rel is provided, counts the number of documents
returned with at least that relevance score (inclusive).
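
As the docstring notes, the `rel` parameter switches NumRet from counting all returned results to counting only those at or above a relevance threshold; a small sketch (assuming `qrels` and `run` objects like those in the README examples above):

```python
from ir_measures import NumRet

NumRet.calc_aggregate(qrels, run)         # total results returned, summed over queries
NumRet(rel=1).calc_aggregate(qrels, run)  # only results judged with relevance >= 1
```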
