Minimize requirement set
- Added opasQueryHelper with QueryTextToSolr to parse form text query fields and translate to Solr syntax
- Did a brief round of testing to determine which libraries are actually required and produced a minimal set; works on an initial clean-install test
- Changed banners to depend on a client-relative image folder (for now)
nrshapiro committed Dec 6, 2019
1 parent 5ce6398 commit d0c262d
Showing 6 changed files with 139 additions and 82 deletions.
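
Net effect on query construction: form text fields are no longer split by the ad-hoc split_boolean helper (removed below); instead, parse_search_query_parameters instantiates opasQueryHelper.QueryTextToSolr and calls its markup method per field. A minimal sketch of that calling pattern, not part of the commit — the search text and result variable are hypothetical, while the field name art_title_xml and the call shape are taken from the diff:

    import opasQueryHelper

    qparse = opasQueryHelper.QueryTextToSolr()
    # translate user-entered boolean text into Solr syntax for a given field
    title_q = qparse.markup("mourning and melancholia", "art_title_xml")
    # expected to yield something like: art_title_xml:mourning && art_title_xml:melancholia
    filter_q = f"&& {title_q} "   # appended to the running Solr filter query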
5 changes: 5 additions & 0 deletions .gitignore
@@ -43,3 +43,8 @@ sql/schemas/pepwebdownloadcounts.sql
sql/schemas/pepwebdownloads.sql
scrap.py
venv/
app/envinstalltest.py
app/oldrequirements.txt
app/xrequirements.txt
images/g/AIM.036.0275A.FIG001.jpg
images/g/JCPTX.032.0329A.F0003g.jpg
77 changes: 21 additions & 56 deletions app/libs/opasAPISupportLib.py
@@ -69,6 +69,8 @@
import models

import opasXMLHelper as opasxmllib
import opasQueryHelper
from opasQueryHelper import QueryTextToSolr
import opasGenSupportLib as opasgenlib
import opasCentralDBLib
import sourceInfoDB as SourceInfoDB
@@ -2160,53 +2162,6 @@ def parse_search_query_parameters(search=None,
<QueryParameters analyzeThis='art_authors_ngrm:Tuckett ' searchQ='*:* ' filterQ='art_pepsrccode:IJP && art_vol:57 && art_authors_ngrm:Tuckett ' searchAnalysisTermList=['art_pepsrccode:IJP ', 'art_authors_ngrm:Tuckett '] solrMax=None solrSortBy=None urlRequest=''>
"""
def split_boolean(field_name, query_string):
"""
>>> split_boolean("text", "dog and cat or mouse and pig or hawk")
>>> split_boolean("text", "dog AND cat or 'mouse pig'")
>>> split_boolean("text", "dog AND cat or 'mouse pig bird'")
>>> split_boolean("text", "dog and cat")
>>> split_boolean("text", "dog and cat or mouse")
>>> split_boolean("text", "dog and cat or mouse")
#TODO: Need to make "AND" implicit with separated words
"""
split_pattern = "(\sand\s|\sor\s|\sAND\s|\sOR\s)"
ret_val = ""
split_list = re.split(split_pattern, query_string, maxsplit=50, flags=re.IGNORECASE)
term_list = [x.strip() for x in split_list]
prior_term = "initial"
default_term = ""
for n in term_list:
if n in ("and", "AND"):
ret_val += " && "
prior_term = "and"
elif n in ["or", "OR"]:
ret_val += " || "
prior_term = "or"
else:
if prior_term not in ("and", "AND", "or", "OR", "initial"):
default_term = " && "
if " " in n:
if n[0] not in ('"', "'"):
# split it again!
wordlist = n.split(" ")
ret_val += default_term + f"{field_name}:{wordlist[0]}"
for n in wordlist[1:]:
ret_val += default_term + f"{field_name}:{n}"
else:
ret_val += default_term + f"{field_name}:{n}"
prior_term = ""
default_term = ""

return ret_val




@@ -2222,12 +2177,14 @@ def split_boolean(field_name, query_string):
# Could make it global to save a couple of CPU cycles, but I suspect it doesn't matter
# and the function is cleaner this way.
pat_prefix_amps = re.compile("^\s*&& ")
qparse = opasQueryHelper.QueryTextToSolr()

if sort is not None: # not sure why this seems to have a slash, but remove it
sort = re.sub("\/", "", sort)

if title is not None:
analyze_this = "&& art_title_xml:{} ".format(title)
title = qparse.markup(title, "art_title_xml")
analyze_this = f"&& {title} "
filter_q += analyze_this
search_analysis_term_list.append(analyze_this)

Expand Down Expand Up @@ -2354,42 +2311,50 @@ def split_boolean(field_name, query_string):
if period is None:
period = '5'

analyze_this = "&& art_cited_{}:[{} TO {}] ".format(period.lower(), val, val_end)
analyze_this = f"&& art_cited_{period.lower()}:[{val} TO {val_end}] "
filter_q += analyze_this
search_analysis_term_list.append(analyze_this)

if fulltext1 is not None:
analyze_this = f"&& {split_boolean('text', fulltext1)} "
fulltext1 = qparse.markup(fulltext1, "text_xml")
analyze_this = f"&& {fulltext1} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if fulltext2 is not None:
analyze_this = "&& text:{} ".format(fulltext2)
# we should use this for thesaurus OFF later
fulltext2 = qparse.markup(fulltext2, "text_xml")
analyze_this = f"&& {fulltext2} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if dreams is not None:
analyze_this = "&& dreams_xml:{} ".format(dreams)
dreams = qparse.markup(dreams, "dreams_xml")
analyze_this = f"&& {dreams} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if quotes is not None:
analyze_this = "&& quotes_xml:{} ".format(quotes)
quotes = qparse.markup(quotes, "quotes_xml")
analyze_this = f"&& {quotes} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if abstracts is not None:
analyze_this = "&& abstracts_xml:{} ".format(abstracts)
abstracts = qparse.markup(abstracts, "abstracts_xml")
analyze_this = f"&& {abstracts} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if dialogs is not None:
analyze_this = "&& dialogs_xml:{} ".format(dialogs)
dialogs = qparse.markup(dialogs, "dialogs_xml")
analyze_this = f"&& {dialogs} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if references is not None:
analyze_this = "&& references_xml:{} ".format(references)
references = qparse.markup(references, "references_xml")
analyze_this = f"&& {references} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

88 changes: 88 additions & 0 deletions app/libs/opasQueryHelper.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=C0321,C0103,C0301,E1101,C0303,E1004,C0330,R0915,R0914,W0703,C0326

"""
opasQueryHelper
This library is meant to hold parsing and other functions which support query translation to Solr
2019.1205.1 - First version
"""
__author__ = "Neil R. Shapiro"
__copyright__ = "Copyright 2019, Psychoanalytic Electronic Publishing"
__license__ = "Apache 2.0"
__version__ = "2019.1205.1"
__status__ = "Development"

import re

class QueryTextToSolr():
def __init__(self):
regex_token_quoted = "[\^]?[\'\"][^\'\"]+[\'\"]"
regex_token_word = "(?P<word>[^\|\^\&\(\"\'\s)]+)"
# regex_word_or_quoted = f"{regex_token_quoted}|{regex_token_word}"
# token_not = re.compile("\sAND\s", re.IGNORECASE)

self.counter = 0
self.token_quoted = re.compile(regex_token_quoted, re.IGNORECASE)
self.token_or = re.compile("\sOR\s", re.IGNORECASE)
self.token_and = re.compile("\sAND\s", re.IGNORECASE)
self.token_word = re.compile(regex_token_word, re.IGNORECASE)

def markup(self, str_input, label_word):
def quotedrepl(matchobj):
self.counter += 1
return f'QS{self.counter}'

self.counter = 0
token_list = self.token_quoted.findall(str_input)
ret_val = self.token_quoted.sub(quotedrepl, str_input)
ret_val = self.token_or.sub(" || ", ret_val)
ret_val = self.token_and.sub(" && ", ret_val)
ret_val = self.token_word.sub(f"{label_word}:\g<word>", ret_val)
counter2 = 1
# take care of ^ to - before quotes go back
ret_val = re.sub("\^\s*\(", "-(", ret_val)
for n in token_list:
ret_val = re.sub(f"QS{counter2}", n, ret_val)
counter2 += 1

ptn_token_not = f"{label_word}:(\^)"
ptn_token_not2 = f"(\^){label_word}:"
ret_val = re.sub(ptn_token_not, f"-{label_word}:", ret_val)
ret_val = re.sub(ptn_token_not2, f"-{label_word}:", ret_val)

# debug only
# print (str_input, ":", ret_val)
return ret_val

# -------------------------------------------------------------------------------------------------------
# run it!

if __name__ == "__main__":
import sys
print ("Running in Python %s" % sys.version_info[0])


tests = ["dog or 'fred flints*' and 'barney rubble'",
"dog and cat and ^provided",
"dog and (cat or flea)",
"dog and ^(cat or flea)",
"dog or 'fred flintstone' and ^'barney rubble'",
"fr* and flintstone or ^barney",
"dog and (cat and flea)",
"dog or cat",
"fleet footed",
"dog and ^cat or ^mouse and pig or hawk",
"dog AND cat or 'mouse pig'",
"dog AND cat or ^'mouse pig bird'",
"'freudian slip' or 'exposure therapy'"
]

label_word = "text_xml"
for n in tests:
mu = QueryTextToSolr()
print (n, ":", mu.markup(n, label_word))
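
The commit does not capture this script's output, but tracing the substitutions above by hand suggests translations along these lines for the simpler test strings (illustrative only, not verified output):

    dog or cat                  ->  text_xml:dog || text_xml:cat
    dog and cat and ^provided   ->  text_xml:dog && text_xml:cat && -text_xml:provided
    dog AND cat or 'mouse pig'  ->  text_xml:dog && text_xml:cat || text_xml:'mouse pig'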

3 changes: 2 additions & 1 deletion app/libs/styles/pepkbd3-html.xslt
@@ -159,7 +159,8 @@
<p class="banner">
<a class="anchor" name="{$document-id}" id="{$document-id}"/>
<a class="toc-link" href="search.php?journal={$journal-code}">
<img src="/images/banner{$journal-code}logo.gif" alt=""/>
<!--Client relative...but could use /v2/Document/Images command instead.-->
<img src="./images/banner{$journal-code}logo.gif" alt=""/>
</a>
</p>
<div class='pubinfotop'><xsl:value-of select="'[[RunningHead]]'"/></div>
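
For a journal code such as IJP (hypothetical example), the template would now emit roughly:

    <img src="./images/bannerIJPlogo.gif" alt=""/>

i.e. a path resolved relative to the client's page rather than the server root — the "client relative image folder" change noted in the commit message. The new comment in the template also flags the /v2/Document/Images endpoint as a possible alternative source for the banner image.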
9 changes: 5 additions & 4 deletions app/main.py
@@ -44,7 +44,7 @@
2019.1203.1 - authentication parameter default (None) error slipped in! But important, it blocked abstracts showing.
2019.1204.1 - modified cors origin list to try *. instead of just . origins [didn't work]
2019.1204.3 - modified cors to use regex option. Define regex in localsecrets CORS_REGEX
2019.1205.1 - Added opasQueryHelper with QueryTextToSolr to parse form text query fields and translate to Solr syntax
To Install (at least in windows)
rem python 3.7 required
@@ -86,7 +86,7 @@
__author__ = "Neil R. Shapiro"
__copyright__ = "Copyright 2019, Psychoanalytic Electronic Publishing"
__license__ = "Apache 2.0"
__version__ = "2019.1204.3"
__version__ = "2019.1205.1"
__status__ = "Development"

import sys
@@ -134,7 +134,7 @@
import solrpy as solr
import json
import libs.opasConfig as opasConfig
from opasConfig import OPASSESSIONID, OPASACCESSTOKEN, OPASEXPIRES
from opasConfig import OPASSESSIONID, OPASACCESSTOKEN, OPASEXPIRES
import logging
logger = logging.getLogger(__name__)

@@ -2196,7 +2196,8 @@ async def download_an_image(response: Response,
"""
endpoint = opasCentralDBLib.API_DOCUMENTS_IMAGE
ocd, session_info = opasAPISupportLib.get_session_info(request, response)
if not session_info.authenticated:
# allow viewing, but not downloading if not logged in
if not session_info.authenticated and download != 0:
response.status_code = HTTP_400_BAD_REQUEST
status_message = "Must be logged in and authorized to download an image."
# no need to record endpoint failure
39 changes: 18 additions & 21 deletions app/requirements.txt
@@ -1,38 +1,35 @@
# updated 20191120
aiofiles==0.4.0
asn1crypto==0.24.0
bcrypt==3.1.7
certifi==2019.6.16
cffi==1.12.3
certifi==2019.11.28
cffi==1.13.2
chardet==3.0.4
Click==7.0
cryptography==2.7
dnspython==1.16.0
EbookLib==0.17.1
email-validator==1.0.4
fastapi==0.38.1
future==0.17.1
email-validator==1.0.5
fastapi==0.44.1
future==0.18.2
h11==0.8.1
html5lib==1.0.1
idna==2.8
itsdangerous==1.1.0
lxml==4.4.1
passlib==1.7.1
Pillow==6.2.0
lxml==4.4.2
parse==1.6.5
passlib==1.7.2
Pillow==6.2.1
pycparser==2.19
pydantic==0.32.2
pydantic==1.2
PyJWT==1.7.1
PyMySQL==0.9.3
PyPDF2==1.26.0
python-multipart==0.0.5
reportlab==3.5.28
reportlab==3.5.32
requests==2.22.0
six==1.12.0
sqlalchemy=-1.3.11
starlette==0.12.8
treelib==1.5.5
urllib3==1.25.3
uvicorn==0.9.0
six==1.13.0
SQLAlchemy==1.3.11
starlette==0.12.9
urllib3==1.25.7
uvicorn==0.10.8
webencodings==0.5.1
websockets==8.0.2
xhtml2pdf==0.2.3
websockets==8.1
xhtml2pdf==0.2.3
