Minimize requirement set
- Added opasQueryHelper with QueryTextToSolr to parse form text query fields and translate to Solr syntax
- Did a brief round of testing to determine which libraries are actually required and produced a minimal set; works on an initial clean-install test
- Changed banners to depend on a client-relative image folder (for now)
nrshapiro committed Dec 6, 2019
1 parent 5ce6398 commit d0c262d
Showing 6 changed files with 139 additions and 82 deletions.
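
Net effect on query construction: form text fields are no longer split by the ad-hoc split_boolean helper (removed below); instead, parse_search_query_parameters instantiates opasQueryHelper.QueryTextToSolr and calls its markup method per field. A minimal sketch of that calling pattern, not part of the commit — the search text and result variable are hypothetical, while the field name art_title_xml and the call shape are taken from the diff:

    import opasQueryHelper

    qparse = opasQueryHelper.QueryTextToSolr()
    # translate user-entered boolean text into Solr syntax for a given field
    title_q = qparse.markup("mourning and melancholia", "art_title_xml")
    # expected to yield something like: art_title_xml:mourning && art_title_xml:melancholia
    filter_q = f"&& {title_q} "   # appended to the running Solr filter query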
5 changes: 5 additions & 0 deletions .gitignore
@@ -43,3 +43,8 @@ sql/schemas/pepwebdownloadcounts.sql
sql/schemas/pepwebdownloads.sql
scrap.py
venv/
app/envinstalltest.py
app/oldrequirements.txt
app/xrequirements.txt
images/g/AIM.036.0275A.FIG001.jpg
images/g/JCPTX.032.0329A.F0003g.jpg
77 changes: 21 additions & 56 deletions app/libs/opasAPISupportLib.py
@@ -69,6 +69,8 @@
import models

import opasXMLHelper as opasxmllib
import opasQueryHelper
from opasQueryHelper import QueryTextToSolr
import opasGenSupportLib as opasgenlib
import opasCentralDBLib
import sourceInfoDB as SourceInfoDB
@@ -2160,53 +2162,6 @@ def parse_search_query_parameters(search=None,
<QueryParameters analyzeThis='art_authors_ngrm:Tuckett ' searchQ='*:* ' filterQ='art_pepsrccode:IJP && art_vol:57 && art_authors_ngrm:Tuckett ' searchAnalysisTermList=['art_pepsrccode:IJP ', 'art_authors_ngrm:Tuckett '] solrMax=None solrSortBy=None urlRequest=''>
"""
def split_boolean(field_name, query_string):
"""
>>> split_boolean("text", "dog and cat or mouse and pig or hawk")
>>> split_boolean("text", "dog AND cat or 'mouse pig'")
>>> split_boolean("text", "dog AND cat or 'mouse pig bird'")
>>> split_boolean("text", "dog and cat")
>>> split_boolean("text", "dog and cat or mouse")
>>> split_boolean("text", "dog and cat or mouse")
#TODO: Need to make "AND" implicit with separated words
"""
split_pattern = "(\sand\s|\sor\s|\sAND\s|\sOR\s)"
ret_val = ""
split_list = re.split(split_pattern, query_string, maxsplit=50, flags=re.IGNORECASE)
term_list = [x.strip() for x in split_list]
prior_term = "initial"
default_term = ""
for n in term_list:
if n in ("and", "AND"):
ret_val += " && "
prior_term = "and"
elif n in ["or", "OR"]:
ret_val += " || "
prior_term = "or"
else:
if prior_term not in ("and", "AND", "or", "OR", "initial"):
default_term = " && "
if " " in n:
if n[0] not in ('"', "'"):
# split it again!
wordlist = n.split(" ")
ret_val += default_term + f"{field_name}:{wordlist[0]}"
for n in wordlist[1:]:
ret_val += default_term + f"{field_name}:{n}"
else:
ret_val += default_term + f"{field_name}:{n}"
prior_term = ""
default_term = ""

return ret_val




@@ -2222,12 +2177,14 @@ def split_boolean(field_name, query_string):
# Could make it global to save a couple of CPU cycles, but I suspect it doesn't matter
# and the function is cleaner this way.
pat_prefix_amps = re.compile("^\s*&& ")
qparse = opasQueryHelper.QueryTextToSolr()

if sort is not None: # not sure why this seems to have a slash, but remove it
sort = re.sub("\/", "", sort)

if title is not None:
analyze_this = "&& art_title_xml:{} ".format(title)
title = qparse.markup(title, "art_title_xml")
analyze_this = f"&& {title} "
filter_q += analyze_this
search_analysis_term_list.append(analyze_this)

Expand Down Expand Up @@ -2354,42 +2311,50 @@ def split_boolean(field_name, query_string):
if period is None:
period = '5'

analyze_this = "&& art_cited_{}:[{} TO {}] ".format(period.lower(), val, val_end)
analyze_this = f"&& art_cited_{period.lower()}:[{val} TO {val_end}] "
filter_q += analyze_this
search_analysis_term_list.append(analyze_this)

if fulltext1 is not None:
analyze_this = f"&& {split_boolean('text', fulltext1)} "
fulltext1 = qparse.markup(fulltext1, "text_xml")
analyze_this = f"&& {fulltext1} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if fulltext2 is not None:
analyze_this = "&& text:{} ".format(fulltext2)
# we should use this for thesaurus OFF later
fulltext2 = qparse.markup(fulltext2, "text_xml")
analyze_this = f"&& {fulltext2} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if dreams is not None:
analyze_this = "&& dreams_xml:{} ".format(dreams)
dreams = qparse.markup(dreams, "dreams_xml")
analyze_this = f"&& {dreams} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if quotes is not None:
analyze_this = "&& quotes_xml:{} ".format(quotes)
quotes = qparse.markup(quotes, "quotes_xml")
analyze_this = f"&& {quotes} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if abstracts is not None:
analyze_this = "&& abstracts_xml:{} ".format(abstracts)
abstracts = qparse.markup(abstracts, "abstracts_xml")
analyze_this = f"&& {abstracts} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if dialogs is not None:
analyze_this = "&& dialogs_xml:{} ".format(dialogs)
dialogs = qparse.markup(dialogs, "dialogs_xml")
analyze_this = f"&& {dialogs} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

if references is not None:
analyze_this = "&& references_xml:{} ".format(references)
references = qparse.markup(references, "references_xml")
analyze_this = f"&& {references} "
search_q += analyze_this
search_analysis_term_list.append(analyze_this)

88 changes: 88 additions & 0 deletions app/libs/opasQueryHelper.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=C0321,C0103,C0301,E1101,C0303,E1004,C0330,R0915,R0914,W0703,C0326

"""
opasQueryHelper
This library is meant to hold parsing and other functions which support query translation to Solr
2019.1205.1 - First version
"""
__author__ = "Neil R. Shapiro"
__copyright__ = "Copyright 2019, Psychoanalytic Electronic Publishing"
__license__ = "Apache 2.0"
__version__ = "2019.1205.1"
__status__ = "Development"

import re

class QueryTextToSolr():
def __init__(self):
regex_token_quoted = "[\^]?[\'\"][^\'\"]+[\'\"]"
regex_token_word = "(?P<word>[^\|\^\&\(\"\'\s)]+)"
# regex_word_or_quoted = f"{regex_token_quoted}|{regex_token_word}"
# token_not = re.compile("\sAND\s", re.IGNORECASE)

self.counter = 0
self.token_quoted = re.compile(regex_token_quoted, re.IGNORECASE)
self.token_or = re.compile("\sOR\s", re.IGNORECASE)
self.token_and = re.compile("\sAND\s", re.IGNORECASE)
self.token_word = re.compile(regex_token_word, re.IGNORECASE)

def markup(self, str_input, label_word):
def quotedrepl(matchobj):
self.counter += 1
return f'QS{self.counter}'

self.counter = 0
token_list = self.token_quoted.findall(str_input)
ret_val = self.token_quoted.sub(quotedrepl, str_input)
ret_val = self.token_or.sub(" || ", ret_val)
ret_val = self.token_and.sub(" && ", ret_val)
ret_val = self.token_word.sub(f"{label_word}:\g<word>", ret_val)
counter2 = 1
# take care of ^ to - before quotes go back
ret_val = re.sub("\^\s*\(", "-(", ret_val)
for n in token_list:
ret_val = re.sub(f"QS{counter2}", n, ret_val)
counter2 += 1

ptn_token_not = f"{label_word}:(\^)"
ptn_token_not2 = f"(\^){label_word}:"
ret_val = re.sub(ptn_token_not, f"-{label_word}:", ret_val)
ret_val = re.sub(ptn_token_not2, f"-{label_word}:", ret_val)

# debug only
# print (str_input, ":", ret_val)
return ret_val

# -------------------------------------------------------------------------------------------------------
# run it!

if __name__ == "__main__":
import sys
print ("Running in Python %s" % sys.version_info[0])


tests = ["dog or 'fred flints*' and 'barney rubble'",
"dog and cat and ^provided",
"dog and (cat or flea)",
"dog and ^(cat or flea)",
"dog or 'fred flintstone' and ^'barney rubble'",
"fr* and flintstone or ^barney",
"dog and (cat and flea)",
"dog or cat",
"fleet footed",
"dog and ^cat or ^mouse and pig or hawk",
"dog AND cat or 'mouse pig'",
"dog AND cat or ^'mouse pig bird'",
"'freudian slip' or 'exposure therapy'"
]

label_word = "text_xml"
for n in tests:
mu = QueryTextToSolr()
print (n, ":", mu.markup(n, label_word))
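
The commit does not capture this script's output, but tracing the substitutions above by hand suggests translations along these lines for the simpler test strings (illustrative only, not verified output):

    dog or cat                  ->  text_xml:dog || text_xml:cat
    dog and cat and ^provided   ->  text_xml:dog && text_xml:cat && -text_xml:provided
    dog AND cat or 'mouse pig'  ->  text_xml:dog && text_xml:cat || text_xml:'mouse pig'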

3 changes: 2 additions & 1 deletion app/libs/styles/pepkbd3-html.xslt
@@ -159,7 +159,8 @@
<p class="banner">
<a class="anchor" name="{$document-id}" id="{$document-id}"/>
<a class="toc-link" href="search.php?journal={$journal-code}">
<img src="/images/banner{$journal-code}logo.gif" alt=""/>
<!--Client relative...but could use /v2/Document/Images command instead.-->
<img src="./images/banner{$journal-code}logo.gif" alt=""/>
</a>
</p>
<div class='pubinfotop'><xsl:value-of select="'[[RunningHead]]'"/></div>
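
For a journal code such as IJP (hypothetical example), the template would now emit roughly:

    <img src="./images/bannerIJPlogo.gif" alt=""/>

i.e. a path resolved relative to the client's page rather than the server root — the "client relative image folder" change noted in the commit message. The new comment in the template also flags the /v2/Document/Images endpoint as a possible alternative source for the banner image.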
9 changes: 5 additions & 4 deletions app/main.py
@@ -44,7 +44,7 @@
2019.1203.1 - authentication parameter default (None) error slipped in! But important, it blocked abstracts showing.
2019.1204.1 - modified cors origin list to try *. instead of just . origins [didn't work]
2019.1204.3 - modified cors to use regex option. Define regex in localsecrets CORS_REGEX
2019.1205.1 - Added opasQueryHelper with QueryTextToSolr to parse form text query fields and translate to Solr syntax
To Install (at least in windows)
rem python 3.7 required
@@ -86,7 +86,7 @@
__author__ = "Neil R. Shapiro"
__copyright__ = "Copyright 2019, Psychoanalytic Electronic Publishing"
__license__ = "Apache 2.0"
__version__ = "2019.1204.3"
__version__ = "2019.1205.1"
__status__ = "Development"

import sys
@@ -134,7 +134,7 @@
import solrpy as solr
import json
import libs.opasConfig as opasConfig
from opasConfig import OPASSESSIONID, OPASACCESSTOKEN, OPASEXPIRES
from opasConfig import OPASSESSIONID, OPASACCESSTOKEN, OPASEXPIRES
import logging
logger = logging.getLogger(__name__)

@@ -2196,7 +2196,8 @@ async def download_an_image(response: Response,
"""
endpoint = opasCentralDBLib.API_DOCUMENTS_IMAGE
ocd, session_info = opasAPISupportLib.get_session_info(request, response)
if not session_info.authenticated:
# allow viewing, but not downloading if not logged in
if not session_info.authenticated and download != 0:
response.status_code = HTTP_400_BAD_REQUEST
status_message = "Must be logged in and authorized to download an image."
# no need to record endpoint failure
39 changes: 18 additions & 21 deletions app/requirements.txt
@@ -1,38 +1,35 @@
# updated 20191120
aiofiles==0.4.0
asn1crypto==0.24.0
bcrypt==3.1.7
certifi==2019.6.16
cffi==1.12.3
certifi==2019.11.28
cffi==1.13.2
chardet==3.0.4
Click==7.0
cryptography==2.7
dnspython==1.16.0
EbookLib==0.17.1
email-validator==1.0.4
fastapi==0.38.1
future==0.17.1
email-validator==1.0.5
fastapi==0.44.1
future==0.18.2
h11==0.8.1
html5lib==1.0.1
idna==2.8
itsdangerous==1.1.0
lxml==4.4.1
passlib==1.7.1
Pillow==6.2.0
lxml==4.4.2
parse==1.6.5
passlib==1.7.2
Pillow==6.2.1
pycparser==2.19
pydantic==0.32.2
pydantic==1.2
PyJWT==1.7.1
PyMySQL==0.9.3
PyPDF2==1.26.0
python-multipart==0.0.5
reportlab==3.5.28
reportlab==3.5.32
requests==2.22.0
six==1.12.0
sqlalchemy=-1.3.11
starlette==0.12.8
treelib==1.5.5
urllib3==1.25.3
uvicorn==0.9.0
six==1.13.0
SQLAlchemy==1.3.11
starlette==0.12.9
urllib3==1.25.7
uvicorn==0.10.8
webencodings==0.5.1
websockets==8.0.2
xhtml2pdf==0.2.3
websockets==8.1
xhtml2pdf==0.2.3
